Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@

- doc: document that default output shows top-level matches only; -v/-vv show nested matches @devs6186 #1410
- doc: fix typo in usage.md, add documentation links to README @devs6186 #2274
- fixtures: add dynamic sample discovery and reverse MD5 lookup @kami922 #1743
- binja: add mypy config for top-level binaryninja module to fix mypy issues @devs6186 #2399
- ci: deprecate macos-13 runner and use Python v3.13 for testing @mike-hunhoff #2777

Expand Down
86 changes: 86 additions & 0 deletions tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import hashlib
import logging
import contextlib
import collections
Expand Down Expand Up @@ -59,6 +60,72 @@
DNFILE_TESTFILES = DOTNET_DIR / "dnfile-testfiles"


def _is_hash_name(stem: str) -> bool:
    """Return True if the stem looks like a hex hash (MD5=32 chars, SHA256=64 chars)."""
    return len(stem) in (32, 64) and all(c in "0123456789abcdefABCDEF" for c in stem)


def _build_sample_cache(samples_path: Path) -> dict[str, Path]:
    """
    Walk the samples directory and build a lookup table indexed by hash and filename.

    This eliminates the need for hardcoded file/hash mappings.
    Similar to collect_samples() in scripts/lint.py.

    Keys derived from file content:
      - MD5 hex digest (lowercase and uppercase) -> Path
      - SHA256 hex digest (lowercase and uppercase) -> Path
      - first 8 characters of each lowercase digest -> Path

    Keys derived from the filename:
      - filename -> Path
      - filename stem -> Path
      - filename stem with trailing underscores removed -> Path

    When two files have identical content, the content-derived keys resolve to
    the file with a friendly (non-hash) name. Filename-derived keys never
    replace a content-derived key, so a file named after its own hash cannot
    take over the hash entry of its friendly-named duplicate. Files are visited
    in sorted order, so remaining ties are broken deterministically.

    Args:
        samples_path: root directory that is searched recursively.

    Returns:
        the lookup dictionary described above; empty when nothing is found.
    """
    cache: dict[str, Path] = {}
    # keys that were derived from file content; filename keys must not clobber these
    hash_keys: set[str] = set()

    # Skip files that are build artifacts or database files
    skip_extensions = {".viv", ".idb", ".i64", ".frz", ".fnames", ".bndb"}

    def index_by_hash(key: str, path: Path) -> None:
        # a content key wins over a filename key, and among duplicates a
        # friendly-named file wins over a hash-named one
        if key not in hash_keys or _is_hash_name(cache[key].stem):
            cache[key] = path
            hash_keys.add(key)

    def index_by_name(key: str, path: Path) -> None:
        if key not in hash_keys:
            cache[key] = path

    # sorted: rglob yields in filesystem order, which differs between machines
    for path in sorted(samples_path.rglob("*")):
        if not path.is_file():
            continue

        if path.suffix in skip_extensions:
            continue

        try:
            buf = path.read_bytes()
        except OSError:
            # best effort: an unreadable file is simply not indexed
            continue

        # hexdigest() is always lowercase
        sha256 = hashlib.sha256(buf).hexdigest()
        md5 = hashlib.md5(buf).hexdigest()

        for digest in (sha256, md5):
            index_by_hash(digest, path)
            index_by_hash(digest.upper(), path)
            # first 8 characters of the digest as a shortcut
            index_by_hash(digest[:8], path)

        index_by_name(path.name, path)

        if path.stem:
            index_by_name(path.stem, path)
            stem_cleaned = path.stem.rstrip("_")
            # never register the empty string as a key
            if stem_cleaned and stem_cleaned != path.stem:
                index_by_name(stem_cleaned, path)

    return cache


# Build the sample cache once at module import time.
# NOTE: _build_sample_cache() reads and hashes every non-skipped file under
# CD / "data", so the cost of importing this module grows with that directory.
_SAMPLE_CACHE = _build_sample_cache(CD / "data")


@contextlib.contextmanager
def xfail(condition, reason=None):
"""
Expand Down Expand Up @@ -638,6 +705,25 @@ def get_sample_md5_by_name(name):
raise ValueError(f"unexpected sample fixture: {name}")


def get_sample_short_name_by_md5(md5: str) -> str:
    """
    Reverse lookup: given an MD5 hash, return the sample's shortened name.

    Uses the dynamically-built sample cache instead of hardcoded mappings.
    The cache is built once at module import time by walking the tests/data directory.

    Args:
        md5: MD5 hex digest of the sample (32 hex characters, any letter case).

    Returns:
        the stem of the cached path for that MD5, i.e. the filename without its
        final extension.

    Raises:
        ValueError: If `md5` is not a 32-character hex digest, or if no sample
            with the given MD5 is found
    """
    md5_lower = md5.lower()

    # The cache is also keyed by filenames, SHA256 digests, and 8-character hash
    # prefixes. Accept only a full MD5 hex digest so those keys cannot match here.
    if len(md5_lower) != 32 or not all(c in "0123456789abcdef" for c in md5_lower):
        raise ValueError(f"unexpected sample MD5: {md5}")

    try:
        path = _SAMPLE_CACHE[md5_lower]
    except KeyError:
        raise ValueError(f"unexpected sample MD5: {md5}") from None

    return path.stem


def resolve_sample(sample):
    """Return whatever get_data_path_by_name() yields for `sample`."""
    resolved = get_data_path_by_name(sample)
    return resolved

Expand Down
138 changes: 138 additions & 0 deletions tests/test_fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for fixtures module, covering sample discovery and reverse MD5 lookups."""

import pytest
import fixtures

# Human-readable sample names that the tests below pass to
# fixtures.get_sample_md5_by_name(); each is expected to resolve to an MD5.
SIMPLE_SAMPLES = [
"mimikatz",
"kernel32",
"kernel32-64",
"pma12-04",
"pma16-01",
"pma01-01",
"pma21-01",
"al-khaser x86",
"al-khaser x64",
]

# Short hexadecimal sample names (5-6 characters) that the tests below pass to
# fixtures.get_sample_md5_by_name(). Presumably each is a prefix of the
# sample's hash; which hash (MD5 or SHA256) is not visible here.
HASH_BASED_SAMPLES = [
"39c05",
"499c2",
"9324d",
"a1982",
"a933a",
"bfb9b",
"c9188",
"64d9f",
"82bf6",
"77329",
"3b13b",
"7351f",
"79abd",
"946a9",
"b9f5b",
"294b8d",
"2bf18d",
"ea2876",
]


class TestGetSampleMd5ByName:
    """Exercise fixtures.get_sample_md5_by_name() with known and unknown names."""

    @staticmethod
    def _check_md5_hexdigest(value):
        """Assert that `value` is a 32-character lowercase hexadecimal string."""
        assert isinstance(value, str)
        assert len(value) == 32
        assert set(value) <= set("0123456789abcdef")

    @pytest.mark.parametrize("name", SIMPLE_SAMPLES)
    def test_simple_sample_lookup(self, name):
        """Every simple sample name resolves to a well-formed MD5."""
        self._check_md5_hexdigest(fixtures.get_sample_md5_by_name(name))

    @pytest.mark.parametrize("name", HASH_BASED_SAMPLES)
    def test_hash_sample_lookup(self, name):
        """Every hash-based sample name resolves to a well-formed MD5."""
        self._check_md5_hexdigest(fixtures.get_sample_md5_by_name(name))

    def test_unknown_sample_raises_error(self):
        """A name that matches no sample is rejected with ValueError."""
        with pytest.raises(ValueError, match="unexpected sample fixture"):
            fixtures.get_sample_md5_by_name("nonexistent_sample")

    def test_empty_string_raises_error(self):
        """The empty name is rejected with ValueError."""
        with pytest.raises(ValueError, match="unexpected sample fixture"):
            fixtures.get_sample_md5_by_name("")


class TestGetSampleShortNameByMd5:
    """Exercise fixtures.get_sample_short_name_by_md5(), the MD5 -> name reverse lookup."""

    # (MD5 digest, expected short name) pairs for the reverse lookup
    _KNOWN_SAMPLES = [
        ("5f66b82558ca92e54e77f216ef4c066c", "mimikatz"),
        ("e80758cf485db142fca1ee03a34ead05", "kernel32"),
        ("a8565440629ac87f6fef7d588fe3ff0f", "kernel32-64"),
        ("db648cd247281954344f1d810c6fd590", "al-khaser_x86"),
        ("3cb21ae76ff3da4b7e02d77ff76e82be", "al-khaser_x64"),
    ]

    @pytest.mark.parametrize(("md5", "name"), _KNOWN_SAMPLES)
    def test_reverse_lookup(self, md5, name):
        """A known MD5 maps back to the expected sample name."""
        assert fixtures.get_sample_short_name_by_md5(md5) == name

    def test_unknown_md5_raises_error(self):
        """A well-formed MD5 that matches no sample is rejected with ValueError."""
        all_zero_md5 = "00000000000000000000000000000000"
        with pytest.raises(ValueError, match="unexpected sample MD5"):
            fixtures.get_sample_short_name_by_md5(all_zero_md5)

    def test_empty_md5_raises_error(self):
        """The empty string is rejected with ValueError."""
        with pytest.raises(ValueError, match="unexpected sample MD5"):
            fixtures.get_sample_short_name_by_md5("")

    def test_malformed_md5_raises_error(self):
        """A string that is not an MD5 digest is rejected with ValueError."""
        with pytest.raises(ValueError, match="unexpected sample MD5"):
            fixtures.get_sample_short_name_by_md5("not_a_valid_md5")


class TestMd5NameLookupRoundtrip:
    """Check that the name -> MD5 and MD5 -> name lookups compose."""

    @pytest.mark.parametrize("name", ["mimikatz", "kernel32", "kernel32-64"])
    def test_roundtrip_simple_samples(self, name):
        """name -> MD5 -> name returns the starting name when the file stem equals the lookup name."""
        digest = fixtures.get_sample_md5_by_name(name)
        assert fixtures.get_sample_short_name_by_md5(digest) == name

    @pytest.mark.parametrize("name", HASH_BASED_SAMPLES)
    def test_roundtrip_hash_samples(self, name):
        """name -> MD5 -> name succeeds for hash-based samples."""
        digest = fixtures.get_sample_md5_by_name(name)
        recovered = fixtures.get_sample_short_name_by_md5(digest)
        # Only require that the round trip succeeds with a non-empty string.
        # Note: Hash-based filenames may use MD5 while lookup uses SHA256 prefix
        assert isinstance(recovered, str)
        assert recovered != ""