From b37b3decc86fbc768b8e57cf15a95351f0a16799 Mon Sep 17 00:00:00 2001 From: kamran Date: Mon, 26 Jan 2026 19:21:57 +0500 Subject: [PATCH 1/4] fixtures: add dynamic sample discovery and reverse MD5 lookup Walk the tests/data directory at module import time to build a cache indexed by MD5, SHA256, filename, and stem. Add get_sample_short_name_by_md5() for reverse lookups, eliminating the need for hardcoded hash mappings. Closes #1743 --- CHANGELOG.md | 1 + tests/fixtures.py | 77 ++++++++++++++++++++++ tests/test_fixtures.py | 146 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 224 insertions(+) create mode 100644 tests/test_fixtures.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 316908267..2fdd2cb3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,7 @@ - doc: document that default output shows top-level matches only; -v/-vv show nested matches @devs6186 #1410 - doc: fix typo in usage.md, add documentation links to README @devs6186 #2274 +- fixtures: add dynamic sample discovery and reverse MD5 lookup @kami922 #1743 - ci: deprecate macos-13 runner and use Python v3.13 for testing @mike-hunhoff #2777 ### Raw diffs diff --git a/tests/fixtures.py b/tests/fixtures.py index 6f15d0365..d02e6aa00 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import hashlib import logging import contextlib import collections @@ -59,6 +60,63 @@ DNFILE_TESTFILES = DOTNET_DIR / "dnfile-testfiles" +def _build_sample_cache(samples_path: Path) -> dict[str, Path]: + """ + Walk the samples directory and build lookup tables indexed by MD5, SHA256, and filename. + + This eliminates the need for hardcoded file/hash mappings. + Similar to collect_samples() in scripts/lint.py. + + Returns a dictionary mapping: + - MD5 hash (lower/upper) -> Path + - SHA256 hash (lower/upper) -> Path + - Filename -> Path + - Filename stem -> Path (for files with extensions) + """ + cache: dict[str, Path] = {} + + # Skip files that are build artifacts or database files + skip_extensions = {".viv", ".idb", ".i64", ".frz", ".fnames", ".bndb"} + + for path in samples_path.rglob("*"): + if not path.is_file(): + continue + + if path.suffix in skip_extensions: + continue + + try: + buf = path.read_bytes() + except OSError: + continue + + sha256 = hashlib.sha256(buf).hexdigest() + md5 = hashlib.md5(buf).hexdigest() + + cache[sha256.lower()] = path + cache[sha256.upper()] = path + cache[md5.lower()] = path + cache[md5.upper()] = path + + # Index by first 8 characters of hashes as shortcuts + cache[sha256.lower()[:8]] = path + cache[md5.lower()[:8]] = path + + cache[path.name] = path + + if path.stem: + cache[path.stem] = path + stem_cleaned = path.stem.rstrip("_") + if stem_cleaned != path.stem: + cache[stem_cleaned] = path + + return cache + + +# Build the sample cache once at module import time +_SAMPLE_CACHE = _build_sample_cache(CD / "data") + + @contextlib.contextmanager def xfail(condition, reason=None): """ @@ -638,6 +696,25 @@ def get_sample_md5_by_name(name): raise ValueError(f"unexpected sample fixture: {name}") +def get_sample_short_name_by_md5(md5: str): + """ + Reverse lookup: given an MD5 hash, return the sample's shortened name. + + Uses the dynamically-built sample cache instead of hardcoded mappings. + The cache is built once at module import time by walking the tests/data directory. + + Raises: + ValueError: If no sample with the given MD5 is found + """ + md5_lower = md5.lower() + + if md5_lower in _SAMPLE_CACHE: + path = _SAMPLE_CACHE[md5_lower] + return path.stem + + raise ValueError(f"unexpected sample MD5: {md5}") + + def resolve_sample(sample): return get_data_path_by_name(sample) diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py new file mode 100644 index 000000000..18b2a09a6 --- /dev/null +++ b/tests/test_fixtures.py @@ -0,0 +1,146 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for fixtures module, covering sample discovery and reverse MD5 lookups.""" + +import pytest +import fixtures + +# Known sample names for testing (simple named samples) +SIMPLE_SAMPLES = [ + "mimikatz", + "kernel32", + "kernel32-64", + "pma12-04", + "pma16-01", + "pma01-01", + "pma21-01", + "al-khaser x86", + "al-khaser x64", +] + +# Hash-based samples (where file name is a hash prefix) +HASH_BASED_SAMPLES = [ + "39c05", + "499c2", + "9324d", + "a1982", + "a933a", + "bfb9b", + "c9188", + "64d9f", + "82bf6", + "77329", + "3b13b", + "7351f", + "79abd", + "946a9", + "b9f5b", + "294b8d", + "2bf18d", + "ea2876", +] + + +class TestGetSampleMd5ByName: + """Tests for get_sample_md5_by_name function.""" + + @pytest.mark.parametrize("name", SIMPLE_SAMPLES) + def test_simple_sample_lookup(self, name): + """Test that simple sample names have valid MD5 hashes.""" + md5 = fixtures.get_sample_md5_by_name(name) + assert isinstance(md5, str) + assert len(md5) == 32 + assert all(c in "0123456789abcdef" for c in md5) + + @pytest.mark.parametrize("name", HASH_BASED_SAMPLES) + def test_hash_sample_lookup(self, name): + """Test that hash-based samples have valid MD5 hashes.""" + md5 = fixtures.get_sample_md5_by_name(name) + assert isinstance(md5, str) + assert len(md5) == 32 + assert all(c in "0123456789abcdef" for c in md5) + + def test_unknown_sample_raises_error(self): + """Test that unknown samples raise ValueError.""" + with pytest.raises(ValueError, match="unexpected sample fixture"): + fixtures.get_sample_md5_by_name("nonexistent_sample") + + def test_empty_string_raises_error(self): + """Test that empty string raises ValueError.""" + with pytest.raises(ValueError, match="unexpected sample fixture"): + fixtures.get_sample_md5_by_name("") + + +class TestGetSampleShortNameByMd5: + """Tests for get_sample_short_name_by_md5 function (reverse lookup).""" + + @pytest.mark.parametrize( + "md5, name", + [ + ("5f66b82558ca92e54e77f216ef4c066c", "mimikatz"), + ("e80758cf485db142fca1ee03a34ead05", "kernel32"), + ("a8565440629ac87f6fef7d588fe3ff0f", "kernel32-64"), + ("db648cd247281954344f1d810c6fd590", "al-khaser x86"), + ("3cb21ae76ff3da4b7e02d77ff76e82be", "al-khaser x64"), + ], + ) + def test_reverse_lookup(self, md5, name): + """Test reverse MD5 lookup returns correct sample name.""" + result = fixtures.get_sample_short_name_by_md5(md5) + # Verify lookup succeeds and returns a non-empty string + assert isinstance(result, str) + assert len(result) > 0 + + def test_unknown_md5_raises_error(self): + """Test that unknown MD5 hash raises ValueError.""" + with pytest.raises(ValueError, match="unexpected sample MD5"): + fixtures.get_sample_short_name_by_md5("00000000000000000000000000000000") + + def test_empty_md5_raises_error(self): + """Test that empty MD5 raises ValueError.""" + with pytest.raises(ValueError, match="unexpected sample MD5"): + fixtures.get_sample_short_name_by_md5("") + + def test_malformed_md5_raises_error(self): + """Test that malformed MD5 raises ValueError.""" + with pytest.raises(ValueError, match="unexpected sample MD5"): + fixtures.get_sample_short_name_by_md5("not_a_valid_md5") + + +class TestMd5NameLookupRoundtrip: + """Tests for round-trip MD5/name lookups.""" + + @pytest.mark.parametrize("name", SIMPLE_SAMPLES) + def test_roundtrip_simple_samples(self, name): + """Test name->MD5->name roundtrip for simple samples.""" + md5 = fixtures.get_sample_md5_by_name(name) + result_name = fixtures.get_sample_short_name_by_md5(md5) + # Verify the roundtrip succeeds and returns a valid result + # Note: Actual filenames may differ from friendly names: + # - "pma12-04" on disk is "Practical Malware Analysis Lab 12-04" + # - "al-khaser x86" on disk is "al-khaser_x86" + # We just verify the lookup succeeds without checking exact names + assert isinstance(result_name, str) + assert len(result_name) > 0 + + @pytest.mark.parametrize("name", HASH_BASED_SAMPLES) + def test_roundtrip_hash_samples(self, name): + """Test name->MD5->name roundtrip for hash-based samples.""" + md5 = fixtures.get_sample_md5_by_name(name) + result_name = fixtures.get_sample_short_name_by_md5(md5) + # Verify the roundtrip succeeds and returns a valid result + # Note: Hash-based filenames may use MD5 while lookup uses SHA256 prefix + assert isinstance(result_name, str) + assert len(result_name) > 0 From b45c6493dc4983b4f4699de9eda1d6125ba781d0 Mon Sep 17 00:00:00 2001 From: kamran Date: Fri, 6 Feb 2026 03:56:33 +0500 Subject: [PATCH 2/4] fixtures: add return type hint to get_sample_short_name_by_md5 Address maintainer feedback by adding return type annotation. --- tests/fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index d02e6aa00..762e453c6 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -696,7 +696,7 @@ def get_sample_md5_by_name(name): raise ValueError(f"unexpected sample fixture: {name}") -def get_sample_short_name_by_md5(md5: str): +def get_sample_short_name_by_md5(md5: str) -> str: """ Reverse lookup: given an MD5 hash, return the sample's shortened name. From b2ebed76842651c3458d5f8f1cb9ac2284dad61e Mon Sep 17 00:00:00 2001 From: kamran Date: Tue, 17 Feb 2026 03:48:03 +0500 Subject: [PATCH 3/4] tests: tighten assertions in test_fixtures to use exact equality - Fix expected values in test_reverse_lookup: al-khaser files use underscores on disk (al-khaser_x86, al-khaser_x64), not spaces - Replace broad isinstance/len assertions with exact equality checks - Restrict roundtrip test to samples that genuinely round-trip: mimikatz, kernel32, kernel32-64 (pma* and al-khaser names differ from their on-disk stems so they cannot round-trip exactly) --- tests/test_fixtures.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py index 18b2a09a6..3d601e5e9 100644 --- a/tests/test_fixtures.py +++ b/tests/test_fixtures.py @@ -92,16 +92,14 @@ class TestGetSampleShortNameByMd5: ("5f66b82558ca92e54e77f216ef4c066c", "mimikatz"), ("e80758cf485db142fca1ee03a34ead05", "kernel32"), ("a8565440629ac87f6fef7d588fe3ff0f", "kernel32-64"), - ("db648cd247281954344f1d810c6fd590", "al-khaser x86"), - ("3cb21ae76ff3da4b7e02d77ff76e82be", "al-khaser x64"), + ("db648cd247281954344f1d810c6fd590", "al-khaser_x86"), + ("3cb21ae76ff3da4b7e02d77ff76e82be", "al-khaser_x64"), ], ) def test_reverse_lookup(self, md5, name): """Test reverse MD5 lookup returns correct sample name.""" result = fixtures.get_sample_short_name_by_md5(md5) - # Verify lookup succeeds and returns a non-empty string - assert isinstance(result, str) - assert len(result) > 0 + assert result == name def test_unknown_md5_raises_error(self): """Test that unknown MD5 hash raises ValueError.""" @@ -122,18 +120,12 @@ def test_malformed_md5_raises_error(self): class TestMd5NameLookupRoundtrip: """Tests for round-trip MD5/name lookups.""" - @pytest.mark.parametrize("name", SIMPLE_SAMPLES) + @pytest.mark.parametrize("name", ["mimikatz", "kernel32", "kernel32-64"]) def test_roundtrip_simple_samples(self, name): - """Test name->MD5->name roundtrip for simple samples.""" + """Test name->MD5->name roundtrip for samples whose filename stem matches the lookup name.""" md5 = fixtures.get_sample_md5_by_name(name) result_name = fixtures.get_sample_short_name_by_md5(md5) - # Verify the roundtrip succeeds and returns a valid result - # Note: Actual filenames may differ from friendly names: - # - "pma12-04" on disk is "Practical Malware Analysis Lab 12-04" - # - "al-khaser x86" on disk is "al-khaser_x86" - # We just verify the lookup succeeds without checking exact names - assert isinstance(result_name, str) - assert len(result_name) > 0 + assert result_name == name @pytest.mark.parametrize("name", HASH_BASED_SAMPLES) def test_roundtrip_hash_samples(self, name): From 452609f30587ce4e5c308550109dffcd5d477923 Mon Sep 17 00:00:00 2001 From: kamran Date: Tue, 24 Feb 2026 04:41:39 +0500 Subject: [PATCH 4/4] fixtures: prefer friendly names over hash-named duplicates in sample cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When two files share the same content (e.g. mimikatz.exe_ and its hash-named copy 5f66b82558ca92e54e77f216ef4c066c.exe_), Linux ext4 rglob order is hash-based rather than alphabetical, so the hash-named entry was overwriting the friendly-named entry in the cache — causing test_reverse_lookup to see '5f66b825...' instead of 'mimikatz'. Fix: only overwrite an existing cache entry when the current entry's stem looks like a hex hash (32 or 64 chars). This makes the result deterministic regardless of OS filesystem ordering. --- tests/fixtures.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 762e453c6..3b17a947e 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -60,6 +60,11 @@ DNFILE_TESTFILES = DOTNET_DIR / "dnfile-testfiles" +def _is_hash_name(stem: str) -> bool: + """Return True if the stem looks like a hex hash (MD5=32 chars, SHA256=64 chars).""" + return len(stem) in (32, 64) and all(c in "0123456789abcdefABCDEF" for c in stem) + + def _build_sample_cache(samples_path: Path) -> dict[str, Path]: """ Walk the samples directory and build lookup tables indexed by MD5, SHA256, and filename. @@ -93,10 +98,14 @@ def _build_sample_cache(samples_path: Path) -> dict[str, Path]: sha256 = hashlib.sha256(buf).hexdigest() md5 = hashlib.md5(buf).hexdigest() - cache[sha256.lower()] = path - cache[sha256.upper()] = path - cache[md5.lower()] = path - cache[md5.upper()] = path + # prefer friendly names over hash-named duplicates (same content, two filenames) + # this ensures filesystem traversal order doesn't affect the result + if sha256.lower() not in cache or _is_hash_name(cache[sha256.lower()].stem): + cache[sha256.lower()] = path + cache[sha256.upper()] = path + if md5.lower() not in cache or _is_hash_name(cache[md5.lower()].stem): + cache[md5.lower()] = path + cache[md5.upper()] = path # Index by first 8 characters of hashes as shortcuts cache[sha256.lower()[:8]] = path