From 29bb083a8ea0c77e045cadb2994b9f9073e6d963 Mon Sep 17 00:00:00 2001 From: devs6186 Date: Wed, 25 Feb 2026 01:49:32 +0530 Subject: [PATCH] loader: fix recycled TID overwriting matched calls in dynamic layout When an OS recycles a thread ID (TID) within the same process, compute_dynamic_layout() was resetting calls_by_thread[t.address] to an empty list on the second encounter of that ThreadAddress. This erased the matched calls accumulated from the first thread instance. Those calls were still present in matched_calls (added by the rule engine), so the renderer could not locate them in the layout and raised ValueError("name not found for call"). Fix the overwrite by using setdefault() instead of direct assignment, and guard the threads_by_process.append() with a membership check so a recycled TID does not produce a duplicate thread entry in the layout. A dedicated unit test covering the recycled-TID scenario is added in tests/test_loader.py. Fixes #2619 --- CHANGELOG.md | 1 + capa/loader.py | 129 +++++++++++++++++++++++++++++++++---------- tests/test_loader.py | 117 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 217 insertions(+), 30 deletions(-) create mode 100644 tests/test_loader.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d4425ad29..18b3554ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ - ### Bug Fixes +- loader: fix recycled TID overwriting matched calls in dynamic layout, causing ValueError during rendering @devs6186 #2619 - main: suggest --os flag in unsupported OS error message to help users override ELF OS detection @devs6186 #2577 - render: escape sample-controlled strings before passing to Rich to prevent MarkupError @devs6186 #2699 - Fixed insecure deserialization vulnerability in YAML loading @0x1622 (#2770) diff --git a/capa/loader.py b/capa/loader.py index d89d4c09f..02f81c036 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -31,7 +31,11 @@ import capa.features.extractors.common from capa.rules import RuleSet from capa.engine import MatchResults -from capa.exceptions import UnsupportedOSError, UnsupportedArchError, UnsupportedFormatError +from capa.exceptions import ( + UnsupportedOSError, + UnsupportedArchError, + UnsupportedFormatError, +) from capa.features.common import ( OS_AUTO, FORMAT_PE, @@ -160,9 +164,13 @@ def get_workspace(path: Path, input_format: str, sigpaths: list[Path]): vw = viv_utils.getWorkspace(str(path), analyze=False, should_save=False) elif input_format == FORMAT_SC32: # these are not analyzed nor saved. - vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="i386", analyze=False) + vw = viv_utils.getShellcodeWorkspaceFromFile( + str(path), arch="i386", analyze=False + ) elif input_format == FORMAT_SC64: - vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="amd64", analyze=False) + vw = viv_utils.getShellcodeWorkspaceFromFile( + str(path), arch="amd64", analyze=False + ) else: raise ValueError("unexpected format: " + input_format) except envi.exc.SegmentationViolation as e: @@ -231,12 +239,16 @@ def get_extractor( import capa.features.extractors.drakvuf.extractor report = capa.helpers.load_jsonl_from_path(input_path) - return capa.features.extractors.drakvuf.extractor.DrakvufExtractor.from_report(report) + return capa.features.extractors.drakvuf.extractor.DrakvufExtractor.from_report( + report + ) elif backend == BACKEND_VMRAY: import capa.features.extractors.vmray.extractor - return capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_path) + return capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile( + input_path + ) elif backend == BACKEND_DOTNET: import capa.features.extractors.dnfile.extractor @@ -244,7 +256,9 @@ def get_extractor( if input_format not in (FORMAT_PE, FORMAT_DOTNET): raise UnsupportedFormatError() - return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(input_path) + return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor( + input_path + ) elif backend == BACKEND_BINJA: import capa.features.extractors.binja.find_binja_api as finder @@ -303,11 +317,15 @@ def get_extractor( vw.saveWorkspace() except IOError: # see #168 for discussion around how to handle non-writable directories - logger.info("source directory is not writable, won't save intermediate workspace") + logger.info( + "source directory is not writable, won't save intermediate workspace" + ) else: logger.debug("CAPA_SAVE_WORKSPACE unset, not saving workspace") - return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, input_path, os_) + return capa.features.extractors.viv.extractor.VivisectFeatureExtractor( + vw, input_path, os_ + ) elif backend == BACKEND_FREEZE: return frz.load(input_path.read_bytes()) @@ -320,7 +338,9 @@ def get_extractor( assert sample_path is not None buf = sample_path.read_bytes() - return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) + return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor( + be2, buf + ) elif backend == BACKEND_IDA: import capa.features.extractors.ida.idalib as idalib @@ -351,7 +371,9 @@ def get_extractor( # -1 - Generic errors (database already open, auto-analysis failed, etc.) # -2 - User cancelled operation ret = idapro.open_database( - str(input_path), run_auto_analysis=True, args="-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R" + str(input_path), + run_auto_analysis=True, + args="-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R", ) if ret != 0: raise RuntimeError("failed to analyze input file") @@ -386,12 +408,19 @@ def get_extractor( monitor = TaskMonitor.DUMMY # Import file - loader = pyghidra.program_loader().project(project).source(str(input_path)).name(input_path.name) + loader = ( + pyghidra.program_loader() + .project(project) + .source(str(input_path)) + .name(input_path.name) + ) with loader.load() as load_results: load_results.save(monitor) # Open program - program, consumer = pyghidra.consume_program(project, "/" + input_path.name) + program, consumer = pyghidra.consume_program( + project, "/" + input_path.name + ) # Analyze pyghidra.analyze(program, monitor) @@ -424,7 +453,9 @@ def __exit__(self, exc_type, exc_val, exc_tb): import capa.features.extractors.ghidra.extractor - return capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor(ctx_manager=cm, tmpdir=tmpdir) + return capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor( + ctx_manager=cm, tmpdir=tmpdir + ) else: raise ValueError("unexpected backend: " + backend) @@ -461,37 +492,55 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr if input_format == FORMAT_PE: import capa.features.extractors.pefile - file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) + file_extractors.append( + capa.features.extractors.pefile.PefileFeatureExtractor(input_file) + ) elif input_format == FORMAT_DOTNET: import capa.features.extractors.pefile import capa.features.extractors.dotnetfile - file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) - file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file)) + file_extractors.append( + capa.features.extractors.pefile.PefileFeatureExtractor(input_file) + ) + file_extractors.append( + capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file) + ) elif input_format == FORMAT_ELF: import capa.features.extractors.elffile - file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input_file)) + file_extractors.append( + capa.features.extractors.elffile.ElfFeatureExtractor(input_file) + ) elif input_format == FORMAT_CAPE: import capa.features.extractors.cape.extractor report = capa.helpers.load_json_from_path(input_file) - file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) + file_extractors.append( + capa.features.extractors.cape.extractor.CapeExtractor.from_report(report) + ) elif input_format == FORMAT_DRAKVUF: import capa.helpers import capa.features.extractors.drakvuf.extractor report = capa.helpers.load_jsonl_from_path(input_file) - file_extractors.append(capa.features.extractors.drakvuf.extractor.DrakvufExtractor.from_report(report)) + file_extractors.append( + capa.features.extractors.drakvuf.extractor.DrakvufExtractor.from_report( + report + ) + ) elif input_format == FORMAT_VMRAY: import capa.features.extractors.vmray.extractor - file_extractors.append(capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_file)) + file_extractors.append( + capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile( + input_file + ) + ) elif input_format == FORMAT_BINEXPORT2: file_extractors = _get_binexport2_file_extractors(input_file) @@ -501,7 +550,9 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr def get_signatures(sigs_path: Path) -> list[Path]: if not sigs_path.exists(): - raise IOError(f"signatures path {sigs_path} does not exist or cannot be accessed") + raise IOError( + f"signatures path {sigs_path} does not exist or cannot be accessed" + ) paths: list[Path] = [] if sigs_path.is_file(): @@ -525,7 +576,9 @@ def get_signatures(sigs_path: Path) -> list[Path]: return paths -def get_sample_analysis(format_, arch, os_, extractor, rules_path, feature_counts, library_functions): +def get_sample_analysis( + format_, arch, os_, extractor, rules_path, feature_counts, library_functions +): if isinstance(extractor, StaticFeatureExtractor): return rdoc.StaticAnalysis( format=format_, @@ -575,12 +628,20 @@ def collect_metadata( md5, sha1, sha256 = sample_hashes.md5, sample_hashes.sha1, sample_hashes.sha256 global_feats = list(extractor.extract_global_features()) - extractor_format = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Format)] - extractor_arch = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Arch)] - extractor_os = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.OS)] + extractor_format = [ + f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Format) + ] + extractor_arch = [ + f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Arch) + ] + extractor_os = [ + f.value for (f, _) in global_feats if isinstance(f, capa.features.common.OS) + ] input_format = ( - str(extractor_format[0]) if extractor_format else "unknown" if input_format == FORMAT_AUTO else input_format + str(extractor_format[0]) + if extractor_format + else "unknown" if input_format == FORMAT_AUTO else input_format ) arch = str(extractor_arch[0]) if extractor_arch else "unknown" os_ = str(extractor_os[0]) if extractor_os else "unknown" if os_ == OS_AUTO else os_ @@ -655,14 +716,18 @@ def result_rec(result: capa.features.common.Result): threads_by_process[p.address] = [] for t in extractor.get_threads(p): - calls_by_thread[t.address] = [] + # use setdefault so that a recycled TID (same pid+tid seen again) accumulates + # calls from all its instances rather than overwriting the prior instance's calls. + calls_by_thread.setdefault(t.address, []) for c in extractor.get_calls(p, t): if c.address in matched_calls: names_by_call[c.address] = extractor.get_call_name(p, t, c) calls_by_thread[t.address].append(c.address) - if calls_by_thread[t.address]: + # only register the thread address once; a recycled TID must not create + # a duplicate entry in threads_by_process or cause a double-add to matched_threads. + if calls_by_thread[t.address] and t.address not in matched_threads: matched_threads.add(t.address) threads_by_process[p.address].append(t.address) @@ -700,7 +765,9 @@ def result_rec(result: capa.features.common.Result): return layout -def compute_static_layout(rules: RuleSet, extractor: StaticFeatureExtractor, capabilities) -> rdoc.StaticLayout: +def compute_static_layout( + rules: RuleSet, extractor: StaticFeatureExtractor, capabilities +) -> rdoc.StaticLayout: """ compute a metadata structure that links basic blocks to the functions in which they're found. @@ -730,7 +797,9 @@ def compute_static_layout(rules: RuleSet, extractor: StaticFeatureExtractor, cap rdoc.FunctionLayout( address=frz.Address.from_capa(f), matched_basic_blocks=tuple( - rdoc.BasicBlockLayout(address=frz.Address.from_capa(bb)) for bb in bbs if bb in matched_bbs + rdoc.BasicBlockLayout(address=frz.Address.from_capa(bb)) + for bb in bbs + if bb in matched_bbs ), # this object is open to extension in the future, # such as with the function name, etc. ) diff --git a/tests/test_loader.py b/tests/test_loader.py new file mode 100644 index 000000000..2684087be --- /dev/null +++ b/tests/test_loader.py @@ -0,0 +1,117 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import MagicMock + +import capa.loader +import capa.features.common +from capa.features.extractors.base_extractor import ( + CallHandle, + SampleHashes, + ThreadHandle, + ProcessHandle, + ThreadAddress, + ProcessAddress, + DynamicCallAddress, + DynamicFeatureExtractor, +) + + +def test_compute_dynamic_layout_recycled_tid(): + """ + When an OS recycles a thread ID within the same process, compute_dynamic_layout + should accumulate calls from all instances of that TID rather than overwriting + the first instance's call list with an empty list. + + Without the fix the second iteration of the recycled TID resets + calls_by_thread[t.address] to [], causing matched calls from the first instance + to disappear from the layout and raising a ValueError during rendering. + + See #2619. + """ + proc_addr = ProcessAddress(pid=1000, ppid=0) + thread_addr = ThreadAddress(proc_addr, tid=42) # TID that will be recycled + + # A call from the *first* thread instance that a rule matched. + call_addr = DynamicCallAddress(thread_addr, id=0) + + proc_handle = ProcessHandle(address=proc_addr, inner=None) + # Two distinct handle objects sharing the same address — a recycled TID. + thread_handle_1 = ThreadHandle(address=thread_addr, inner="instance-1") + thread_handle_2 = ThreadHandle(address=thread_addr, inner="instance-2") + call_handle = CallHandle(address=call_addr, inner=None) + + class RecycledTidExtractor(DynamicFeatureExtractor): + def extract_global_features(self): + return iter([]) + + def extract_file_features(self): + return iter([]) + + def get_processes(self): + yield proc_handle + + def extract_process_features(self, ph): + return iter([]) + + def get_process_name(self, ph): + return "test.exe" + + def get_threads(self, ph): + # Yield the same thread address twice to simulate TID recycling. + yield thread_handle_1 + yield thread_handle_2 + + def extract_thread_features(self, ph, th): + return iter([]) + + def get_calls(self, ph, th): + # Only the first instance has the matched call; the second is empty. + if th is thread_handle_1: + yield call_handle + + def extract_call_features(self, ph, th, ch): + return iter([]) + + def get_call_name(self, ph, th, ch): + return "CreateFile(hFile)" + + extractor = RecycledTidExtractor( + SampleHashes(md5="a" * 32, sha1="a" * 40, sha256="a" * 64) + ) + + # Simulate a rule match at call_addr so it ends up in matched_calls. + result = capa.features.common.Result( + success=True, + statement=MagicMock(), + children=[], + locations={call_addr}, + ) + capabilities = {"test rule": [(call_addr, result)]} + + # Before the fix this raised a ValueError during rendering because the + # second thread instance (recycled TID) wiped the first instance's calls + # from calls_by_thread. + layout = capa.loader.compute_dynamic_layout(MagicMock(), extractor, capabilities) + + assert len(layout.processes) == 1 + proc_layout = layout.processes[0] + + # The thread should appear exactly once in the layout. + assert len(proc_layout.matched_threads) == 1 + thread_layout = proc_layout.matched_threads[0] + + # The call from the first thread instance must be present in the layout. + assert len(thread_layout.matched_calls) == 1 + assert thread_layout.matched_calls[0].name == "CreateFile(hFile)"