23 changes: 20 additions & 3 deletions pyproject.toml
@@ -8,13 +8,30 @@ dynamic = ["version"]
description = "dwarffi is a Python library for parsing ISF files and providing an interface to access kernel symbols and types."
readme = "README.md"
requires-python = ">=3.10"
-dependencies = []
+dependencies = [
+    "msgspec",
+]
authors = [
    { name = "Luke Craig", email = "luke.craig@mit.edu" }
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Intended Audience :: Developers",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: System :: Operating System Kernels",
]

[project.urls]
Repository = "https://github.com/rehosting/dwarffi"
Issues = "https://github.com/rehosting/dwarffi/issues"

[project.optional-dependencies]
dev = [
    "pytest>=8.0.0",
-    "ruff>=0.4.0",
+    "ujson>=5.11.0",
+    "ruff>=0.4.0"
]

[tool.pytest.ini_options]
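For context on the msgspec runtime dependency added above: msgspec decodes JSON directly into typed struct instances and validates the declared schema while parsing, which is what lets the refactor drop manual section checks. A minimal sketch with illustrative types (the real schema lives in dwarffi.types.ISFData, not shown here):

import msgspec

class Producer(msgspec.Struct):
    name: str

class Metadata(msgspec.Struct):
    format: str
    producer: Producer

meta = msgspec.json.decode(b'{"format": "1.0.0", "producer": {"name": "x"}}', type=Metadata)
assert meta.producer.name == "x"
# A missing field or mistyped value raises msgspec.ValidationError at decode time.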
156 changes: 156 additions & 0 deletions scripts/benchmark_json.py
@@ -0,0 +1,156 @@
import gc
import json
import time
import tracemalloc

try:
    import orjson
except ImportError:
    orjson = None

import msgspec

# Import the new ISFData struct from the refactored codebase
from dwarffi.types import ISFData


def generate_heavy_isf_payload() -> bytes:
    print("Generating synthetic heavy ISF payload...")
    isf = {
        "metadata": {"format": "1.0.0", "producer": {"name": "benchmark"}},
        "base_types": {
            "int": {"kind": "int", "size": 4, "signed": True, "endian": "little"},
            "pointer": {"kind": "pointer", "size": 8, "endian": "little"}
        },
        "user_types": {},
        "enums": {},
        "symbols": {},
        "typedefs": {}
    }

    # Generate 15,000 Structs
    for i in range(15000):
        isf["user_types"][f"struct_heavy_{i}"] = {
            "kind": "struct",
            "size": 16,
            "fields": {
                "field_a": {"offset": 0, "type": {"kind": "base", "name": "int"}},
                "field_b": {"offset": 8, "type": {"kind": "pointer", "subtype": {"kind": "base", "name": "int"}}}
            }
        }

    # Generate 5,000 Enums
    for i in range(5000):
        isf["enums"][f"enum_heavy_{i}"] = {
            "size": 4,
            "base": "int",
            "constants": {"A": 0, "B": 1, "C": 2}
        }

    # Generate 50,000 Symbols
    for i in range(50000):
        isf["symbols"][f"sys_func_{i}"] = {
            "address": 0xFFFFFFFF81000000 + (i * 16),
            "type": {"kind": "struct", "name": f"struct_heavy_{i % 15000}"}
        }

    payload = json.dumps(isf).encode("utf-8")
    print(f"Payload generated: {len(payload) / 1024 / 1024:.2f} MB\n")
    return payload


def parse_old_way_json(data: bytes):
    """Simulates the legacy VtypeJson parsing and validation using standard json."""
    raw_data = json.loads(data)

    # Legacy manual schema validation
    required_sections = ["base_types", "user_types"]
    missing = [s for s in required_sections if s not in raw_data]
    if missing:
        raise ValueError(f"ISF is missing required top-level sections: {missing}")

    for name, definition in raw_data.get("user_types", {}).items():
        if "kind" not in definition:
            raise ValueError(f"User type '{name}' is missing the required 'kind' field.")

    return raw_data


def parse_old_way_orjson(data: bytes):
    """Simulates the legacy VtypeJson parsing and validation using orjson (if available)."""
    raw_data = orjson.loads(data)

    required_sections = ["base_types", "user_types"]
    missing = [s for s in required_sections if s not in raw_data]
    if missing:
        raise ValueError(f"ISF is missing required top-level sections: {missing}")

    for name, definition in raw_data.get("user_types", {}).items():
        if "kind" not in definition:
            raise ValueError(f"User type '{name}' is missing the required 'kind' field.")

    return raw_data


def parse_new_way_msgspec(data: bytes):
    """Simulates the new msgspec strict-schema decoding."""
    return msgspec.json.decode(data, type=ISFData)


def run_benchmark(name: str, func, payload: bytes, iterations: int = 5):
    print(f"--- Benchmarking: {name} ---")

    # 1. Measure Time
    gc.disable()  # Disable GC to isolate parsing CPU time
    start_time = time.perf_counter()
    for _ in range(iterations):
        _ = func(payload)
    end_time = time.perf_counter()
    gc.enable()

    avg_time = (end_time - start_time) / iterations

    # 2. Measure Memory
    gc.collect()
    tracemalloc.start()

    parsed_obj = func(payload)

    current_mem, peak_mem = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # Keep object alive intentionally so 'current_mem' reflects retained size
    _ = parsed_obj

    print(f" Average Time : {avg_time:.4f} seconds")
    print(f" Peak Memory : {peak_mem / 1024 / 1024:.2f} MB")
    print(f" Retained Mem : {current_mem / 1024 / 1024:.2f} MB\n")

    return avg_time, peak_mem, current_mem


def main():
    payload = generate_heavy_isf_payload()
    iterations = 10

    time_json, peak_json, ret_json = run_benchmark("Legacy Parser (Standard 'json')", parse_old_way_json, payload, iterations)

    if orjson:
        time_orjson, peak_orjson, ret_orjson = run_benchmark("Legacy Parser ('orjson')", parse_old_way_orjson, payload, iterations)
    else:
        print("--- Benchmarking: Legacy Parser ('orjson') ---\n [orjson not installed, skipping]\n")
        time_orjson = float('inf')

    time_msgspec, peak_msgspec, ret_msgspec = run_benchmark("New Parser ('msgspec')", parse_new_way_msgspec, payload, iterations)

    print("================ SUMMARY ================")
    print(f"Speedup vs standard json : {time_json / time_msgspec:.2f}x faster")
    if orjson:
        print(f"Speedup vs orjson : {time_orjson / time_msgspec:.2f}x faster")

    print(f"Peak Memory Reduction : {peak_json / peak_msgspec:.2f}x less RAM used during parsing")
    print(f"Retained Memory Reduction: {ret_json / ret_msgspec:.2f}x less RAM held by parsed objects")
    print("=========================================")


if __name__ == "__main__":
    main()
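To reproduce the numbers locally, one plausible invocation (assuming the package is installed in editable mode, with orjson added for the optional comparison; neither command is part of this PR) is:

pip install -e . orjson
python scripts/benchmark_json.py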
48 changes: 24 additions & 24 deletions src/dwarffi/dffi.py
@@ -13,7 +13,7 @@
from .backend import BytesBackend, LiveMemoryProxy, MemoryBackend
from .instances import BoundArrayView, BoundTypeInstance, Ptr
from .parser import VtypeJson
-from .types import VtypeBaseType, VtypeEnum, VtypeSymbol, VtypeUserType
+from .types import VtypeBaseType, VtypeEnum, VtypeStructField, VtypeSymbol, VtypeUserType

# Clean, unified Type Aliases
VTYPE_CLASSES = (VtypeBaseType, VtypeEnum, VtypeUserType)
@@ -96,7 +96,7 @@ def symbols(self) -> Dict[str, Any]:
        # First loaded wins => iterate in load order and don't overwrite.
        for path in self._file_order:
            vj = self.vtypejsons[path]
-            for sym_name in vj._raw_symbols.keys():
+            for sym_name in vj._isf.symbols.keys():
                if sym_name in merged:
                    continue
                sym = self.get_symbol(sym_name, path=path, include_incomplete=True)
@@ -110,7 +110,7 @@ def types(self) -> Dict[str, "VtypeUserType"]:
"""Returns a dictionary of all user types (structs/unions) across all loaded ISF files."""
merged = {}
for path in reversed(self._file_order):
for type_name in self.vtypejsons[path]._raw_user_types.keys():
for type_name in self.vtypejsons[path]._isf.user_types.keys():
t = self.get_user_type(type_name)
if t:
merged[type_name] = t
@@ -121,7 +121,7 @@ def base_types(self) -> Dict[str, "VtypeBaseType"]:
"""Returns a dictionary of all base types across all loaded ISF files."""
merged = {}
for path in reversed(self._file_order):
for type_name in self.vtypejsons[path]._raw_base_types.keys():
for type_name in self.vtypejsons[path]._isf.base_types.keys():
t = self.get_base_type(type_name)
if t:
merged[type_name] = t
@@ -132,7 +132,7 @@ def enums(self) -> Dict[str, "VtypeEnum"]:
"""Returns a dictionary of all enums across all loaded ISF files."""
merged = {}
for path in reversed(self._file_order):
for enum_name in self.vtypejsons[path]._raw_enums.keys():
for enum_name in self.vtypejsons[path]._isf.enums.keys():
t = self.get_enum(enum_name)
if t:
merged[enum_name] = t
@@ -152,7 +152,7 @@ def _resolve_type_info(self, type_info: Dict[str, Any]) -> Dict[str, Any]:

        td = None
        for f in self._file_order:
-            td = self.vtypejsons[f]._raw_typedefs.get(name)
+            td = self.vtypejsons[f]._isf.typedefs.get(name)
            if td:
                break
        if not td:
@@ -599,12 +599,12 @@ def new(self, ctype: Union[str, Vtype, dict], init: Any = None) -> BoundType:
        # without modifying the core instances engine.
        dummy_name = f"__dummy_{id(buf)}"
        primary_isf_path = self._file_order[0]
-        self.vtypejsons[primary_isf_path]._raw_user_types[dummy_name] = {
-            "kind": "struct",
-            "size": size,
-            "fields": {"arr": {"offset": 0, "type": t}},
-        }
-        self.vtypejsons[primary_isf_path]._parsed_user_types_cache.pop(dummy_name, None)
+        self.vtypejsons[primary_isf_path]._isf.user_types[dummy_name] = VtypeUserType(
+            kind="struct",
+            size=size,
+            fields={"arr": VtypeStructField(type_info=t, offset=0, name="arr")},
+            name=dummy_name
+        )

        instance = self._create_instance(dummy_name, buf)
        arr_view = instance.arr
@@ -709,12 +709,12 @@ def from_address(self, ctype: Union[str, Vtype, dict], address: int) -> BoundType:
        dummy_size = count * elem_size
        dummy_name = f"__dummy_backend_{address}_{hash(str(t_dummy))}"
        primary_isf_path = self._file_order[0]
-        self.vtypejsons[primary_isf_path]._raw_user_types[dummy_name] = {
-            "kind": "struct",
-            "size": dummy_size,
-            "fields": {"arr": {"offset": 0, "type": t_dummy}},
-        }
-        self.vtypejsons[primary_isf_path]._parsed_user_types_cache.pop(dummy_name, None)
+        self.vtypejsons[primary_isf_path]._isf.user_types[dummy_name] = VtypeUserType(
+            kind="struct",
+            size=dummy_size,
+            fields={"arr": VtypeStructField(type_info=t_dummy, offset=0, name="arr")},
+            name=dummy_name
+        )

        instance = self._create_instance(dummy_name, proxy, instance_offset_in_buffer=address)
        return instance.arr
@@ -782,12 +782,12 @@ def from_buffer(
dummy_name = f"__dummy_{id(python_buffer)}_{offset}_{hash(str(t))}"
primary_isf_path = self._file_order[0]

self.vtypejsons[primary_isf_path]._raw_user_types[dummy_name] = {
"kind": "struct",
"size": dummy_size,
"fields": {"arr": {"offset": 0, "type": t}},
}
self.vtypejsons[primary_isf_path]._parsed_user_types_cache.pop(dummy_name, None)
self.vtypejsons[primary_isf_path]._isf.user_types[dummy_name] = VtypeUserType(
kind="struct",
size=dummy_size,
fields={"arr": VtypeStructField(type_info=t, offset=0, name="arr")},
name=dummy_name
)

instance = self._create_instance(dummy_name, python_buffer, instance_offset_in_buffer=offset, base_address=address)
return instance.arr
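A note on why the _parsed_user_types_cache.pop(...) calls disappear in the three hunks above: with lazily parsed raw dicts, mutating the raw mapping required invalidating the parsed cache by hand, whereas decoding straight into typed structs leaves only one copy to update. A standalone toy illustration of the stale-cache hazard (names here are hypothetical, not dwarffi's API):

# Toy model of the lazy raw-dict + parsed-cache design that typed decoding replaces.
raw_types = {"foo": {"kind": "struct", "size": 4}}
parsed_cache = {}

def get_type(name):
    # Parse on first access, then serve from the cache.
    if name not in parsed_cache:
        parsed_cache[name] = dict(raw_types[name])  # stand-in for real parsing
    return parsed_cache[name]

get_type("foo")                                   # populates the cache
raw_types["foo"] = {"kind": "struct", "size": 8}
parsed_cache.pop("foo", None)                     # forget this and stale data is served
assert get_type("foo")["size"] == 8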