23 changes: 20 additions & 3 deletions pyproject.toml
@@ -8,13 +8,30 @@ dynamic = ["version"]
description = "dwarffi is a Python library for parsing ISF files and providing an interface to access kernel symbols and types."
readme = "README.md"
requires-python = ">=3.10"
-dependencies = []
+dependencies = [
+    "msgspec",
+]
authors = [
    { name = "Luke Craig", email = "luke.craig@mit.edu" }
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Intended Audience :: Developers",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: System :: Operating System Kernels",
]

[project.urls]
Repository = "https://github.com/rehosting/dwarffi"
Issues = "https://github.com/rehosting/dwarffi/issues"

[project.optional-dependencies]
dev = [
    "pytest>=8.0.0",
-    "ruff>=0.4.0",
+    "ujson>=5.11.0",
+    "ruff>=0.4.0"
]

[tool.pytest.ini_options]
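For context on the msgspec runtime dependency added above: msgspec decodes JSON directly into typed struct instances and validates the declared schema while parsing, which is what lets the refactor drop manual section checks. A minimal sketch with illustrative types (the real schema lives in dwarffi.types.ISFData, not shown here):

import msgspec

class Producer(msgspec.Struct):
    name: str

class Metadata(msgspec.Struct):
    format: str
    producer: Producer

meta = msgspec.json.decode(b'{"format": "1.0.0", "producer": {"name": "x"}}', type=Metadata)
assert meta.producer.name == "x"
# A missing field or mistyped value raises msgspec.ValidationError at decode time.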
156 changes: 156 additions & 0 deletions scripts/benchmark_json.py
@@ -0,0 +1,156 @@
import gc
import json
import time
import tracemalloc

try:
    import orjson
except ImportError:
    orjson = None

import msgspec

# Import the new ISFData struct from the refactored codebase
from dwarffi.types import ISFData


def generate_heavy_isf_payload() -> bytes:
    print("Generating synthetic heavy ISF payload...")
    isf = {
        "metadata": {"format": "1.0.0", "producer": {"name": "benchmark"}},
        "base_types": {
            "int": {"kind": "int", "size": 4, "signed": True, "endian": "little"},
            "pointer": {"kind": "pointer", "size": 8, "endian": "little"}
        },
        "user_types": {},
        "enums": {},
        "symbols": {},
        "typedefs": {}
    }

    # Generate 15,000 Structs
    for i in range(15000):
        isf["user_types"][f"struct_heavy_{i}"] = {
            "kind": "struct",
            "size": 16,
            "fields": {
                "field_a": {"offset": 0, "type": {"kind": "base", "name": "int"}},
                "field_b": {"offset": 8, "type": {"kind": "pointer", "subtype": {"kind": "base", "name": "int"}}}
            }
        }

    # Generate 5,000 Enums
    for i in range(5000):
        isf["enums"][f"enum_heavy_{i}"] = {
            "size": 4,
            "base": "int",
            "constants": {"A": 0, "B": 1, "C": 2}
        }

    # Generate 50,000 Symbols
    for i in range(50000):
        isf["symbols"][f"sys_func_{i}"] = {
            "address": 0xFFFFFFFF81000000 + (i * 16),
            "type": {"kind": "struct", "name": f"struct_heavy_{i % 15000}"}
        }

    payload = json.dumps(isf).encode("utf-8")
    print(f"Payload generated: {len(payload) / 1024 / 1024:.2f} MB\n")
    return payload


def parse_old_way_json(data: bytes):
    """Simulates the legacy VtypeJson parsing and validation using standard json."""
    raw_data = json.loads(data)

    # Legacy manual schema validation
    required_sections = ["base_types", "user_types"]
    missing = [s for s in required_sections if s not in raw_data]
    if missing:
        raise ValueError(f"ISF is missing required top-level sections: {missing}")

    for name, definition in raw_data.get("user_types", {}).items():
        if "kind" not in definition:
            raise ValueError(f"User type '{name}' is missing the required 'kind' field.")

    return raw_data


def parse_old_way_orjson(data: bytes):
    """Simulates the legacy VtypeJson parsing and validation using orjson (if available)."""
    raw_data = orjson.loads(data)

    required_sections = ["base_types", "user_types"]
    missing = [s for s in required_sections if s not in raw_data]
    if missing:
        raise ValueError(f"ISF is missing required top-level sections: {missing}")

    for name, definition in raw_data.get("user_types", {}).items():
        if "kind" not in definition:
            raise ValueError(f"User type '{name}' is missing the required 'kind' field.")

    return raw_data


def parse_new_way_msgspec(data: bytes):
    """Simulates the new msgspec strict-schema decoding."""
    return msgspec.json.decode(data, type=ISFData)


def run_benchmark(name: str, func, payload: bytes, iterations: int = 5):
    print(f"--- Benchmarking: {name} ---")

    # 1. Measure Time
    gc.disable()  # Disable GC to isolate parsing CPU time
    start_time = time.perf_counter()
    for _ in range(iterations):
        _ = func(payload)
    end_time = time.perf_counter()
    gc.enable()

    avg_time = (end_time - start_time) / iterations

    # 2. Measure Memory
    gc.collect()
    tracemalloc.start()

    parsed_obj = func(payload)

    current_mem, peak_mem = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # Keep object alive intentionally so 'current_mem' reflects retained size
    _ = parsed_obj

    print(f" Average Time : {avg_time:.4f} seconds")
    print(f" Peak Memory : {peak_mem / 1024 / 1024:.2f} MB")
    print(f" Retained Mem : {current_mem / 1024 / 1024:.2f} MB\n")

    return avg_time, peak_mem, current_mem


def main():
    payload = generate_heavy_isf_payload()
    iterations = 10

    time_json, peak_json, ret_json = run_benchmark("Legacy Parser (Standard 'json')", parse_old_way_json, payload, iterations)

    if orjson:
        time_orjson, peak_orjson, ret_orjson = run_benchmark("Legacy Parser ('orjson')", parse_old_way_orjson, payload, iterations)
    else:
        print("--- Benchmarking: Legacy Parser ('orjson') ---\n [orjson not installed, skipping]\n")
        time_orjson = float('inf')

    time_msgspec, peak_msgspec, ret_msgspec = run_benchmark("New Parser ('msgspec')", parse_new_way_msgspec, payload, iterations)

    print("================ SUMMARY ================")
    print(f"Speedup vs standard json : {time_json / time_msgspec:.2f}x faster")
    if orjson:
        print(f"Speedup vs orjson : {time_orjson / time_msgspec:.2f}x faster")

    print(f"Peak Memory Reduction : {peak_json / peak_msgspec:.2f}x less RAM used during parsing")
    print(f"Retained Memory Reduction: {ret_json / ret_msgspec:.2f}x less RAM held by parsed objects")
    print("=========================================")


if __name__ == "__main__":
    main()
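To reproduce the numbers locally, one plausible invocation (assuming the package is installed in editable mode, with orjson added for the optional comparison; neither command is part of this PR) is:

pip install -e . orjson
python scripts/benchmark_json.py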
48 changes: 24 additions & 24 deletions src/dwarffi/dffi.py
@@ -13,7 +13,7 @@
from .backend import BytesBackend, LiveMemoryProxy, MemoryBackend
from .instances import BoundArrayView, BoundTypeInstance, Ptr
from .parser import VtypeJson
-from .types import VtypeBaseType, VtypeEnum, VtypeSymbol, VtypeUserType
+from .types import VtypeBaseType, VtypeEnum, VtypeStructField, VtypeSymbol, VtypeUserType

# Clean, unified Type Aliases
VTYPE_CLASSES = (VtypeBaseType, VtypeEnum, VtypeUserType)
@@ -96,7 +96,7 @@ def symbols(self) -> Dict[str, Any]:
        # First loaded wins => iterate in load order and don't overwrite.
        for path in self._file_order:
            vj = self.vtypejsons[path]
-            for sym_name in vj._raw_symbols.keys():
+            for sym_name in vj._isf.symbols.keys():
                if sym_name in merged:
                    continue
                sym = self.get_symbol(sym_name, path=path, include_incomplete=True)
@@ -110,7 +110,7 @@ def types(self) -> Dict[str, "VtypeUserType"]:
"""Returns a dictionary of all user types (structs/unions) across all loaded ISF files."""
merged = {}
for path in reversed(self._file_order):
for type_name in self.vtypejsons[path]._raw_user_types.keys():
for type_name in self.vtypejsons[path]._isf.user_types.keys():
t = self.get_user_type(type_name)
if t:
merged[type_name] = t
@@ -121,7 +121,7 @@ def base_types(self) -> Dict[str, "VtypeBaseType"]:
"""Returns a dictionary of all base types across all loaded ISF files."""
merged = {}
for path in reversed(self._file_order):
for type_name in self.vtypejsons[path]._raw_base_types.keys():
for type_name in self.vtypejsons[path]._isf.base_types.keys():
t = self.get_base_type(type_name)
if t:
merged[type_name] = t
@@ -132,7 +132,7 @@ def enums(self) -> Dict[str, "VtypeEnum"]:
"""Returns a dictionary of all enums across all loaded ISF files."""
merged = {}
for path in reversed(self._file_order):
for enum_name in self.vtypejsons[path]._raw_enums.keys():
for enum_name in self.vtypejsons[path]._isf.enums.keys():
t = self.get_enum(enum_name)
if t:
merged[enum_name] = t
@@ -152,7 +152,7 @@ def _resolve_type_info(self, type_info: Dict[str, Any]) -> Dict[str, Any]:

        td = None
        for f in self._file_order:
-            td = self.vtypejsons[f]._raw_typedefs.get(name)
+            td = self.vtypejsons[f]._isf.typedefs.get(name)
            if td:
                break
        if not td:
@@ -599,12 +599,12 @@ def new(self, ctype: Union[str, Vtype, dict], init: Any = None) -> BoundType:
        # without modifying the core instances engine.
        dummy_name = f"__dummy_{id(buf)}"
        primary_isf_path = self._file_order[0]
-        self.vtypejsons[primary_isf_path]._raw_user_types[dummy_name] = {
-            "kind": "struct",
-            "size": size,
-            "fields": {"arr": {"offset": 0, "type": t}},
-        }
-        self.vtypejsons[primary_isf_path]._parsed_user_types_cache.pop(dummy_name, None)
+        self.vtypejsons[primary_isf_path]._isf.user_types[dummy_name] = VtypeUserType(
+            kind="struct",
+            size=size,
+            fields={"arr": VtypeStructField(type_info=t, offset=0, name="arr")},
+            name=dummy_name
+        )

        instance = self._create_instance(dummy_name, buf)
        arr_view = instance.arr
@@ -709,12 +709,12 @@ def from_address(self, ctype: Union[str, Vtype, dict], address: int) -> BoundType:
        dummy_size = count * elem_size
        dummy_name = f"__dummy_backend_{address}_{hash(str(t_dummy))}"
        primary_isf_path = self._file_order[0]
-        self.vtypejsons[primary_isf_path]._raw_user_types[dummy_name] = {
-            "kind": "struct",
-            "size": dummy_size,
-            "fields": {"arr": {"offset": 0, "type": t_dummy}},
-        }
-        self.vtypejsons[primary_isf_path]._parsed_user_types_cache.pop(dummy_name, None)
+        self.vtypejsons[primary_isf_path]._isf.user_types[dummy_name] = VtypeUserType(
+            kind="struct",
+            size=dummy_size,
+            fields={"arr": VtypeStructField(type_info=t_dummy, offset=0, name="arr")},
+            name=dummy_name
+        )

        instance = self._create_instance(dummy_name, proxy, instance_offset_in_buffer=address)
        return instance.arr
@@ -782,12 +782,12 @@ def from_buffer(
dummy_name = f"__dummy_{id(python_buffer)}_{offset}_{hash(str(t))}"
primary_isf_path = self._file_order[0]

self.vtypejsons[primary_isf_path]._raw_user_types[dummy_name] = {
"kind": "struct",
"size": dummy_size,
"fields": {"arr": {"offset": 0, "type": t}},
}
self.vtypejsons[primary_isf_path]._parsed_user_types_cache.pop(dummy_name, None)
self.vtypejsons[primary_isf_path]._isf.user_types[dummy_name] = VtypeUserType(
kind="struct",
size=dummy_size,
fields={"arr": VtypeStructField(type_info=t, offset=0, name="arr")},
name=dummy_name
)

instance = self._create_instance(dummy_name, python_buffer, instance_offset_in_buffer=offset, base_address=address)
return instance.arr
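A note on why the _parsed_user_types_cache.pop(...) calls disappear in the three hunks above: with lazily parsed raw dicts, mutating the raw mapping required invalidating the parsed cache by hand, whereas decoding straight into typed structs leaves only one copy to update. A standalone toy illustration of the stale-cache hazard (names here are hypothetical, not dwarffi's API):

# Toy model of the lazy raw-dict + parsed-cache design that typed decoding replaces.
raw_types = {"foo": {"kind": "struct", "size": 4}}
parsed_cache = {}

def get_type(name):
    # Parse on first access, then serve from the cache.
    if name not in parsed_cache:
        parsed_cache[name] = dict(raw_types[name])  # stand-in for real parsing
    return parsed_cache[name]

get_type("foo")                                   # populates the cache
raw_types["foo"] = {"kind": "struct", "size": 8}
parsed_cache.pop("foo", None)                     # forget this and stale data is served
assert get_type("foo")["size"] == 8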