From ecd63e9d004442af59fb7f67ad487d54946487f5 Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Thu, 8 Jan 2026 17:33:33 +0100 Subject: [PATCH 01/13] Use temporary table for adding entry table --- dissect/evidence/asdf/asdf.py | 96 ++++++++++++++++++++--------------- tests/test_asdf.py | 18 +++---- 2 files changed, 64 insertions(+), 50 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 7622316..a877f88 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -9,7 +9,7 @@ import uuid from bisect import bisect_right from collections import defaultdict -from typing import TYPE_CHECKING, BinaryIO +from typing import TYPE_CHECKING, Any, BinaryIO from dissect.util import ts from dissect.util.stream import AlignedStream, RangeStream @@ -23,7 +23,7 @@ ) if TYPE_CHECKING: - from collections.abc import Callable, Iterator + from collections.abc import Callable, Iterator, ValuesView SnapshotTableEntry = tuple[int, int, int, int] @@ -42,6 +42,49 @@ SPARSE_BYTES = b"\xa5\xdf" +class Table: + def __init__(self) -> None: + self._table: dict[int, list[tuple[int, ...]]] = defaultdict(list) + self._lookup: dict[int, list[int]] = defaultdict(list) + self._entries = 0 + self.offset = 0 + + def __bool__(self): + return bool(self._table) + + def __contains__(self, obj: Any) -> bool: + return obj in self._table + + def get(self, index: int) -> tuple[list, list]: + return self._table[index], self._lookup[index] + + def add(self, index: int, table_idx: int, entry: tuple, offset: int) -> None: + self._table[index].insert(table_idx, entry) + self._lookup[index].insert(table_idx, offset) + self._entries += 1 + + def lookup(self, idx: int) -> list[int]: + return self._lookup.get(idx) + + def values(self) -> ValuesView[list[tuple]]: + return self._table.values() + + def write(self) -> bytes: + result = [] + for stream_table in self._table.values(): + for flags, idx, offset, size, file_offset, file_size in stream_table: + table_entry = c_asdf.table_entry( + flags=flags, + idx=idx, + offset=offset, + size=size, + file_offset=file_offset, + file_size=file_size, + ) + result.append(table_entry.dumps()) + return b"".join(result) + + class AsdfWriter(io.RawIOBase): """ASDF file writer. 
@@ -79,9 +122,7 @@ def __init__( self.block_crc = block_crc self.block_compress = False # Disabled for now - self._table = defaultdict(list) - self._table_lookup = defaultdict(list) - self._table_offset = 0 + self._table = Table() self._meta_buf = io.BytesIO() self._meta_tar = tarfile.open(fileobj=self._meta_buf, mode="w") # noqa: SIM115 @@ -232,8 +273,7 @@ def _write_block(self, source: BinaryIO, offset: int, size: int, idx: int = 0, b """ absolute_offset = base + offset - lookup_table = self._table_lookup[idx] - entry_table = self._table[idx] + entry_table, lookup_table = self._table.get(idx) table_idx, absolute_offset, size = _table_fit( absolute_offset, size, entry_table, lookup_table, lambda e: (e[2], e[3]) @@ -271,9 +311,7 @@ def _write_block(self, source: BinaryIO, offset: int, size: int, idx: int = 0, b outfh.finalize() data_size = self.fh.tell() - data_offset - - lookup_table.insert(table_idx, absolute_offset) - entry_table.insert(table_idx, (flags, idx, absolute_offset, size, block_offset, data_size)) + self._table.add(idx, table_idx, (flags, idx, absolute_offset, size, block_offset, data_size), absolute_offset) def _write_meta(self) -> None: """Write the metadata tar to the destination file-like object.""" @@ -286,17 +324,7 @@ def _write_meta(self) -> None: def _write_table(self) -> None: """Write the ASDF block table to the destination file-like object.""" self._table_offset = self.fh.tell() - for stream_table in self._table.values(): - for flags, idx, offset, size, file_offset, file_size in stream_table: - table_entry = c_asdf.table_entry( - flags=flags, - idx=idx, - offset=offset, - size=size, - file_offset=file_offset, - file_size=file_size, - ) - table_entry.write(self.fh) + self.fh.write(self._table.write()) def _write_footer(self) -> None: """Write the ASDF footer to the destination file-like object.""" @@ -327,8 +355,7 @@ def __init__(self, fh: BinaryIO, recover: bool = False): self.timestamp = ts.from_unix(self.header.timestamp) self.guid = uuid.UUID(bytes_le=self.header.guid) - self.table: dict[list[SnapshotTableEntry]] = defaultdict(list) - self._table_lookup: dict[list[int]] = defaultdict(list) + self.table = Table() footer_offset = self.fh.seek(-len(c_asdf.footer), io.SEEK_END) @@ -361,11 +388,9 @@ def _recover_block_table(self) -> None: self._table_insert(block.idx, block.offset, block.size, file_offset) def _table_insert(self, idx: int, offset: int, size: int, file_offset: int) -> None: - stream_idx = idx entry_data_offset = file_offset + len(c_asdf.block) - lookup_table = self._table_lookup[stream_idx] - entry_table = self.table[stream_idx] + entry_table, lookup_table = self.table.get(idx) table_idx, entry_offset, entry_size = _table_fit( offset, size, entry_table, lookup_table, lambda e: (e[0], e[1]) @@ -375,17 +400,7 @@ def _table_insert(self, idx: int, offset: int, size: int, file_offset: int) -> N return entry_data_offset += entry_offset - offset - - lookup_table.insert(table_idx, entry_offset) - entry_table.insert( - table_idx, - ( - entry_offset, - entry_size, - file_offset, - entry_data_offset, - ), - ) + self.table.add(idx, table_idx, (entry_offset, entry_size, file_offset, entry_data_offset), entry_offset) def contains(self, idx: int) -> bool: """Check whether this file contains the given stream index. 
@@ -407,12 +422,12 @@ def open(self, idx: int) -> AsdfStream: def streams(self) -> Iterator[AsdfStream]: """Iterate over all streams in the file.""" - for i in sorted(self.table.keys()): + for i in sorted(self.table._table.keys()): yield self.open(i) def disks(self) -> Iterator[AsdfStream]: """Iterate over all non-reserved streams in the file.""" - for i in sorted(self.table.keys()): + for i in sorted(self.table._table.keys()): if i in RESERVED_IDX: continue yield self.open(i) @@ -459,8 +474,7 @@ def __init__(self, asdf: AsdfSnapshot, idx: int): self.fh = asdf.fh self.asdf = asdf self.idx = idx - self.table = asdf.table[idx] - self._table_lookup = asdf._table_lookup[idx] + self.table, self._table_lookup = asdf.table.get(idx) # We don't actually know the size of the source disk # Doesn't really matter though, just take the last run offset + size diff --git a/tests/test_asdf.py b/tests/test_asdf.py index d1b6e33..10f89bc 100644 --- a/tests/test_asdf.py +++ b/tests/test_asdf.py @@ -62,16 +62,16 @@ def test_asdf(asdf_writer: AsdfWriter) -> None: def test_asdf_overlap(asdf_writer: AsdfWriter) -> None: asdf_writer.add_bytes(b"\x01" * 100, base=0) asdf_writer.add_bytes(b"\x02" * 100, base=200) - assert asdf_writer._table_lookup[0] == [0, 200] + assert asdf_writer._table.lookup(0) == [0, 200] asdf_writer.add_bytes(b"\x03" * 100, base=50) - assert asdf_writer._table_lookup[0] == [0, 100, 200] + assert asdf_writer._table.lookup(0) == [0, 100, 200] asdf_writer.add_bytes(b"\x04" * 150, base=100) - assert asdf_writer._table_lookup[0] == [0, 100, 150, 200] + assert asdf_writer._table.lookup(0) == [0, 100, 150, 200] asdf_writer.add_bytes(b"\x05" * 50, base=25) - assert asdf_writer._table_lookup[0] == [0, 100, 150, 200] + assert asdf_writer._table.lookup(0) == [0, 100, 150, 200] asdf_writer.close() asdf_writer._fh.seek(0) @@ -93,9 +93,9 @@ def test_asdf_overlap_all(asdf_writer: AsdfWriter) -> None: asdf_writer.add_bytes(b"\x02" * 100, base=200) asdf_writer.add_bytes(b"\x03" * 100, base=50) asdf_writer.add_bytes(b"\x04" * 150, base=100) - assert asdf_writer._table_lookup[0] == [0, 100, 150, 200] + assert asdf_writer._table.lookup(0) == [0, 100, 150, 200] asdf_writer.add_bytes(b"\x06" * 400, base=0) - assert asdf_writer._table_lookup[0] == [0, 100] + assert asdf_writer._table.lookup(0) == [0, 100] asdf_writer.close() asdf_writer._fh.seek(0) @@ -113,10 +113,10 @@ def test_asdf_overlap_all(asdf_writer: AsdfWriter) -> None: def test_asdf_overlap_contiguous(asdf_writer: AsdfWriter) -> None: asdf_writer.add_bytes(b"\x01" * 100, base=0) asdf_writer.add_bytes(b"\x02" * 100, base=100) - assert asdf_writer._table_lookup[0] == [0, 100] + assert asdf_writer._table.lookup(0) == [0, 100] asdf_writer.add_bytes(b"\x03" * 75, base=50) - assert asdf_writer._table_lookup[0] == [0, 100] + assert asdf_writer._table.lookup(0) == [0, 100] asdf_writer.close() asdf_writer._fh.seek(0) @@ -135,7 +135,7 @@ def test_asdf_overlap_seek(asdf_writer: AsdfWriter) -> None: asdf_writer.add_bytes(b"\x00" * 100, base=0) asdf_writer.add_bytes(b"\x00" * 100, base=200) asdf_writer.add_bytes(bytes(range(200)), base=50) - assert asdf_writer._table_lookup[0] == [0, 100, 200] + assert asdf_writer._table.lookup(0) == [0, 100, 200] asdf_writer.close() asdf_writer._fh.seek(0) From a154376397a014821ea30d97cc4d5254b0773c1a Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Wed, 21 Jan 2026 13:46:45 +0100 Subject: [PATCH 02/13] Add new table_index checks Add functionality to search for the specific index tables --- dissect/evidence/asdf/asdf.py | 102 
++++++++++++++++++++++++------- dissect/evidence/asdf/c_asdf.py | 6 ++ dissect/evidence/asdf/c_asdf.pyi | 16 ++++- dissect/evidence/asdf/stream.py | 1 + tests/conftest.py | 13 ++-- 5 files changed, 110 insertions(+), 28 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index a877f88..86ac845 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -4,6 +4,7 @@ import gzip import io +import itertools import shutil import tarfile import uuid @@ -41,10 +42,14 @@ FOOTER_MAGIC = b"FT\xa5\xdf" SPARSE_BYTES = b"\xa5\xdf" +DEFAULT_NR_OF_ENTRIES = 4 * 1024 * 1024 // len(c_asdf.table_entry) + class Table: - def __init__(self) -> None: + def __init__(self, fh: BinaryIO) -> None: + self._fh = fh self._table: dict[int, list[tuple[int, ...]]] = defaultdict(list) + self._table_offsets: list[tuple[int, list[int]]] = [] self._lookup: dict[int, list[int]] = defaultdict(list) self._entries = 0 self.offset = 0 @@ -52,6 +57,9 @@ def __init__(self) -> None: def __bool__(self): return bool(self._table) + def __len__(self): + return self._entries + def __contains__(self, obj: Any) -> bool: return obj in self._table @@ -63,25 +71,53 @@ def add(self, index: int, table_idx: int, entry: tuple, offset: int) -> None: self._lookup[index].insert(table_idx, offset) self._entries += 1 - def lookup(self, idx: int) -> list[int]: - return self._lookup.get(idx) + def indexes(self) -> list[int]: + indexes = sum(1 << key for key in self._table) + return [(indexes >> (x * 64)) & 0xFFFF_FFFF_FFFF_FFFF for x in range(256 // 64)] - def values(self) -> ValuesView[list[tuple]]: + def lookup(self, idx: int) -> list[int]: + prev_offset = self._fh.tell() + index_index = (idx // 64) - (1 if idx != 0 else 0) + look = [] + + # Lookup offsets + for offset, indexes in sorted(self._table_offsets, key=lambda x: x[0]): + index = indexes[index_index] + if (1 << (idx % 64)) & index: + self._fh.seek(offset, 0) + table = c_asdf.table_index(self._fh) + count = table.size // len(c_asdf.table_entry) + entries = c_asdf.table_entry[count](self._fh.read(table.size)) + look.extend(filter(lambda x: x.idx == idx, entries)) + self._fh.seek(prev_offset, 0) + if idx in self._table: + look.extend(x[2] for x in self._table[idx]) + + return look + + def values(self) -> ValuesView[list[tuple[int, ...]]]: return self._table.values() def write(self) -> bytes: result = [] - for stream_table in self._table.values(): - for flags, idx, offset, size, file_offset, file_size in stream_table: - table_entry = c_asdf.table_entry( - flags=flags, - idx=idx, - offset=offset, - size=size, - file_offset=file_offset, - file_size=file_size, - ) - result.append(table_entry.dumps()) + for flags, idx, offset, size, file_offset, file_size in itertools.chain(*self._table.values()): + table_entry = c_asdf.table_entry( + flags=flags, + idx=idx, + offset=offset, + size=size, + file_offset=file_offset, + file_size=file_size, + ) + result.append(table_entry.dumps()) + + index = c_asdf.table_index( + prev_table=self.offset, size=len(result) * len(c_asdf.table_entry), indexes=self.indexes() + ) + result.insert(0, index.dumps()) + self._table.clear() + self._lookup.clear() + self._entries = 0 return b"".join(result) @@ -108,6 +144,7 @@ def __init__( guid: uuid.UUID | None = None, compress: bool = False, block_crc: bool = True, + table_size: int = DEFAULT_NR_OF_ENTRIES, ): self._fh = fh self.fh = self._fh @@ -122,7 +159,11 @@ def __init__( self.block_crc = block_crc self.block_compress = False # Disabled for now - self._table = Table() 
+ if table_size < 1: + raise ValueError("Table size can't be 0 or smaller") + + self._max_entries = table_size + self._table = Table(self.fh) self._meta_buf = io.BytesIO() self._meta_tar = tarfile.open(fileobj=self._meta_buf, mode="w") # noqa: SIM115 @@ -313,6 +354,9 @@ def _write_block(self, source: BinaryIO, offset: int, size: int, idx: int = 0, b data_size = self.fh.tell() - data_offset self._table.add(idx, table_idx, (flags, idx, absolute_offset, size, block_offset, data_size), absolute_offset) + if len(self._table) >= self._max_entries: + self._write_table() + def _write_meta(self) -> None: """Write the metadata tar to the destination file-like object.""" self._meta_tar.close() @@ -323,14 +367,16 @@ def _write_meta(self) -> None: def _write_table(self) -> None: """Write the ASDF block table to the destination file-like object.""" - self._table_offset = self.fh.tell() + tmp_offset = self.fh.tell() + self._table._table_offsets.append((tmp_offset, self._table.indexes())) self.fh.write(self._table.write()) + self._table.offset = tmp_offset def _write_footer(self) -> None: """Write the ASDF footer to the destination file-like object.""" footer = c_asdf.footer( magic=FOOTER_MAGIC, - table_offset=self._table_offset, + table_offset=self._table.offset, sha256=self.fh.digest(), ) footer.write(self.fh) @@ -355,7 +401,7 @@ def __init__(self, fh: BinaryIO, recover: bool = False): self.timestamp = ts.from_unix(self.header.timestamp) self.guid = uuid.UUID(bytes_le=self.header.guid) - self.table = Table() + self.table = Table(self.fh) footer_offset = self.fh.seek(-len(c_asdf.footer), io.SEEK_END) @@ -376,11 +422,21 @@ def __init__(self, fh: BinaryIO, recover: bool = False): def _parse_block_table(self, offset: int, count: int) -> None: """Parse the block table, getting rid of overlapping blocks.""" self.fh.seek(offset) - table_data = io.BytesIO(self.fh.read(count * len(c_asdf.table_entry))) - for _ in range(count): - entry = c_asdf.table_entry(table_data) - self._table_insert(entry.idx, entry.offset, entry.size, entry.file_offset) + while True: + prev_offset = self.fh.tell() + table_index = c_asdf.table_index(self.fh) + self.table._table_offsets.append((prev_offset, table_index.indexes)) + _count = table_index.size // len(c_asdf.table_entry) + table_data = io.BytesIO(self.fh.read(table_index.size)) + + for _ in range(_count): + entry = c_asdf.table_entry(table_data) + self._table_insert(entry.idx, entry.offset, entry.size, entry.file_offset) + + if table_index.prev_table in [0, 0xFFFFFFFFFFFFFFFF]: + break + self.fh.seek(table_index.prev_table) def _recover_block_table(self) -> None: self.fh.seek(len(c_asdf.header)) diff --git a/dissect/evidence/asdf/c_asdf.py b/dissect/evidence/asdf/c_asdf.py index 8da56ce..2f43f42 100644 --- a/dissect/evidence/asdf/c_asdf.py +++ b/dissect/evidence/asdf/c_asdf.py @@ -31,6 +31,12 @@ uint64 size; // Size of block in stream }; +struct table_index { + uint64 prev_table; // Offset of the previous table FFFFFFFFF denotes last table + uint64 size; // Amount of bytes of the table + uint64 indexes[4]; // Which table entries are inside +}; + struct table_entry { BLOCK_FLAG flags; // Block flags uint8 idx; // Stream index, some reserved values have special meaning diff --git a/dissect/evidence/asdf/c_asdf.pyi b/dissect/evidence/asdf/c_asdf.pyi index 58c6f12..61c08ff 100644 --- a/dissect/evidence/asdf/c_asdf.pyi +++ b/dissect/evidence/asdf/c_asdf.pyi @@ -1,5 +1,5 @@ # Generated by cstruct-stubgen -from typing import BinaryIO, TypeAlias, overload +from typing import BinaryIO, 
Literal, TypeAlias, overload import dissect.cstruct as __cs__ @@ -53,6 +53,20 @@ class _c_asdf(__cs__.cstruct): @overload def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + class table_index(__cs__.Structure): + prev_table: _c_asdf.uint64 + size: _c_asdf.uint64 + indexes: __cs__.Array[_c_asdf.uint64] + @overload + def __init__( + self, + prev_table: _c_asdf.uint64 | None = ..., + size: _c_asdf.uint64 | None = ..., + indexes: __cs__.Array[_c_asdf.uint64] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + class table_entry(__cs__.Structure): flags: _c_asdf.BLOCK_FLAG idx: _c_asdf.uint8 diff --git a/dissect/evidence/asdf/stream.py b/dissect/evidence/asdf/stream.py index 4bd1fb2..0c0efd1 100644 --- a/dissect/evidence/asdf/stream.py +++ b/dissect/evidence/asdf/stream.py @@ -16,6 +16,7 @@ class SubStreamBase(io.RawIOBase): def __init__(self, fh: BinaryIO): self.fh = fh + self.read = fh.read def write(self, b: bytes) -> int: return self.fh.write(b) diff --git a/tests/conftest.py b/tests/conftest.py index 3cfa66f..f2f3d02 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import pytest -from dissect.evidence.asdf import AsdfWriter +from dissect.evidence.asdf.asdf import DEFAULT_NR_OF_ENTRIES, AsdfWriter from tests._utils import absolute_path if TYPE_CHECKING: @@ -80,11 +80,16 @@ def ewf_data() -> Iterator[BinaryIO]: yield from open_data("_data/ewf/ewf.E01") -@pytest.fixture -def asdf_writer() -> AsdfWriter: +@pytest.fixture( + params=[ + pytest.param(1, id="table_size=1"), + pytest.param(DEFAULT_NR_OF_ENTRIES, id="table_size=DEFAULT"), + ] +) +def asdf_writer(request: pytest.FixtureRequest) -> AsdfWriter: def noop() -> None: pass fh = BytesIO() fh.close = noop # Prevent clearing the buffer, we need it - return AsdfWriter(fh) + return AsdfWriter(fh, table_size=request.param) From 509db93579ad6ed083e77e0ab95dd64dbf3fb84d Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Wed, 21 Jan 2026 17:20:48 +0100 Subject: [PATCH 03/13] Change tuples to use c_asdf.table_entry --- dissect/evidence/asdf/asdf.py | 99 +++++++++++++++++++---------------- tests/test_asdf.py | 12 ++--- 2 files changed, 60 insertions(+), 51 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 86ac845..586e3d9 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -24,7 +24,7 @@ ) if TYPE_CHECKING: - from collections.abc import Callable, Iterator, ValuesView + from collections.abc import Iterator, ValuesView SnapshotTableEntry = tuple[int, int, int, int] @@ -48,7 +48,7 @@ class Table: def __init__(self, fh: BinaryIO) -> None: self._fh = fh - self._table: dict[int, list[tuple[int, ...]]] = defaultdict(list) + self._table: dict[int, list[c_asdf.table_entry]] = defaultdict(list) self._table_offsets: list[tuple[int, list[int]]] = [] self._lookup: dict[int, list[int]] = defaultdict(list) self._entries = 0 @@ -63,12 +63,12 @@ def __len__(self): def __contains__(self, obj: Any) -> bool: return obj in self._table - def get(self, index: int) -> tuple[list, list]: + def get(self, index: int) -> tuple[list[c_asdf.table_entry], list[int]]: return self._table[index], self._lookup[index] - def add(self, index: int, table_idx: int, entry: tuple, offset: int) -> None: - self._table[index].insert(table_idx, entry) - self._lookup[index].insert(table_idx, offset) + def add(self, table_idx: int, entry: c_asdf.table_entry) -> None: + self._table[entry.idx].insert(table_idx, entry) + 
self._lookup[entry.idx].insert(table_idx, entry.offset) self._entries += 1 def indexes(self) -> list[int]: @@ -88,28 +88,18 @@ def lookup(self, idx: int) -> list[int]: table = c_asdf.table_index(self._fh) count = table.size // len(c_asdf.table_entry) entries = c_asdf.table_entry[count](self._fh.read(table.size)) - look.extend(filter(lambda x: x.idx == idx, entries)) + look.extend(entry.offset for entry in filter(lambda x: x.idx == idx, entries)) self._fh.seek(prev_offset, 0) if idx in self._table: - look.extend(x[2] for x in self._table[idx]) + look.extend(entry.offset for entry in self._table[idx]) return look - def values(self) -> ValuesView[list[tuple[int, ...]]]: + def values(self) -> ValuesView[list[c_asdf.table_entry]]: return self._table.values() def write(self) -> bytes: - result = [] - for flags, idx, offset, size, file_offset, file_size in itertools.chain(*self._table.values()): - table_entry = c_asdf.table_entry( - flags=flags, - idx=idx, - offset=offset, - size=size, - file_offset=file_offset, - file_size=file_size, - ) - result.append(table_entry.dumps()) + result = [entry.dumps() for entry in itertools.chain(*self._table.values())] index = c_asdf.table_index( prev_table=self.offset, size=len(result) * len(c_asdf.table_entry), indexes=self.indexes() @@ -316,9 +306,7 @@ def _write_block(self, source: BinaryIO, offset: int, size: int, idx: int = 0, b entry_table, lookup_table = self._table.get(idx) - table_idx, absolute_offset, size = _table_fit( - absolute_offset, size, entry_table, lookup_table, lambda e: (e[2], e[3]) - ) + table_idx, absolute_offset, size = _table_fit(absolute_offset, size, entry_table, lookup_table) if table_idx is None: return @@ -352,7 +340,17 @@ def _write_block(self, source: BinaryIO, offset: int, size: int, idx: int = 0, b outfh.finalize() data_size = self.fh.tell() - data_offset - self._table.add(idx, table_idx, (flags, idx, absolute_offset, size, block_offset, data_size), absolute_offset) + self._table.add( + table_idx, + c_asdf.table_entry( + idx=idx, + offset=absolute_offset, + flags=flags, + size=size, + file_offset=block_offset, + file_size=data_size, + ), + ) if len(self._table) >= self._max_entries: self._write_table() @@ -448,15 +446,22 @@ def _table_insert(self, idx: int, offset: int, size: int, file_offset: int) -> N entry_table, lookup_table = self.table.get(idx) - table_idx, entry_offset, entry_size = _table_fit( - offset, size, entry_table, lookup_table, lambda e: (e[0], e[1]) - ) + table_idx, entry_offset, entry_size = _table_fit(offset, size, entry_table, lookup_table) if table_idx is None: return entry_data_offset += entry_offset - offset - self.table.add(idx, table_idx, (entry_offset, entry_size, file_offset, entry_data_offset), entry_offset) + self.table.add( + table_idx, + c_asdf.table_entry( + offset=entry_offset, + idx=idx, + size=entry_size, + file_offset=file_offset, + file_size=entry_data_offset, + ), + ) def contains(self, idx: int) -> bool: """Check whether this file contains the given stream index. 
@@ -534,7 +539,7 @@ def __init__(self, asdf: AsdfSnapshot, idx: int): # We don't actually know the size of the source disk # Doesn't really matter though, just take the last run offset + size - size = self.table[-1][0] + self.table[-1][1] + size = self.table[-1].offset + self.table[-1].size super().__init__(size) def _read(self, offset: int, length: int) -> bytes: @@ -543,15 +548,13 @@ def _read(self, offset: int, length: int) -> bytes: size = self.size run_idx = bisect_right(self._table_lookup, offset) - 1 runlist_len = len(self.table) - while length > 0 and run_idx < runlist_len: - run_start, run_size, run_file_offset, run_data_offset = self.table[run_idx] - run_end = run_start + run_size + entry = self.table[run_idx] + # Use file_size of the run_data_offset: FIXME: + run_data_offset = entry.file_size + run_end = entry.offset + entry.size - if run_idx + 1 < runlist_len: - next_run_start, _, _, _ = self.table[run_idx + 1] - else: - next_run_start = None + next_run_start = self.table[run_idx + 1].offset if (run_idx + 1 < runlist_len) else None if run_idx < 0: # Missing first block @@ -580,20 +583,20 @@ def _read(self, offset: int, length: int) -> bytes: # Proceed to next run run_idx += 1 - elif offset < run_start: + elif offset < entry.offset: # Previous run consumed, and next run is far away - sparse_remaining = run_start - offset + sparse_remaining = entry.offset - offset read_count = min(size - offset, min(sparse_remaining, length)) result.append(SPARSE_BYTES * (read_count // len(SPARSE_BYTES))) # Don't proceed to next run, next loop iteration we'll be within the current run else: # We're in a run with data - run_pos = offset - run_start - run_remaining = run_size - run_pos + run_pos = offset - entry.offset + run_remaining = entry.size - run_pos read_count = min(size - offset, min(run_remaining, length)) - self.fh.seek(run_file_offset) + self.fh.seek(entry.file_offset) if self.fh.read(4) != BLOCK_MAGIC: raise InvalidBlock("invalid block magic") @@ -656,7 +659,10 @@ def scrape_blocks(fh: BinaryIO, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> It def _table_fit( - entry_offset: int, entry_size: int, entry_table: list, lookup_table: list, getentry: Callable + entry_offset: int, + entry_size: int, + entry_table: list[c_asdf.table_entry], + lookup_table: list[int], ) -> tuple[int, int, int]: """Calculate where to insert an entry with the given offset and size into the entry table. 
@@ -680,10 +686,12 @@ def _table_fit( table_idx = bisect_right(lookup_table, entry_offset) if table_idx > 0: - prev_start, prev_size = getentry(entry_table[table_idx - 1]) + _entry = entry_table[table_idx - 1] + prev_start, prev_size = _entry.offset, _entry.size prev_end = prev_start + prev_size if table_idx < len(lookup_table): - next_start, next_size = getentry(entry_table[table_idx]) + _entry = entry_table[table_idx] + next_start, next_size = _entry.offset, _entry.size next_end = next_start + next_size if prev_end and prev_end >= entry_end: @@ -700,7 +708,8 @@ def _table_fit( entry_table.pop(table_idx) if table_idx < len(lookup_table): - next_start, next_size = getentry(entry_table[table_idx]) + _entry = entry_table[table_idx] + next_start, next_size = _entry.offset, _entry.size next_end = next_start + next_size else: next_start, next_end = None, None diff --git a/tests/test_asdf.py b/tests/test_asdf.py index 10f89bc..46165e3 100644 --- a/tests/test_asdf.py +++ b/tests/test_asdf.py @@ -28,7 +28,7 @@ def test_asdf(asdf_writer: AsdfWriter) -> None: reader = AsdfSnapshot(asdf_writer._fh) stream_0 = reader.open(0) - assert [(run_start, run_size) for run_start, run_size, _, _ in stream_0.table] == [ + assert [(entry.offset, entry.size) for entry in stream_0.table] == [ (0, 0x1000), (0x4000, 0x1000), (0x8000, 0x1000), @@ -79,7 +79,7 @@ def test_asdf_overlap(asdf_writer: AsdfWriter) -> None: reader = AsdfSnapshot(asdf_writer._fh) stream = reader.open(0) - assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ + assert [(entry.offset, entry.size) for entry in stream.table] == [ (0, 100), (100, 50), (150, 50), @@ -103,7 +103,7 @@ def test_asdf_overlap_all(asdf_writer: AsdfWriter) -> None: reader = AsdfSnapshot(asdf_writer._fh) stream = reader.open(0) - assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ + assert [(entry.offset, entry.size) for entry in stream.table] == [ (0, 100), (100, 300), ] @@ -124,7 +124,7 @@ def test_asdf_overlap_contiguous(asdf_writer: AsdfWriter) -> None: reader = AsdfSnapshot(asdf_writer._fh) stream = reader.open(0) - assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ + assert [(entry.offset, entry.size) for entry in stream.table] == [ (0, 100), (100, 100), ] @@ -143,7 +143,7 @@ def test_asdf_overlap_seek(asdf_writer: AsdfWriter) -> None: reader = AsdfSnapshot(asdf_writer._fh) stream = reader.open(0) - assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ + assert [(entry.offset, entry.size) for entry in stream.table] == [ (0, 100), (100, 100), (200, 100), @@ -237,7 +237,7 @@ def test_asdf_scrape(asdf_writer: AsdfWriter) -> None: reader = AsdfSnapshot(asdf_writer._fh, recover=True) stream = reader.open(0) - assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ + assert [(entry.offset, entry.size) for entry in stream.table] == [ (0, 0x1000), (0x4000, 0x1000), (0x8000, 0x1000), From 1636503180f1d563fdcab94d3de87938fc5d30f3 Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Wed, 28 Jan 2026 11:32:43 +0100 Subject: [PATCH 04/13] Make Table more generic to hold a ReadEntry This is for AsdfSnapshot that includes the offset inside the block --- dissect/evidence/asdf/asdf.py | 46 +++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 586e3d9..536980e 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ 
-10,7 +10,8 @@ import uuid from bisect import bisect_right from collections import defaultdict -from typing import TYPE_CHECKING, Any, BinaryIO +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, BinaryIO, Generic, TypeVar from dissect.util import ts from dissect.util.stream import AlignedStream, RangeStream @@ -45,11 +46,26 @@ DEFAULT_NR_OF_ENTRIES = 4 * 1024 * 1024 // len(c_asdf.table_entry) -class Table: +@dataclass +class ReadEntry: + idx: int + offset: int + size: int + file_offset: int + data_offset: int + + def dumps(self) -> bytes: + return b"" + + +T = TypeVar("T", ReadEntry, c_asdf.table_entry) + + +class Table(Generic[T]): def __init__(self, fh: BinaryIO) -> None: self._fh = fh - self._table: dict[int, list[c_asdf.table_entry]] = defaultdict(list) - self._table_offsets: list[tuple[int, list[int]]] = [] + self._table_offsets: list[tuple[int, c_asdf.table_index]] = [] + self._table: dict[int, list[T]] = defaultdict(list) self._lookup: dict[int, list[int]] = defaultdict(list) self._entries = 0 self.offset = 0 @@ -63,10 +79,10 @@ def __len__(self): def __contains__(self, obj: Any) -> bool: return obj in self._table - def get(self, index: int) -> tuple[list[c_asdf.table_entry], list[int]]: + def get(self, index: int) -> tuple[list[T], list[int]]: return self._table[index], self._lookup[index] - def add(self, table_idx: int, entry: c_asdf.table_entry) -> None: + def add(self, table_idx: int, entry: T) -> None: self._table[entry.idx].insert(table_idx, entry) self._lookup[entry.idx].insert(table_idx, entry.offset) self._entries += 1 @@ -95,7 +111,7 @@ def lookup(self, idx: int) -> list[int]: return look - def values(self) -> ValuesView[list[c_asdf.table_entry]]: + def values(self) -> ValuesView[list[T]]: return self._table.values() def write(self) -> bytes: @@ -153,7 +169,7 @@ def __init__( raise ValueError("Table size can't be 0 or smaller") self._max_entries = table_size - self._table = Table(self.fh) + self._table = Table[c_asdf.table_entry](self.fh) self._meta_buf = io.BytesIO() self._meta_tar = tarfile.open(fileobj=self._meta_buf, mode="w") # noqa: SIM115 @@ -399,7 +415,7 @@ def __init__(self, fh: BinaryIO, recover: bool = False): self.timestamp = ts.from_unix(self.header.timestamp) self.guid = uuid.UUID(bytes_le=self.header.guid) - self.table = Table(self.fh) + self.table = Table[ReadEntry](self.fh) footer_offset = self.fh.seek(-len(c_asdf.footer), io.SEEK_END) @@ -452,14 +468,15 @@ def _table_insert(self, idx: int, offset: int, size: int, file_offset: int) -> N return entry_data_offset += entry_offset - offset + self.table.add( table_idx, - c_asdf.table_entry( - offset=entry_offset, + ReadEntry( idx=idx, + offset=entry_offset, size=entry_size, file_offset=file_offset, - file_size=entry_data_offset, + data_offset=entry_data_offset, ), ) @@ -550,8 +567,7 @@ def _read(self, offset: int, length: int) -> bytes: runlist_len = len(self.table) while length > 0 and run_idx < runlist_len: entry = self.table[run_idx] - # Use file_size of the run_data_offset: FIXME: - run_data_offset = entry.file_size + run_data_offset = entry.data_offset run_end = entry.offset + entry.size next_run_start = self.table[run_idx + 1].offset if (run_idx + 1 < runlist_len) else None @@ -661,7 +677,7 @@ def scrape_blocks(fh: BinaryIO, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> It def _table_fit( entry_offset: int, entry_size: int, - entry_table: list[c_asdf.table_entry], + entry_table: list[T], lookup_table: list[int], ) -> tuple[int, int, int]: """Calculate where to insert an entry with 
the given offset and size into the entry table. From 6285c1a2f3a940436aada1533a106c401301169e Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Wed, 28 Jan 2026 11:34:56 +0100 Subject: [PATCH 05/13] Move all write logic to Table.write So that the table is required for return the table data and cleanup of itself --- dissect/evidence/asdf/asdf.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 536980e..41cb8b7 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -32,6 +32,7 @@ VERSION = 1 DEFAULT_BLOCK_SIZE = 4096 MAX_BLOCK_TABLE_SIZE = 2**32 +OFFSET_MASK = (1 << 64) - 1 MAX_IDX = 253 IDX_MEMORY = 254 @@ -67,8 +68,9 @@ def __init__(self, fh: BinaryIO) -> None: self._table_offsets: list[tuple[int, c_asdf.table_index]] = [] self._table: dict[int, list[T]] = defaultdict(list) self._lookup: dict[int, list[int]] = defaultdict(list) + self._entries = 0 - self.offset = 0 + self.prev_table_offset = OFFSET_MASK def __bool__(self): return bool(self._table) @@ -89,7 +91,7 @@ def add(self, table_idx: int, entry: T) -> None: def indexes(self) -> list[int]: indexes = sum(1 << key for key in self._table) - return [(indexes >> (x * 64)) & 0xFFFF_FFFF_FFFF_FFFF for x in range(256 // 64)] + return [(indexes >> (x * 64)) & OFFSET_MASK for x in range(256 // 64)] def lookup(self, idx: int) -> list[int]: prev_offset = self._fh.tell() @@ -114,13 +116,20 @@ def lookup(self, idx: int) -> list[int]: def values(self) -> ValuesView[list[T]]: return self._table.values() - def write(self) -> bytes: + def write(self, table_offset: int = -1) -> bytes: + """Creates a table to be writen to the fileheader""" + indexes = self.indexes() result = [entry.dumps() for entry in itertools.chain(*self._table.values())] index = c_asdf.table_index( - prev_table=self.offset, size=len(result) * len(c_asdf.table_entry), indexes=self.indexes() + prev_table=self.prev_table_offset, size=len(result) * len(c_asdf.table_entry), indexes=indexes ) result.insert(0, index.dumps()) + + if table_offset != -1: + self.prev_table_offset = table_offset + self._table_offsets.append((table_offset, index)) + self._table.clear() self._lookup.clear() self._entries = 0 @@ -381,16 +390,13 @@ def _write_meta(self) -> None: def _write_table(self) -> None: """Write the ASDF block table to the destination file-like object.""" - tmp_offset = self.fh.tell() - self._table._table_offsets.append((tmp_offset, self._table.indexes())) - self.fh.write(self._table.write()) - self._table.offset = tmp_offset + self.fh.write(self._table.write(self.fh.tell())) def _write_footer(self) -> None: """Write the ASDF footer to the destination file-like object.""" footer = c_asdf.footer( magic=FOOTER_MAGIC, - table_offset=self._table.offset, + table_offset=self._table.prev_table_offset, sha256=self.fh.digest(), ) footer.write(self.fh) From 2aa0db90fc4dbf5ed38474e0cb76c79dabe5b858 Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Wed, 28 Jan 2026 11:36:35 +0100 Subject: [PATCH 06/13] Change Table.lookup to use _table_fit to find all related entries in previous tables --- dissect/evidence/asdf/asdf.py | 85 ++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 41cb8b7..ad63b61 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -94,24 +94,52 @@ def indexes(self) -> list[int]: return [(indexes >> (x * 64)) & 
OFFSET_MASK for x in range(256 // 64)] def lookup(self, idx: int) -> list[int]: + """Finds the entries specified by the flushed tables""" prev_offset = self._fh.tell() - index_index = (idx // 64) - (1 if idx != 0 else 0) - look = [] - - # Lookup offsets - for offset, indexes in sorted(self._table_offsets, key=lambda x: x[0]): - index = indexes[index_index] - if (1 << (idx % 64)) & index: - self._fh.seek(offset, 0) - table = c_asdf.table_index(self._fh) - count = table.size // len(c_asdf.table_entry) - entries = c_asdf.table_entry[count](self._fh.read(table.size)) - look.extend(entry.offset for entry in filter(lambda x: x.idx == idx, entries)) - self._fh.seek(prev_offset, 0) - if idx in self._table: - look.extend(entry.offset for entry in self._table[idx]) - - return look + table_index = (idx // 64) - (1 if idx != 0 else 0) + lookup_value = 1 << (idx % 64) + + lookup = [] + entries = [] + + # Go through all the tables + for offset, table in self._table_offsets: + index = table.indexes[table_index] + if not (lookup_value & index): + continue + self._fh.seek(offset + len(c_asdf.table_index), io.SEEK_SET) + + count = table.size // len(c_asdf.table_entry) + for entry in c_asdf.table_entry[count](self._fh.read(table.size)): + if idx != entry.idx: + continue + tab_idx, offset, size = _table_fit(entry.offset, entry.size, entries, lookup) + if tab_idx is None: + continue + entry.offset = offset + entry.size = size + entries.insert(tab_idx, entry) + lookup.insert(tab_idx, offset) + + self._fh.seek(prev_offset, io.SEEK_SET) + + for entry in self._table.get(idx, []): + tab_idx, offset, size = _table_fit(entry.offset, entry.size, entries, lookup) + if tab_idx is None: + continue + _entry = c_asdf.table_entry( + idx=idx, + flags=entry.flags, + offset=offset, + size=size, + file_size=entry.file_size, + file_offset=entry.file_offset, + ) + + entries.insert(tab_idx, _entry) + lookup.insert(tab_idx, offset) + + return lookup def values(self) -> ValuesView[list[T]]: return self._table.values() @@ -442,22 +470,29 @@ def __init__(self, fh: BinaryIO, recover: bool = False): def _parse_block_table(self, offset: int, count: int) -> None: """Parse the block table, getting rid of overlapping blocks.""" self.fh.seek(offset) + table_offsets = [] + # Read all the tables and their offsets in reverse order while True: - prev_offset = self.fh.tell() + table_offset = self.fh.tell() table_index = c_asdf.table_index(self.fh) - self.table._table_offsets.append((prev_offset, table_index.indexes)) + table_offsets.append((table_offset, table_index)) + if table_index.prev_table == OFFSET_MASK: + break + self.fh.seek(table_index.prev_table) + + table_offsets.reverse() + self.table._table_offsets = table_offsets + + # Read all the table entries and add them to the table + for offset, table_index in table_offsets: + self.fh.seek(offset + len(c_asdf.table_index)) _count = table_index.size // len(c_asdf.table_entry) table_data = io.BytesIO(self.fh.read(table_index.size)) - for _ in range(_count): - entry = c_asdf.table_entry(table_data) + for entry in c_asdf.table_entry[_count](table_data): self._table_insert(entry.idx, entry.offset, entry.size, entry.file_offset) - if table_index.prev_table in [0, 0xFFFFFFFFFFFFFFFF]: - break - self.fh.seek(table_index.prev_table) - def _recover_block_table(self) -> None: self.fh.seek(len(c_asdf.header)) for block, file_offset in scrape_blocks(self.fh): From 08f476101106c97a311d766ed469164b03c76498 Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Wed, 28 Jan 2026 11:38:02 +0100 Subject: [PATCH 07/13] 
Adjust typings to be more specific --- dissect/evidence/asdf/asdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index ad63b61..3ee70f6 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -670,7 +670,7 @@ def _read(self, offset: int, length: int) -> bytes: return b"".join(result) -def scrape_blocks(fh: BinaryIO, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> Iterator[c_asdf.block, int]: +def scrape_blocks(fh: BinaryIO, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> Iterator[tuple[c_asdf.block, int]]: """Scrape for block headers in ``fh`` and yield parsed block headers and their offset. Args: @@ -720,7 +720,7 @@ def _table_fit( entry_size: int, entry_table: list[T], lookup_table: list[int], -) -> tuple[int, int, int]: +) -> tuple[int | None, int | None, int | None]: """Calculate where to insert an entry with the given offset and size into the entry table. Moves or shrinks the entry to prevent block overlap, and remove any overlapping blocks. From ecaba55b78c995ce5f523ebeb4fd9d1185f2f2bd Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Thu, 19 Feb 2026 15:56:48 +0100 Subject: [PATCH 08/13] Update documentation and add `fh` to lookup --- dissect/evidence/asdf/asdf.py | 84 +++++++++++++++++++++++++-------- dissect/evidence/asdf/c_asdf.py | 5 +- tests/test_asdf.py | 18 +++---- 3 files changed, 77 insertions(+), 30 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 3ee70f6..2d423af 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -47,8 +47,14 @@ DEFAULT_NR_OF_ENTRIES = 4 * 1024 * 1024 // len(c_asdf.table_entry) -@dataclass +@dataclass(slots=True) class ReadEntry: + """An entry representing the data to read in :class:`ASDFSnapshot` + + Very similar to ``c_asdf.table_entry`` however uses the offset of the data inside + the stream instead of ``file_size``. + """ + idx: int offset: int size: int @@ -63,11 +69,23 @@ def dumps(self) -> bytes: class Table(Generic[T]): - def __init__(self, fh: BinaryIO) -> None: - self._fh = fh - self._table_offsets: list[tuple[int, c_asdf.table_index]] = [] - self._table: dict[int, list[T]] = defaultdict(list) - self._lookup: dict[int, list[int]] = defaultdict(list) + """A single point for the table entries to get collected for reading and writing.""" + + _table: dict[int, list[T]] + """Keeps an order for the table entries for a specific stream""" + _lookup: dict[int, list[int]] + """Keeps an order for all the stream offsets for a specific stream""" + _table_offsets: list[tuple[int, c_asdf.table_index]] + """Keeps account of any previously flushed table, containing both its start offset and table_index.""" + _entries: int + """The current number of entries inside of the table""" + last_table_offset: int + """Offset of the previously flushed table""" + + def __init__(self) -> None: + self._table_offsets = [] + self._table = defaultdict(list) + self._lookup = defaultdict(list) self._entries = 0 self.prev_table_offset = OFFSET_MASK @@ -90,43 +108,72 @@ def add(self, table_idx: int, entry: T) -> None: self._entries += 1 def indexes(self) -> list[int]: + """Returns which stream indexes are inside this table. + + Creates a 256-bit string that represends the stream indexes currently inside the table. + The bit string gets divided into 4 64-bit numbers. 
+ """ indexes = sum(1 << key for key in self._table) return [(indexes >> (x * 64)) & OFFSET_MASK for x in range(256 // 64)] - def lookup(self, idx: int) -> list[int]: - """Finds the entries specified by the flushed tables""" - prev_offset = self._fh.tell() - table_index = (idx // 64) - (1 if idx != 0 else 0) + def lookup(self, idx: int, fh: BinaryIO) -> list[int]: + """Finds entries belonging to a stream index inside of any flushed table. + + In the worst case scenario, where a table gets flushed every time an entry gets added to it, + the lookup function finds all the offsets and returns it in the correct order. + + TODO: Only returns the offsets for now, can be rewritten and reused for :class:`.ASDFSnapshot`. + + Args: + idx: The stream idx which we want to lookup + fh: The filehandle of the asdf file. + + Returns: + a list of offsets of a specific stream. + """ + prev_offset = fh.tell() + # Which parts of table.indexes to look into for a stream + index_idx = (idx // 64) - (1 if idx != 0 else 0) lookup_value = 1 << (idx % 64) lookup = [] entries = [] - # Go through all the tables + # Go through all previously flushed tables for offset, table in self._table_offsets: - index = table.indexes[table_index] + # Determine whether the stream_idx is inside this flushed table + index = table.indexes[index_idx] if not (lookup_value & index): + # This table does not contain the index, so we continue to the next one continue - self._fh.seek(offset + len(c_asdf.table_index), io.SEEK_SET) + + fh.seek(offset + len(c_asdf.table_index), io.SEEK_SET) count = table.size // len(c_asdf.table_entry) - for entry in c_asdf.table_entry[count](self._fh.read(table.size)): + for entry in c_asdf.table_entry[count](fh.read(table.size)): + # Determine whether this entry can be skipped if idx != entry.idx: continue + tab_idx, offset, size = _table_fit(entry.offset, entry.size, entries, lookup) if tab_idx is None: + # The block can be skipped, continuing continue + entry.offset = offset entry.size = size entries.insert(tab_idx, entry) lookup.insert(tab_idx, offset) - self._fh.seek(prev_offset, io.SEEK_SET) + fh.seek(prev_offset, io.SEEK_SET) + # Fit all the entries inside the current table for entry in self._table.get(idx, []): tab_idx, offset, size = _table_fit(entry.offset, entry.size, entries, lookup) if tab_idx is None: continue + + # Copy the entry, so we don't change the data that's currently inside the table _entry = c_asdf.table_entry( idx=idx, flags=entry.flags, @@ -145,7 +192,7 @@ def values(self) -> ValuesView[list[T]]: return self._table.values() def write(self, table_offset: int = -1) -> bytes: - """Creates a table to be writen to the fileheader""" + """Writes a table directly to the fileheader""" indexes = self.indexes() result = [entry.dumps() for entry in itertools.chain(*self._table.values())] @@ -206,7 +253,7 @@ def __init__( raise ValueError("Table size can't be 0 or smaller") self._max_entries = table_size - self._table = Table[c_asdf.table_entry](self.fh) + self._table = Table[c_asdf.table_entry]() self._meta_buf = io.BytesIO() self._meta_tar = tarfile.open(fileobj=self._meta_buf, mode="w") # noqa: SIM115 @@ -449,7 +496,7 @@ def __init__(self, fh: BinaryIO, recover: bool = False): self.timestamp = ts.from_unix(self.header.timestamp) self.guid = uuid.UUID(bytes_le=self.header.guid) - self.table = Table[ReadEntry](self.fh) + self.table = Table[ReadEntry]() footer_offset = self.fh.seek(-len(c_asdf.footer), io.SEEK_END) @@ -730,7 +777,6 @@ def _table_fit( entry_size: The entry size to calculate the 
insert for. entry_table: The entry table to insert into or remove entries from. lookup_table: The lookup table for the entry_table. - getentry: A callable to return the ``(offset, size)`` tuple from an entry. Returns: A tuple of the table index to insert into, an adjusted entry offset and an adjusted entry size. diff --git a/dissect/evidence/asdf/c_asdf.py b/dissect/evidence/asdf/c_asdf.py index 2f43f42..20bf864 100644 --- a/dissect/evidence/asdf/c_asdf.py +++ b/dissect/evidence/asdf/c_asdf.py @@ -31,10 +31,11 @@ uint64 size; // Size of block in stream }; +// A structure to keep track of previously flushed tables struct table_index { - uint64 prev_table; // Offset of the previous table FFFFFFFFF denotes last table + uint64 prev_table; // Offset of the previous table 0xFFFFFFFF_FFFFFFF denotes last table uint64 size; // Amount of bytes of the table - uint64 indexes[4]; // Which table entries are inside + uint64 indexes[4]; // Which stream indexes are available inside the table }; struct table_entry { diff --git a/tests/test_asdf.py b/tests/test_asdf.py index 46165e3..c3220b3 100644 --- a/tests/test_asdf.py +++ b/tests/test_asdf.py @@ -62,16 +62,16 @@ def test_asdf(asdf_writer: AsdfWriter) -> None: def test_asdf_overlap(asdf_writer: AsdfWriter) -> None: asdf_writer.add_bytes(b"\x01" * 100, base=0) asdf_writer.add_bytes(b"\x02" * 100, base=200) - assert asdf_writer._table.lookup(0) == [0, 200] + assert asdf_writer._table.lookup(0, asdf_writer._fh) == [0, 200] asdf_writer.add_bytes(b"\x03" * 100, base=50) - assert asdf_writer._table.lookup(0) == [0, 100, 200] + assert asdf_writer._table.lookup(0, asdf_writer._fh) == [0, 100, 200] asdf_writer.add_bytes(b"\x04" * 150, base=100) - assert asdf_writer._table.lookup(0) == [0, 100, 150, 200] + assert asdf_writer._table.lookup(0, asdf_writer._fh) == [0, 100, 150, 200] asdf_writer.add_bytes(b"\x05" * 50, base=25) - assert asdf_writer._table.lookup(0) == [0, 100, 150, 200] + assert asdf_writer._table.lookup(0, asdf_writer._fh) == [0, 100, 150, 200] asdf_writer.close() asdf_writer._fh.seek(0) @@ -93,9 +93,9 @@ def test_asdf_overlap_all(asdf_writer: AsdfWriter) -> None: asdf_writer.add_bytes(b"\x02" * 100, base=200) asdf_writer.add_bytes(b"\x03" * 100, base=50) asdf_writer.add_bytes(b"\x04" * 150, base=100) - assert asdf_writer._table.lookup(0) == [0, 100, 150, 200] + assert asdf_writer._table.lookup(0, asdf_writer._fh) == [0, 100, 150, 200] asdf_writer.add_bytes(b"\x06" * 400, base=0) - assert asdf_writer._table.lookup(0) == [0, 100] + assert asdf_writer._table.lookup(0, asdf_writer._fh) == [0, 100] asdf_writer.close() asdf_writer._fh.seek(0) @@ -113,10 +113,10 @@ def test_asdf_overlap_all(asdf_writer: AsdfWriter) -> None: def test_asdf_overlap_contiguous(asdf_writer: AsdfWriter) -> None: asdf_writer.add_bytes(b"\x01" * 100, base=0) asdf_writer.add_bytes(b"\x02" * 100, base=100) - assert asdf_writer._table.lookup(0) == [0, 100] + assert asdf_writer._table.lookup(0, asdf_writer._fh) == [0, 100] asdf_writer.add_bytes(b"\x03" * 75, base=50) - assert asdf_writer._table.lookup(0) == [0, 100] + assert asdf_writer._table.lookup(0, asdf_writer._fh) == [0, 100] asdf_writer.close() asdf_writer._fh.seek(0) @@ -135,7 +135,7 @@ def test_asdf_overlap_seek(asdf_writer: AsdfWriter) -> None: asdf_writer.add_bytes(b"\x00" * 100, base=0) asdf_writer.add_bytes(b"\x00" * 100, base=200) asdf_writer.add_bytes(bytes(range(200)), base=50) - assert asdf_writer._table.lookup(0) == [0, 100, 200] + assert asdf_writer._table.lookup(0, asdf_writer._fh) == [0, 100, 200] 
asdf_writer.close() asdf_writer._fh.seek(0) From 58b851253e46474caa7580818ca07ab54ecb4352 Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Thu, 19 Feb 2026 15:58:29 +0100 Subject: [PATCH 09/13] Rename DEFAULT_NR_OF_ENTRIES to DEFAULT_TABLE_SIZE --- dissect/evidence/asdf/asdf.py | 4 ++-- tests/conftest.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 2d423af..e312ebc 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -44,7 +44,7 @@ FOOTER_MAGIC = b"FT\xa5\xdf" SPARSE_BYTES = b"\xa5\xdf" -DEFAULT_NR_OF_ENTRIES = 4 * 1024 * 1024 // len(c_asdf.table_entry) +DEFAULT_TABLE_SIZE = 4 * 1024 * 1024 // len(c_asdf.table_entry) @dataclass(slots=True) @@ -234,7 +234,7 @@ def __init__( guid: uuid.UUID | None = None, compress: bool = False, block_crc: bool = True, - table_size: int = DEFAULT_NR_OF_ENTRIES, + table_size: int = DEFAULT_TABLE_SIZE, ): self._fh = fh self.fh = self._fh diff --git a/tests/conftest.py b/tests/conftest.py index f2f3d02..6398a54 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import pytest -from dissect.evidence.asdf.asdf import DEFAULT_NR_OF_ENTRIES, AsdfWriter +from dissect.evidence.asdf.asdf import DEFAULT_TABLE_SIZE, AsdfWriter from tests._utils import absolute_path if TYPE_CHECKING: @@ -83,7 +83,7 @@ def ewf_data() -> Iterator[BinaryIO]: @pytest.fixture( params=[ pytest.param(1, id="table_size=1"), - pytest.param(DEFAULT_NR_OF_ENTRIES, id="table_size=DEFAULT"), + pytest.param(DEFAULT_TABLE_SIZE, id="table_size=DEFAULT"), ] ) def asdf_writer(request: pytest.FixtureRequest) -> AsdfWriter: From 2fcd5ebe1fb83595ffcced9b43781b6ef636c209 Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Thu, 19 Feb 2026 16:01:54 +0100 Subject: [PATCH 10/13] change Table.write to write to a filehandle directly --- dissect/evidence/asdf/asdf.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index e312ebc..d197ad2 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -191,24 +191,25 @@ def lookup(self, idx: int, fh: BinaryIO) -> list[int]: def values(self) -> ValuesView[list[T]]: return self._table.values() - def write(self, table_offset: int = -1) -> bytes: + def write(self, fh: BinaryIO) -> None: """Writes a table directly to the fileheader""" indexes = self.indexes() result = [entry.dumps() for entry in itertools.chain(*self._table.values())] index = c_asdf.table_index( - prev_table=self.prev_table_offset, size=len(result) * len(c_asdf.table_entry), indexes=indexes + prev_table=self.last_table_offset, size=len(result) * len(c_asdf.table_entry), indexes=indexes ) result.insert(0, index.dumps()) - if table_offset != -1: - self.prev_table_offset = table_offset - self._table_offsets.append((table_offset, index)) + table_offset = fh.tell() + self.last_table_offset = table_offset + self._table_offsets.append((table_offset, index)) + + fh.writelines(result) self._table.clear() self._lookup.clear() self._entries = 0 - return b"".join(result) class AsdfWriter(io.RawIOBase): @@ -465,7 +466,7 @@ def _write_meta(self) -> None: def _write_table(self) -> None: """Write the ASDF block table to the destination file-like object.""" - self.fh.write(self._table.write(self.fh.tell())) + self._table.write(self.fh) def _write_footer(self) -> None: """Write the ASDF footer to the destination file-like object.""" From a453b77eada2f936d7dfb627189dd13e709b27f0 
Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Thu, 19 Feb 2026 16:02:35 +0100 Subject: [PATCH 11/13] Rename prev_table_offset to last_table_offset --- dissect/evidence/asdf/asdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index d197ad2..1c2d4ea 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -88,7 +88,7 @@ def __init__(self) -> None: self._lookup = defaultdict(list) self._entries = 0 - self.prev_table_offset = OFFSET_MASK + self.last_table_offset = OFFSET_MASK def __bool__(self): return bool(self._table) @@ -472,7 +472,7 @@ def _write_footer(self) -> None: """Write the ASDF footer to the destination file-like object.""" footer = c_asdf.footer( magic=FOOTER_MAGIC, - table_offset=self._table.prev_table_offset, + table_offset=self._table.last_table_offset, sha256=self.fh.digest(), ) footer.write(self.fh) From 6febd9f30726f9555f12dc96c3be59562f9bf278 Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Thu, 19 Feb 2026 16:03:15 +0100 Subject: [PATCH 12/13] Add Table.keys() which returns all the stream indexes in the current table --- dissect/evidence/asdf/asdf.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 1c2d4ea..2cf6a3c 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -25,7 +25,7 @@ ) if TYPE_CHECKING: - from collections.abc import Iterator, ValuesView + from collections.abc import Iterator, KeysView, ValuesView SnapshotTableEntry = tuple[int, int, int, int] @@ -191,6 +191,9 @@ def lookup(self, idx: int, fh: BinaryIO) -> list[int]: def values(self) -> ValuesView[list[T]]: return self._table.values() + def keys(self) -> KeysView[int]: + return self._table.keys() + def write(self, fh: BinaryIO) -> None: """Writes a table directly to the fileheader""" indexes = self.indexes() @@ -589,12 +592,12 @@ def open(self, idx: int) -> AsdfStream: def streams(self) -> Iterator[AsdfStream]: """Iterate over all streams in the file.""" - for i in sorted(self.table._table.keys()): + for i in sorted(self.table.keys()): yield self.open(i) def disks(self) -> Iterator[AsdfStream]: """Iterate over all non-reserved streams in the file.""" - for i in sorted(self.table._table.keys()): + for i in sorted(self.table.keys()): if i in RESERVED_IDX: continue yield self.open(i) From e7f8a2f625acc2ef7e9829cb00fea09d506caeef Mon Sep 17 00:00:00 2001 From: Miauwkeru Date: Thu, 19 Feb 2026 16:03:45 +0100 Subject: [PATCH 13/13] Rename AsdfWriter._write_table to AsdfWriter.flush --- dissect/evidence/asdf/asdf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 2cf6a3c..cd85ed1 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -380,8 +380,8 @@ def close(self) -> None: """ super().close() self._write_meta() - if self._table: - self._write_table() + if len(self._table): + self.flush() self._write_footer() self.fh.close() @@ -457,7 +457,7 @@ def _write_block(self, source: BinaryIO, offset: int, size: int, idx: int = 0, b ) if len(self._table) >= self._max_entries: - self._write_table() + self.flush() def _write_meta(self) -> None: """Write the metadata tar to the destination file-like object.""" @@ -467,7 +467,7 @@ def _write_meta(self) -> None: self._meta_buf.seek(0) self.copy_bytes(self._meta_buf, 0, size, idx=IDX_METADATA) - def _write_table(self) -> None: + 
def flush(self) -> None: """Write the ASDF block table to the destination file-like object.""" self._table.write(self.fh)
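
After this series, AsdfWriter accepts a `table_size` argument and writes the block table as a chain of `table_index` + `table_entry` records: the footer points at the last flushed table and each `table_index.prev_table` points at the one before it, with 0xFFFFFFFFFFFFFFFF marking the end of the chain. A minimal round-trip sketch of the new behaviour, modelled on the `asdf_writer` fixture in tests/conftest.py (the no-op `close` and `table_size=1` come from that fixture; the offsets and asserts below are illustrative only, not part of the patches):

    from io import BytesIO

    from dissect.evidence.asdf.asdf import AsdfSnapshot, AsdfWriter

    def noop() -> None:
        pass

    fh = BytesIO()
    fh.close = noop  # keep the buffer readable after AsdfWriter.close()

    # table_size=1 forces a table flush after every entry, so the file ends up
    # with a chain of table_index records instead of a single table at the end.
    writer = AsdfWriter(fh, table_size=1)
    writer.add_bytes(b"\x01" * 4096, base=0)
    writer.add_bytes(b"\x02" * 4096, base=8192)
    writer.close()

    fh.seek(0)
    snapshot = AsdfSnapshot(fh)
    disk = snapshot.open(0)
    assert [(entry.offset, entry.size) for entry in disk.table] == [(0, 4096), (8192, 4096)]
    assert disk.read(4096) == b"\x01" * 4096

With the default `table_size` (DEFAULT_TABLE_SIZE, roughly 4 MiB worth of entries) the table is only flushed once it fills up or on close, so most files still end with a single table; `table_size=1` is the worst case the parametrised test fixture exercises.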
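
The per-table stream-index bitmap that `Table.indexes()` produces (and that `lookup()` uses to skip flushed tables) is a 256-bit value, one bit per possible stream index, split into the four uint64 words of `table_index.indexes`. A standalone sketch of that encoding in plain Python, independent of the cstruct definitions (the helper name is illustrative):

    def stream_index_bitmap(stream_indexes: set[int]) -> list[int]:
        # Set bit `idx` for every stream index present, then split the
        # 256-bit value into four little-endian uint64 words.
        bits = sum(1 << idx for idx in stream_indexes)
        mask = (1 << 64) - 1  # OFFSET_MASK in asdf.py
        return [(bits >> (word * 64)) & mask for word in range(256 // 64)]

    # Streams 0, 1 and 64: bits 0 and 1 land in the first word, bit 64 in the second.
    assert stream_index_bitmap({0, 1, 64}) == [0b11, 1, 0, 0]

A reader checking stream `idx` then tests bit `idx % 64` of word `idx // 64`; note that `Table.lookup()` as added in patch 08 computes the word index as `(idx // 64) - (1 if idx != 0 else 0)`, which only matches the straightforward `idx // 64` used in this sketch for stream index 0.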