From fc6eaee8b42f935919e716229ac75cbb75e9911d Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:10:20 +0100 Subject: [PATCH 01/28] Improve WAL This commit: - Adds a feature to open databases from various WAL checkpoints. - Allows a database to be initialized with a Path OR BinaryIO. - Improves the reader by also parsing the WAL file - Adds a Python script to generate test data. --- dissect/database/sqlite3/sqlite3.py | 175 +++++++++++++++++++++---- tests/_data/sqlite3/generate_sqlite.py | 3 + tests/_data/sqlite3/test.sqlite | 4 +- tests/_data/sqlite3/test.sqlite-wal | 3 + tests/sqlite3/conftest.py | 15 ++- tests/sqlite3/test_sqlite3.py | 129 +++++++++++++++++- 6 files changed, 298 insertions(+), 31 deletions(-) create mode 100644 tests/_data/sqlite3/generate_sqlite.py create mode 100644 tests/_data/sqlite3/test.sqlite-wal diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 0c6a6fb..e3ec57f 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -3,8 +3,9 @@ import itertools import re import struct -from functools import lru_cache +from functools import cached_property, lru_cache from io import BytesIO +from pathlib import Path from typing import TYPE_CHECKING, Any, BinaryIO from dissect.database.sqlite3.c_sqlite3 import c_sqlite3 @@ -55,11 +56,57 @@ class SQLite3: - def __init__(self, fh: BinaryIO, wal_fh: BinaryIO | None = None): + def __init__( + self, + fh: Path | BinaryIO, + wal_fh: Path | BinaryIO | None = None, + wal_checkpoint: WALCheckpoint | int | None = None, + ): + # Use the provided file handle or try to open the file path. + if hasattr(fh, "read"): + name = getattr(fh, "name", None) + path = Path(name) if name else None + else: + path = fh + fh = path.open("rb") + self.fh = fh + self.path = path + + # Use the provided WAL file handle or try to open a sidecar WAL file. 
+ if wal_fh is not None: + if hasattr(wal_fh, "read"): + name = getattr(wal_fh, "name", None) + wal_path = Path(name) if name else None + else: + if not isinstance(wal_fh, Path): + wal_fh = Path(wal_fh) + wal_path = wal_fh + wal_fh = wal_path.open("rb") + elif self.path: + # Check for common WAL sidecars next to the DB. + for suffix in (".sqlite-wal", ".db-wal"): + candidate = self.path.with_suffix(suffix) + if candidate.exists(): + wal_path = candidate + wal_fh = wal_path.open("rb") + break + else: + wal_path = None + wal_fh = None + self.wal = WAL(wal_fh) if wal_fh else None + self.wal_path = wal_path if wal_fh else None + self.wal_checkpoint = wal_checkpoint - self.header = c_sqlite3.header(fh) + if self.wal and self.wal_checkpoint is not None and isinstance(self.wal_checkpoint, int): + checkpoint = self.wal_checkpoint + checkpoints = self.wal.checkpoints + if checkpoint < 0 or checkpoint >= len(checkpoints): + raise IndexError("WAL checkpoint index out of range") + self.wal_checkpoint = checkpoints[checkpoint] + + self.header = c_sqlite3.header(self.fh) if self.header.magic != SQLITE3_HEADER_MAGIC: raise InvalidDatabase("Invalid header magic") @@ -77,6 +124,13 @@ def __init__(self, fh: BinaryIO, wal_fh: BinaryIO | None = None): def open_wal(self, fh: BinaryIO) -> None: self.wal = WAL(fh) + def checkpoints(self) -> Iterator[SQLite3]: + if not self.wal: + return + + for checkpoint in self.wal.commits: + yield SQLite3(self.fh, self.wal.fh, checkpoint) + def table(self, name: str) -> Table | None: name = name.lower() for table in self.tables(): @@ -108,10 +162,48 @@ def indices(self) -> Iterator[Index]: yield Index(self, *cell.values) def raw_page(self, num: int) -> bytes: + """Retrieve the raw frame data for the given page number. + + Reads the page from a checkpoint if provided. + + Will first check if the WAL contains a more recent version of the page, + otherwise it will read the page from the database file. 
+ + References: + - https://sqlite.org/fileformat2.html#reader_algorithm + """ # Only throw an out of bounds exception if the header contains a page_count. # Some old versions of SQLite3 do not set/update the page_count correctly. if (num < 1 or num > self.header.page_count) and self.header.page_count > 0: raise InvalidPageNumber("Page number exceeds boundaries") + + # If a specific WAL checkpoint was provided, use it instead of the on-disk page. + if self.wal and self.wal_checkpoint is not None: + if num == 1: + self.fh.seek(len(c_sqlite3.header)) + elif num in self.wal_checkpoint: + frame = self.wal_checkpoint.get(num) + return frame.data + else: + # If the page is not present in the checkpoint, skip. + pass + + # Check if the latest valid instance of the page is committed (either the frame itself + # is the commit frame or it is included in a commit's frames). If so, return that frame's data. + if self.wal: + frames = list(self.wal.frames()) + last_valid_frame = None + for f in frames: + if f.valid and f.page_number == num: + last_valid_frame = f + + if last_valid_frame is not None: + for commit in self.wal.commits: + # commit.frames contains all frames that were committed together; + # if our last valid frame is in one of those, it's part of that commit. + if last_valid_frame in commit.frames: + return last_valid_frame.data + if num == 1: # Page 1 is root self.fh.seek(len(c_sqlite3.header)) else: @@ -492,21 +584,55 @@ def frames(self) -> Iterator[WALFrame]: except EOFError: # noqa: PERF203 break - def checkpoints(self) -> list[WALCheckpoint]: - if not self._checkpoints: - checkpoints = [] - frames = [] + @cached_property + def commits(self) -> list[WALCommit]: + """Collects all commits in the WAL file. - for frame in self.frames(): - frames.append(frame) + For commit records ``header.page_count`` specifies the size of the + database file in pages after the commit. For all other records it is 0. 
- if frame.page_count != 0: - checkpoints.append(WALCheckpoint(self, frames)) - frames = [] + References: + - https://sqlite.org/fileformat2.html#wal_file_format + """ + commits = [] + frames = [] + + for frame in self.frames(): + frames.append(frame) + + # A commit record has a page_count header greater than zero + if frame.page_count > 0: + commits.append(WALCommit(self, frames)) + frames = [] + + return commits - self._checkpoints = checkpoints + @cached_property + def checkpoints(self) -> list[WALCommit]: + """Return deduplicated WAL commits (checkpoints), newest first. + + Deduplicate commits by the salt1 value of their first frame. Later + commits overwrite earlier ones so the returned list contains the most + recent commit for each salt1, sorted descending. + + References: + - https://sqlite.org/fileformat2.html#wal_file_format + - https://sqlite.org/wal.html#checkpointing + """ + checkpoints_map: dict[int, WALCommit] = {} + for commit in self.commits: + if not commit.frames: + continue + salt1 = commit.frames[0].header.salt1 + # Keep the most recent commit for each salt1 (later commits overwrite). + checkpoints_map[salt1] = commit - return self._checkpoints + return sorted( + checkpoints_map.values(), + key=lambda c: c.frames[0].header.salt1, + #TODO Should this be reverse? 
Reverse means: cp0 is the latest, cpN the oldest + reverse=True, + ) class WALFrame: @@ -546,11 +672,11 @@ def page_count(self) -> int: return self.header.page_count -class WALCheckpoint: +# Collection of frames that were committed together +class _WALFramesCollection: def __init__(self, wal: WAL, frames: list[WALFrame]): self.wal = wal self.frames = frames - self._page_map = None def __contains__(self, page: int) -> bool: return page in self.page_map @@ -559,19 +685,24 @@ def __getitem__(self, page: int) -> WALFrame: return self.page_map[page] def __repr__(self) -> str: - return f"" + return f"<{self.__class__.__name__} frames={len(self.frames)}>" - @property + @cached_property def page_map(self) -> dict[int, WALFrame]: - if not self._page_map: - self._page_map = {frame.page_number: frame for frame in self.frames} - - return self._page_map + return {frame.page_number: frame for frame in self.frames} def get(self, page: int, default: Any = None) -> WALFrame: return self.page_map.get(page, default) +class WALCheckpoint(_WALFramesCollection): + pass + + +class WALCommit(_WALFramesCollection): + pass + + def wal_checksum(buf: bytes, endian: str = ">") -> tuple[int, int]: """For future use, will be used when WAL is fully implemented""" diff --git a/tests/_data/sqlite3/generate_sqlite.py b/tests/_data/sqlite3/generate_sqlite.py new file mode 100644 index 0000000..d28478b --- /dev/null +++ b/tests/_data/sqlite3/generate_sqlite.py @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c65612d4bb03b3489d9e3e5c6ec6f740a53a824c20bab660ad73791f26281f5 +size 2191 diff --git a/tests/_data/sqlite3/test.sqlite b/tests/_data/sqlite3/test.sqlite index 9a30530..4b0d00a 100644 --- a/tests/_data/sqlite3/test.sqlite +++ b/tests/_data/sqlite3/test.sqlite @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8a2b588f035eca3a644799c77902e9058e1e577392b2616ab08e9cd2c73c9fb -size 16384 +oid 
sha256:10c92d94ad110823f169c6221941309576dccacd459f8b627a1eb54f5a3f813c +size 20480 diff --git a/tests/_data/sqlite3/test.sqlite-wal b/tests/_data/sqlite3/test.sqlite-wal new file mode 100644 index 0000000..ae4b789 --- /dev/null +++ b/tests/_data/sqlite3/test.sqlite-wal @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:263c7b76fff7ece44363f5c40fd50718586583385a305b0b3bcca121d32f98de +size 70072 diff --git a/tests/sqlite3/conftest.py b/tests/sqlite3/conftest.py index a312084..4416331 100644 --- a/tests/sqlite3/conftest.py +++ b/tests/sqlite3/conftest.py @@ -4,17 +4,22 @@ import pytest -from tests._util import open_file +from tests._util import absolute_path if TYPE_CHECKING: from collections.abc import Iterator @pytest.fixture -def sqlite_db() -> Iterator[BinaryIO]: - yield from open_file("_data/sqlite3/test.sqlite") +def sqlite_db() -> BinaryIO: + return absolute_path("_data/sqlite3/test.sqlite").open("rb") @pytest.fixture -def empty_db() -> Iterator[BinaryIO]: - yield from open_file("_data/sqlite3/empty.sqlite") +def sqlite_wal() -> BinaryIO: + return absolute_path("_data/sqlite3/test.sqlite-wal").open("rb") + + +@pytest.fixture +def empty_db() -> BinaryIO: + return absolute_path("_data/sqlite3/empty.sqlite").open("rb") diff --git a/tests/sqlite3/test_sqlite3.py b/tests/sqlite3/test_sqlite3.py index 37e2db0..4593e97 100644 --- a/tests/sqlite3/test_sqlite3.py +++ b/tests/sqlite3/test_sqlite3.py @@ -14,7 +14,7 @@ def test_sqlite(sqlite_db: BinaryIO) -> None: assert s.header.magic == sqlite3.SQLITE3_HEADER_MAGIC tables = list(s.tables()) - assert len(tables) == 1 + assert len(tables) == 2 table = tables[0] assert table.name == "test" @@ -24,7 +24,7 @@ def test_sqlite(sqlite_db: BinaryIO) -> None: assert s.table("test").__dict__ == table.__dict__ rows = list(table.rows()) - assert len(rows) == 5 + assert len(rows) == 10 assert rows[0].id == 1 assert rows[0].name == "testing" assert rows[0].value == 1337 @@ -46,6 +46,131 @@ def 
test_sqlite(sqlite_db: BinaryIO) -> None: assert list(rows[0]) == [("id", 1), ("name", "testing"), ("value", 1337)] +def test_sqlite_wal(sqlite_db: BinaryIO, sqlite_wal: BinaryIO) -> None: + # After the first checkpoint the "after checkpoint" entries are present + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, wal_checkpoint=2) + + table = list(s.tables())[0] + + rows = list(table.rows()) + assert len(rows) == 9 + + assert rows[0].id == 1 + assert rows[0].name == "testing" + assert rows[0].value == 1337 + assert rows[1].id == 2 + assert rows[1].name == "omg" + assert rows[1].value == 7331 + assert rows[2].id == 3 + assert rows[2].name == "A" * 4100 + assert rows[2].value == 4100 + assert rows[3].id == 4 + assert rows[3].name == "B" * 4100 + assert rows[3].value == 4100 + assert rows[4].id == 5 + assert rows[4].name == "negative" + assert rows[4].value == -11644473429 + assert rows[5].id == 6 + assert rows[5].name == "after checkpoint" + assert rows[5].value == 42 + assert rows[6].id == 7 + assert rows[6].name == "after checkpoint" + assert rows[6].value == 43 + assert rows[7].id == 8 + assert rows[7].name == "after checkpoint" + assert rows[7].value == 44 + assert rows[8].id == 9 + assert rows[8].name == "after checkpoint" + assert rows[8].value == 45 + + sqlite_wal.seek(0) + sqlite_db.seek(0) + + # After the second checkpoint two more entries are present ("second checkpoint") + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, wal_checkpoint=1) + + table = list(s.tables())[0] + + rows = list(table.rows()) + assert len(rows) == 11 + + assert rows[0].id == 1 + assert rows[0].name == "testing" + assert rows[0].value == 1337 + assert rows[1].id == 2 + assert rows[1].name == "omg" + assert rows[1].value == 7331 + assert rows[2].id == 3 + assert rows[2].name == "A" * 4100 + assert rows[2].value == 4100 + assert rows[3].id == 4 + assert rows[3].name == "B" * 4100 + assert rows[3].value == 4100 + assert rows[4].id == 5 + assert rows[4].name == "negative" + assert rows[4].value == 
-11644473429 + assert rows[5].id == 6 + assert rows[5].name == "after checkpoint" + assert rows[5].value == 42 + assert rows[6].id == 7 + assert rows[6].name == "after checkpoint" + assert rows[6].value == 43 + assert rows[7].id == 8 + assert rows[7].name == "after checkpoint" + assert rows[7].value == 44 + assert rows[8].id == 9 + assert rows[8].name == "after checkpoint" + assert rows[8].value == 45 + assert rows[9].id == 10 + assert rows[9].name == "second checkpoint" + assert rows[9].value == 100 + assert rows[10].id == 11 + assert rows[10].name == "second checkpoint" + assert rows[10].value == 101 + + sqlite_wal.seek(0) + sqlite_db.seek(0) + + # After the third checkpoint the deletion and update of one "after checkpoint" are reflected + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, wal_checkpoint=0) + + table = list(s.tables())[0] + rows = list(table.rows()) + + assert len(rows) == 10 + + assert rows[0].id == 1 + assert rows[0].name == "testing" + assert rows[0].value == 1337 + assert rows[1].id == 2 + assert rows[1].name == "omg" + assert rows[1].value == 7331 + assert rows[2].id == 3 + assert rows[2].name == "A" * 4100 + assert rows[2].value == 4100 + assert rows[3].id == 4 + assert rows[3].name == "B" * 4100 + assert rows[3].value == 4100 + assert rows[4].id == 5 + assert rows[4].name == "negative" + assert rows[4].value == -11644473429 + assert rows[5].id == 6 + assert rows[5].name == "after checkpoint" + assert rows[5].value == 42 + assert rows[6].id == 8 + assert rows[6].name == "after checkpoint" + assert rows[6].value == 44 + assert rows[7].id == 9 + assert rows[7].name == "wow" + assert rows[7].value == 1234 + assert rows[8].id == 10 + assert rows[8].name == "second checkpoint" + assert rows[8].value == 100 + assert rows[9].id == 11 + assert rows[9].name == "second checkpoint" + assert rows[9].value == 101 + + @pytest.mark.parametrize( ("input", "encoding", "expected_output"), [ From 9e4dc6ad202f39731f4639e331550b17ee6c0dc4 Mon Sep 17 00:00:00 2001 From: 
Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:42:07 +0100 Subject: [PATCH 02/28] Move WAL classes to separate file --- dissect/database/sqlite3/sqlite3.py | 170 +------------------------- dissect/database/sqlite3/wal.py | 178 ++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+), 167 deletions(-) create mode 100644 dissect/database/sqlite3/wal.py diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index e3ec57f..2fdcce5 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -2,8 +2,7 @@ import itertools import re -import struct -from functools import cached_property, lru_cache +from functools import lru_cache from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, BinaryIO @@ -16,6 +15,7 @@ NoCellData, ) from dissect.database.sqlite3.util import parse_table_columns_constraints +from dissect.database.sqlite3.wal import WAL, Checkpoint if TYPE_CHECKING: from collections.abc import Iterator @@ -50,17 +50,13 @@ SQLITE3_HEADER_MAGIC = b"SQLite format 3\x00" -WAL_HEADER_MAGIC_LE = 0x377F0682 -WAL_HEADER_MAGIC_BE = 0x377F0683 -WAL_HEADER_MAGIC = {WAL_HEADER_MAGIC_LE, WAL_HEADER_MAGIC_BE} - class SQLite3: def __init__( self, fh: Path | BinaryIO, wal_fh: Path | BinaryIO | None = None, - wal_checkpoint: WALCheckpoint | int | None = None, + wal_checkpoint: Checkpoint | int | None = None, ): # Use the provided file handle or try to open the file path. 
if hasattr(fh, "read"): @@ -557,166 +553,6 @@ def values(self) -> list[int | float | str | bytes | None]: return self._values -class WAL: - def __init__(self, fh: BinaryIO): - self.fh = fh - self.header = c_sqlite3.wal_header(fh) - - if self.header.magic not in WAL_HEADER_MAGIC: - raise InvalidDatabase("Invalid header magic") - - self.checksum_endian = "<" if self.header.magic == WAL_HEADER_MAGIC_LE else ">" - self._checkpoints = None - - self.frame = lru_cache(1024)(self.frame) - - def frame(self, frame_idx: int) -> WALFrame: - frame_size = len(c_sqlite3.wal_frame) + self.header.page_size - offset = len(c_sqlite3.wal_header) + frame_idx * frame_size - return WALFrame(self, offset) - - def frames(self) -> Iterator[WALFrame]: - frame_idx = 0 - while True: - try: - yield self.frame(frame_idx) - frame_idx += 1 - except EOFError: # noqa: PERF203 - break - - @cached_property - def commits(self) -> list[WALCommit]: - """Collects all commits in the WAL file. - - For commit records ``header.page_count`` specifies the size of the - database file in pages after the commit. For all other records it is 0. - - References: - - https://sqlite.org/fileformat2.html#wal_file_format - """ - commits = [] - frames = [] - - for frame in self.frames(): - frames.append(frame) - - # A commit record has a page_count header greater than zero - if frame.page_count > 0: - commits.append(WALCommit(self, frames)) - frames = [] - - return commits - - @cached_property - def checkpoints(self) -> list[WALCommit]: - """Return deduplicated WAL commits (checkpoints), newest first. - - Deduplicate commits by the salt1 value of their first frame. Later - commits overwrite earlier ones so the returned list contains the most - recent commit for each salt1, sorted descending. 
- - References: - - https://sqlite.org/fileformat2.html#wal_file_format - - https://sqlite.org/wal.html#checkpointing - """ - checkpoints_map: dict[int, WALCommit] = {} - for commit in self.commits: - if not commit.frames: - continue - salt1 = commit.frames[0].header.salt1 - # Keep the most recent commit for each salt1 (later commits overwrite). - checkpoints_map[salt1] = commit - - return sorted( - checkpoints_map.values(), - key=lambda c: c.frames[0].header.salt1, - #TODO Should this be reverse? Reverse means: cp0 is the latest, cpN the oldest - reverse=True, - ) - - -class WALFrame: - def __init__(self, wal: WAL, offset: int): - self.wal = wal - self.offset = offset - - self.fh = wal.fh - self._data = None - - self.fh.seek(offset) - self.header = c_sqlite3.wal_frame(self.fh) - - def __repr__(self) -> str: - return f"" - - @property - def valid(self) -> bool: - salt1_match = self.header.salt1 == self.wal.header.salt1 - salt2_match = self.header.salt2 == self.wal.header.salt2 - - return salt1_match and salt2_match - - @property - def data(self) -> bytes: - if not self._data: - self.fh.seek(self.offset + len(c_sqlite3.wal_frame)) - self._data = self.fh.read(self.wal.header.page_size) - return self._data - - @property - def page_number(self) -> int: - return self.header.page_number - - @property - def page_count(self) -> int: - return self.header.page_count - - -# Collection of frames that were committed together -class _WALFramesCollection: - def __init__(self, wal: WAL, frames: list[WALFrame]): - self.wal = wal - self.frames = frames - - def __contains__(self, page: int) -> bool: - return page in self.page_map - - def __getitem__(self, page: int) -> WALFrame: - return self.page_map[page] - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} frames={len(self.frames)}>" - - @cached_property - def page_map(self) -> dict[int, WALFrame]: - return {frame.page_number: frame for frame in self.frames} - - def get(self, page: int, default: Any = None) -> 
WALFrame: - return self.page_map.get(page, default) - - -class WALCheckpoint(_WALFramesCollection): - pass - - -class WALCommit(_WALFramesCollection): - pass - - -def wal_checksum(buf: bytes, endian: str = ">") -> tuple[int, int]: - """For future use, will be used when WAL is fully implemented""" - - s0 = s1 = 0 - num_ints = len(buf) // 4 - arr = struct.unpack(f"{endian}{num_ints}I", buf) - - for int_num in range(0, num_ints, 2): - s0 = (s0 + (arr[int_num] + s1)) & 0xFFFFFFFF - s1 = (s1 + (arr[int_num + 1] + s0)) & 0xFFFFFFFF - - return s0, s1 - - def walk_tree(sqlite: SQLite3, page: Page) -> Iterator[Cell]: if page.header.flags in ( c_sqlite3.PAGE_TYPE_LEAF_TABLE, diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py new file mode 100644 index 0000000..4969e17 --- /dev/null +++ b/dissect/database/sqlite3/wal.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +import struct +from functools import cached_property, lru_cache +from typing import TYPE_CHECKING, Any, BinaryIO + +from dissect.database.sqlite3.c_sqlite3 import c_sqlite3 +from dissect.database.sqlite3.exception import ( + InvalidDatabase, +) + +if TYPE_CHECKING: + from collections.abc import Iterator + + +WAL_HEADER_MAGIC_LE = 0x377F0682 +WAL_HEADER_MAGIC_BE = 0x377F0683 +WAL_HEADER_MAGIC = {WAL_HEADER_MAGIC_LE, WAL_HEADER_MAGIC_BE} + + +class WAL: + def __init__(self, fh: BinaryIO): + self.fh = fh + self.header = c_sqlite3.wal_header(fh) + + if self.header.magic not in WAL_HEADER_MAGIC: + raise InvalidDatabase("Invalid header magic") + + self.checksum_endian = "<" if self.header.magic == WAL_HEADER_MAGIC_LE else ">" + self._checkpoints = None + + self.frame = lru_cache(1024)(self.frame) + + def frame(self, frame_idx: int) -> Frame: + frame_size = len(c_sqlite3.wal_frame) + self.header.page_size + offset = len(c_sqlite3.wal_header) + frame_idx * frame_size + return Frame(self, offset) + + def frames(self) -> Iterator[Frame]: + frame_idx = 0 + while True: + try: + yield 
self.frame(frame_idx) + frame_idx += 1 + except EOFError: # noqa: PERF203 + break + + @cached_property + def commits(self) -> list[Commit]: + """Collects all commits in the WAL file. + + For commit records ``header.page_count`` specifies the size of the + database file in pages after the commit. For all other records it is 0. + + References: + - https://sqlite.org/fileformat2.html#wal_file_format + """ + commits = [] + frames = [] + + for frame in self.frames(): + frames.append(frame) + + # A commit record has a page_count header greater than zero + if frame.page_count > 0: + commits.append(Commit(self, frames)) + frames = [] + + return commits + + @cached_property + def checkpoints(self) -> list[Checkpoint]: + """Return deduplicated WAL commits (checkpoints), newest first. + + Deduplicate commits by the salt1 value of their first frame. Later + commits overwrite earlier ones so the returned list contains the most + recent commit for each salt1, sorted descending. + + References: + - https://sqlite.org/fileformat2.html#wal_file_format + - https://sqlite.org/wal.html#checkpointing + """ + checkpoints_map: dict[int, Checkpoint] = {} + for commit in self.commits: + if not commit.frames: + continue + salt1 = commit.frames[0].header.salt1 + # Keep the most recent commit for each salt1 (later commits overwrite). + checkpoints_map[salt1] = commit + + return sorted( + checkpoints_map.values(), + key=lambda c: c.frames[0].header.salt1, + #TODO Should this be reverse? 
Reverse means: cp0 is the latest, cpN the oldest + reverse=True, + ) + + +class Frame: + def __init__(self, wal: WAL, offset: int): + self.wal = wal + self.offset = offset + + self.fh = wal.fh + self._data = None + + self.fh.seek(offset) + self.header = c_sqlite3.wal_frame(self.fh) + + def __repr__(self) -> str: + return f"" + + @property + def valid(self) -> bool: + salt1_match = self.header.salt1 == self.wal.header.salt1 + salt2_match = self.header.salt2 == self.wal.header.salt2 + + return salt1_match and salt2_match + + @property + def data(self) -> bytes: + if not self._data: + self.fh.seek(self.offset + len(c_sqlite3.wal_frame)) + self._data = self.fh.read(self.wal.header.page_size) + return self._data + + @property + def page_number(self) -> int: + return self.header.page_number + + @property + def page_count(self) -> int: + return self.header.page_count + + +# Collection of frames that were committed together +class _FramesCollection: + def __init__(self, wal: WAL, frames: list[Frame]): + self.wal = wal + self.frames = frames + + def __contains__(self, page: int) -> bool: + return page in self.page_map + + def __getitem__(self, page: int) -> Frame: + return self.page_map[page] + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} frames={len(self.frames)}>" + + @cached_property + def page_map(self) -> dict[int, Frame]: + return {frame.page_number: frame for frame in self.frames} + + def get(self, page: int, default: Any = None) -> Frame: + return self.page_map.get(page, default) + + +class Checkpoint(_FramesCollection): + pass + + +class Commit(_FramesCollection): + pass + + +def wal_checksum(buf: bytes, endian: str = ">") -> tuple[int, int]: + """For future use, will be used when WAL is fully implemented""" + + s0 = s1 = 0 + num_ints = len(buf) // 4 + arr = struct.unpack(f"{endian}{num_ints}I", buf) + + for int_num in range(0, num_ints, 2): + s0 = (s0 + (arr[int_num] + s1)) & 0xFFFFFFFF + s1 = (s1 + (arr[int_num + 1] + s0)) & 0xFFFFFFFF + + 
return s0, s1 From a25688f3337c93d8321aafe465926d62be377a6c Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:49:25 +0100 Subject: [PATCH 03/28] Make linter happy --- dissect/database/sqlite3/wal.py | 2 +- tests/_data/sqlite3/__init__.py | 0 tests/_data/sqlite3/generate_sqlite.py | 4 ++-- tests/sqlite3/conftest.py | 5 +---- tests/sqlite3/test_sqlite3.py | 6 +++--- 5 files changed, 7 insertions(+), 10 deletions(-) create mode 100644 tests/_data/sqlite3/__init__.py diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index 4969e17..ade5044 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -91,7 +91,7 @@ def checkpoints(self) -> list[Checkpoint]: return sorted( checkpoints_map.values(), key=lambda c: c.frames[0].header.salt1, - #TODO Should this be reverse? Reverse means: cp0 is the latest, cpN the oldest + # TODO Should this be reverse? Reverse means: cp0 is the latest, cpN the oldest reverse=True, ) diff --git a/tests/_data/sqlite3/__init__.py b/tests/_data/sqlite3/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/_data/sqlite3/generate_sqlite.py b/tests/_data/sqlite3/generate_sqlite.py index d28478b..aaaac31 100644 --- a/tests/_data/sqlite3/generate_sqlite.py +++ b/tests/_data/sqlite3/generate_sqlite.py @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c65612d4bb03b3489d9e3e5c6ec6f740a53a824c20bab660ad73791f26281f5 -size 2191 +oid sha256:e903989231def93d3db8d50c61e4ec167a817f90618bf7b6c85f5264ffb4aaf6 +size 2328 diff --git a/tests/sqlite3/conftest.py b/tests/sqlite3/conftest.py index 4416331..e072ba0 100644 --- a/tests/sqlite3/conftest.py +++ b/tests/sqlite3/conftest.py @@ -1,14 +1,11 @@ from __future__ import annotations -from typing import TYPE_CHECKING, BinaryIO +from typing import BinaryIO import pytest from tests._util import absolute_path -if TYPE_CHECKING: - from collections.abc import 
Iterator - @pytest.fixture def sqlite_db() -> BinaryIO: diff --git a/tests/sqlite3/test_sqlite3.py b/tests/sqlite3/test_sqlite3.py index 4593e97..b37448f 100644 --- a/tests/sqlite3/test_sqlite3.py +++ b/tests/sqlite3/test_sqlite3.py @@ -50,7 +50,7 @@ def test_sqlite_wal(sqlite_db: BinaryIO, sqlite_wal: BinaryIO) -> None: # After the first checkpoint the "after checkpoint" entries are present s = sqlite3.SQLite3(sqlite_db, sqlite_wal, wal_checkpoint=2) - table = list(s.tables())[0] + table = next(iter(s.tables())) rows = list(table.rows()) assert len(rows) == 9 @@ -89,7 +89,7 @@ def test_sqlite_wal(sqlite_db: BinaryIO, sqlite_wal: BinaryIO) -> None: # After the second checkpoint two more entries are present ("second checkpoint") s = sqlite3.SQLite3(sqlite_db, sqlite_wal, wal_checkpoint=1) - table = list(s.tables())[0] + table = next(iter(s.tables())) rows = list(table.rows()) assert len(rows) == 11 @@ -134,7 +134,7 @@ def test_sqlite_wal(sqlite_db: BinaryIO, sqlite_wal: BinaryIO) -> None: # After the third checkpoint the deletion and update of one "after checkpoint" are reflected s = sqlite3.SQLite3(sqlite_db, sqlite_wal, wal_checkpoint=0) - table = list(s.tables())[0] + table = next(iter(s.tables())) rows = list(table.rows()) assert len(rows) == 10 From 47b7611c83a23cfe064bb2d08d05e53b9f2b4dff Mon Sep 17 00:00:00 2001 From: Pim <36573021+PimSanders@users.noreply.github.com> Date: Wed, 26 Nov 2025 07:54:09 +0100 Subject: [PATCH 04/28] Apply suggestions from code review Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- dissect/database/sqlite3/sqlite3.py | 11 +++++------ dissect/database/sqlite3/wal.py | 24 ++++++++++-------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 2fdcce5..9cc083d 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -55,8 +55,8 @@ class SQLite3: def __init__( self, fh: Path 
| BinaryIO, - wal_fh: Path | BinaryIO | None = None, - wal_checkpoint: Checkpoint | int | None = None, + wal: WAL | Path | BinaryIO | None = None, + checkpoint: Checkpoint | int | None = None, ): # Use the provided file handle or try to open the file path. if hasattr(fh, "read"): @@ -82,8 +82,7 @@ def __init__( elif self.path: # Check for common WAL sidecars next to the DB. for suffix in (".sqlite-wal", ".db-wal"): - candidate = self.path.with_suffix(suffix) - if candidate.exists(): + if (candidate := self.path.with_suffix(suffix)).exists(): wal_path = candidate wal_fh = wal_path.open("rb") break @@ -160,9 +159,9 @@ def indices(self) -> Iterator[Index]: def raw_page(self, num: int) -> bytes: """Retrieve the raw frame data for the given page number. - Reads the page from a checkpoint if provided. + Reads the page from a checkpoint, if this class was initialized with a WAL checkpoint. - Will first check if the WAL contains a more recent version of the page, + If a WAL is available, will first check if the WAL contains a more recent version of the page, otherwise it will read the page from the database file. References: diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index ade5044..23d5a35 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -47,10 +47,10 @@ def frames(self) -> Iterator[Frame]: @cached_property def commits(self) -> list[Commit]: - """Collects all commits in the WAL file. + """Return all commits in the WAL file. - For commit records ``header.page_count`` specifies the size of the - database file in pages after the commit. For all other records it is 0. + Commits are frames where ``header.page_count`` specifies the size of the + database file in pages after the commit. For all other frames it is 0. 
References: - https://sqlite.org/fileformat2.html#wal_file_format @@ -102,13 +102,12 @@ def __init__(self, wal: WAL, offset: int): self.offset = offset self.fh = wal.fh - self._data = None self.fh.seek(offset) self.header = c_sqlite3.wal_frame(self.fh) def __repr__(self) -> str: - return f"" + return f"" @property def valid(self) -> bool: @@ -119,10 +118,8 @@ def valid(self) -> bool: @property def data(self) -> bytes: - if not self._data: - self.fh.seek(self.offset + len(c_sqlite3.wal_frame)) - self._data = self.fh.read(self.wal.header.page_size) - return self._data + self.fh.seek(self.offset + len(c_sqlite3.wal_frame)) + return self.fh.read(self.wal.header.page_size) @property def page_number(self) -> int: @@ -134,7 +131,7 @@ def page_count(self) -> int: # Collection of frames that were committed together -class _FramesCollection: +class _FrameCollection: def __init__(self, wal: WAL, frames: list[Frame]): self.wal = wal self.frames = frames @@ -156,16 +153,15 @@ def get(self, page: int, default: Any = None) -> Frame: return self.page_map.get(page, default) -class Checkpoint(_FramesCollection): +class Checkpoint(_FrameCollection): pass -class Commit(_FramesCollection): +class Commit(_FrameCollection): pass -def wal_checksum(buf: bytes, endian: str = ">") -> tuple[int, int]: - """For future use, will be used when WAL is fully implemented""" +def checksum(buf: bytes, endian: str = ">") -> tuple[int, int]: s0 = s1 = 0 num_ints = len(buf) // 4 From fc76bfc9d0958945eab799268b8e42efb0851872 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 26 Nov 2025 11:16:54 +0100 Subject: [PATCH 05/28] Apply suggestions from code review --- dissect/database/sqlite3/sqlite3.py | 69 +++++++++++++++-------------- dissect/database/sqlite3/wal.py | 27 ++++++++++- tests/_data/sqlite3/__init__.py | 0 tests/sqlite3/conftest.py | 14 +++--- tests/sqlite3/test_sqlite3.py | 50 ++++++++++++++------- 5 files changed, 103 insertions(+), 57 
deletions(-) delete mode 100644 tests/_data/sqlite3/__init__.py diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 9cc083d..1aadfeb 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -48,10 +48,27 @@ 9: lambda fh: 1, } +# See https://sqlite.org/fileformat2.html#magic_header_string SQLITE3_HEADER_MAGIC = b"SQLite format 3\x00" class SQLite3: + """SQLite3 database class. + + Loads a SQLite3 database from the given file handle. Optionally a WAL file handle can be provided to read + changes from the WAL. Additionally, a specific checkpoint from the WAL can be applied. + + Args: + fh: The path or file-like object to open a SQLite3 database on. + wal: The path or file-like object to open a SQLite3 WAL file on. + checkpoint: The checkpoint to apply from the WAL file. Can be a Checkpoint object or an integer index. + + Raises: + InvalidDatabase: If the file-like object does not look like a SQLite3 database based on the header magic. + + References: + - https://sqlite.org/fileformat2.html + """ def __init__( self, fh: Path | BinaryIO, @@ -68,38 +85,25 @@ def __init__( self.fh = fh self.path = path + self.checkpoint = checkpoint + self.wal = None - # Use the provided WAL file handle or try to open a sidecar WAL file. - if wal_fh is not None: - if hasattr(wal_fh, "read"): - name = getattr(wal_fh, "name", None) - wal_path = Path(name) if name else None - else: - if not isinstance(wal_fh, Path): - wal_fh = Path(wal_fh) - wal_path = wal_fh - wal_fh = wal_path.open("rb") - elif self.path: + if wal: + self.wal = WAL(wal) + elif path: # Check for common WAL sidecars next to the DB. 
for suffix in (".sqlite-wal", ".db-wal"): if (candidate := self.path.with_suffix(suffix)).exists(): - wal_path = candidate - wal_fh = wal_path.open("rb") + self.wal = WAL(candidate.open("rb")) break - else: - wal_path = None - wal_fh = None - self.wal = WAL(wal_fh) if wal_fh else None - self.wal_path = wal_path if wal_fh else None - self.wal_checkpoint = wal_checkpoint - - if self.wal and self.wal_checkpoint is not None and isinstance(self.wal_checkpoint, int): - checkpoint = self.wal_checkpoint + # If a checkpoint index was provided, resolve it to a Checkpoint object. + if self.wal and isinstance(self.checkpoint, int): + checkpoint = self.checkpoint checkpoints = self.wal.checkpoints if checkpoint < 0 or checkpoint >= len(checkpoints): raise IndexError("WAL checkpoint index out of range") - self.wal_checkpoint = checkpoints[checkpoint] + self.checkpoint = checkpoints[checkpoint] self.header = c_sqlite3.header(self.fh) if self.header.magic != SQLITE3_HEADER_MAGIC: @@ -116,7 +120,7 @@ def __init__( self.page = lru_cache(256)(self.page) - def open_wal(self, fh: BinaryIO) -> None: + def open_wal(self, fh: Path | BinaryIO) -> None: self.wal = WAL(fh) def checkpoints(self) -> Iterator[SQLite3]: @@ -172,12 +176,15 @@ def raw_page(self, num: int) -> bytes: if (num < 1 or num > self.header.page_count) and self.header.page_count > 0: raise InvalidPageNumber("Page number exceeds boundaries") + if num == 1: # Page 1 is root + self.fh.seek(len(c_sqlite3.header)) + else: + self.fh.seek((num - 1) * self.page_size) + # If a specific WAL checkpoint was provided, use it instead of the on-disk page. - if self.wal and self.wal_checkpoint is not None: - if num == 1: - self.fh.seek(len(c_sqlite3.header)) - elif num in self.wal_checkpoint: - frame = self.wal_checkpoint.get(num) + if self.wal and self.checkpoint is not None: + if num in self.checkpoint: + frame = self.checkpoint.get(num) return frame.data else: # If the page is not present in the checkpoint, skip. 
@@ -199,10 +206,6 @@ def raw_page(self, num: int) -> bytes: if last_valid_frame in commit.frames: return last_valid_frame.data - if num == 1: # Page 1 is root - self.fh.seek(len(c_sqlite3.header)) - else: - self.fh.seek((num - 1) * self.page_size) return self.fh.read(self.header.page_size) def page(self, num: int) -> Page: diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index 23d5a35..a2392da 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -2,6 +2,7 @@ import struct from functools import cached_property, lru_cache +from pathlib import Path from typing import TYPE_CHECKING, Any, BinaryIO from dissect.database.sqlite3.c_sqlite3 import c_sqlite3 @@ -12,15 +13,26 @@ if TYPE_CHECKING: from collections.abc import Iterator - +# See https://sqlite.org/fileformat2.html#wal_file_format WAL_HEADER_MAGIC_LE = 0x377F0682 WAL_HEADER_MAGIC_BE = 0x377F0683 WAL_HEADER_MAGIC = {WAL_HEADER_MAGIC_LE, WAL_HEADER_MAGIC_BE} class WAL: - def __init__(self, fh: BinaryIO): + def __init__(self, fh: WAL | Path | BinaryIO): + # Use the provided WAL file handle or try to open a sidecar WAL file. + if hasattr(fh, "read"): + name = getattr(fh, "name", None) + path = Path(name) if name else None + else: + if not isinstance(fh, Path): + fh = Path(fh) + path = fh + fh = path.open("rb") + self.fh = fh + self.path = path self.header = c_sqlite3.wal_header(fh) if self.header.magic not in WAL_HEADER_MAGIC: @@ -154,10 +166,21 @@ def get(self, page: int, default: Any = None) -> Frame: class Checkpoint(_FrameCollection): + """A checkpoint is an operation that transfers all committed transactions from + the WAL file back into the main database file. + + References: + - https://sqlite.org/fileformat2.html#wal_file_format + """ pass class Commit(_FrameCollection): + """A commit is a collection of frames that were committed together. 
+ + References: + - https://sqlite.org/fileformat2.html#wal_file_format + """ pass diff --git a/tests/_data/sqlite3/__init__.py b/tests/_data/sqlite3/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/sqlite3/conftest.py b/tests/sqlite3/conftest.py index e072ba0..d4aaba5 100644 --- a/tests/sqlite3/conftest.py +++ b/tests/sqlite3/conftest.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import BinaryIO +from pathlib import Path import pytest @@ -8,15 +8,15 @@ @pytest.fixture -def sqlite_db() -> BinaryIO: - return absolute_path("_data/sqlite3/test.sqlite").open("rb") +def sqlite_db() -> Path: + return absolute_path("_data/sqlite3/test.sqlite") @pytest.fixture -def sqlite_wal() -> BinaryIO: - return absolute_path("_data/sqlite3/test.sqlite-wal").open("rb") +def sqlite_wal() -> Path: + return absolute_path("_data/sqlite3/test.sqlite-wal") @pytest.fixture -def empty_db() -> BinaryIO: - return absolute_path("_data/sqlite3/empty.sqlite").open("rb") +def empty_db() -> Path: + return absolute_path("_data/sqlite3/empty.sqlite") diff --git a/tests/sqlite3/test_sqlite3.py b/tests/sqlite3/test_sqlite3.py index b37448f..400e18b 100644 --- a/tests/sqlite3/test_sqlite3.py +++ b/tests/sqlite3/test_sqlite3.py @@ -1,19 +1,25 @@ from __future__ import annotations from io import BytesIO +from pathlib import Path from typing import Any, BinaryIO import pytest from dissect.database.sqlite3 import sqlite3 +def test_sqlite_binaryio(sqlite_db: Path) -> None: + s = sqlite3.SQLite3(sqlite_db.open("rb")) + _sqlite_read_data(s) -def test_sqlite(sqlite_db: BinaryIO) -> None: +def test_sqlite_path(sqlite_db: Path) -> None: s = sqlite3.SQLite3(sqlite_db) + _sqlite_read_data(s) - assert s.header.magic == sqlite3.SQLITE3_HEADER_MAGIC +def _sqlite_read_data(db: sqlite3.SQLite3) -> None: + assert db.header.magic == sqlite3.SQLITE3_HEADER_MAGIC - tables = list(s.tables()) + tables = list(db.tables()) assert len(tables) == 2 table = tables[0] @@ -21,7 +27,7 
@@ def test_sqlite(sqlite_db: BinaryIO) -> None: assert table.page == 2 assert [column.name for column in table.columns] == ["id", "name", "value"] assert table.primary_key == "id" - assert s.table("test").__dict__ == table.__dict__ + assert db.table("test").__dict__ == table.__dict__ rows = list(table.rows()) assert len(rows) == 10 @@ -46,10 +52,30 @@ def test_sqlite(sqlite_db: BinaryIO) -> None: assert list(rows[0]) == [("id", 1), ("name", "testing"), ("value", 1337)] -def test_sqlite_wal(sqlite_db: BinaryIO, sqlite_wal: BinaryIO) -> None: - # After the first checkpoint the "after checkpoint" entries are present - s = sqlite3.SQLite3(sqlite_db, sqlite_wal, wal_checkpoint=2) +def test_sqlite_wal_binaryio(sqlite_db: Path, sqlite_wal: Path) -> None: + s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=2) + _sqlite_read_checkpoint2(s) + + s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=1) + _sqlite_read_checkpoint1(s) + + s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=0) + _sqlite_read_checkpoint0(s) + + +def test_sqlite_wal_path(sqlite_db: Path, sqlite_wal: Path) -> None: + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=2) + _sqlite_read_checkpoint2(s) + + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=1) + _sqlite_read_checkpoint1(s) + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=0) + _sqlite_read_checkpoint0(s) + + +def _sqlite_read_checkpoint2(s: sqlite3.SQLite3) -> None: + # After the first checkpoint the "after checkpoint" entries are present table = next(iter(s.tables())) rows = list(table.rows()) @@ -83,12 +109,9 @@ def test_sqlite_wal(sqlite_db: BinaryIO, sqlite_wal: BinaryIO) -> None: assert rows[8].name == "after checkpoint" assert rows[8].value == 45 - sqlite_wal.seek(0) - sqlite_db.seek(0) +def _sqlite_read_checkpoint1(s: sqlite3.SQLite3) -> None: # After the second checkpoint two more entries are present ("second checkpoint") - s = 
sqlite3.SQLite3(sqlite_db, sqlite_wal, wal_checkpoint=1) - table = next(iter(s.tables())) rows = list(table.rows()) @@ -128,12 +151,9 @@ def test_sqlite_wal(sqlite_db: BinaryIO, sqlite_wal: BinaryIO) -> None: assert rows[10].name == "second checkpoint" assert rows[10].value == 101 - sqlite_wal.seek(0) - sqlite_db.seek(0) +def _sqlite_read_checkpoint0(s: sqlite3.SQLite3) -> None: # After the third checkpoint the deletion and update of one "after checkpoint" are reflected - s = sqlite3.SQLite3(sqlite_db, sqlite_wal, wal_checkpoint=0) - table = next(iter(s.tables())) rows = list(table.rows()) From aa342691c77a52a6a6413e019dca2a4c4d703ee8 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 26 Nov 2025 12:53:27 +0100 Subject: [PATCH 06/28] Apply suggestions from code review --- dissect/database/sqlite3/sqlite3.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 1aadfeb..9cd2a8d 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -55,8 +55,10 @@ class SQLite3: """SQLite3 database class. - Loads a SQLite3 database from the given file handle. Optionally a WAL file handle can be provided to read - changes from the WAL. Additionally, a specific checkpoint from the WAL can be applied. + Loads a SQLite3 database from the given file-like object or path. If a path is provided (or can be deduced + from the file-like object), a WAL file will be automatically looked for with a few common suffixes. + Optionally a WAL file-like object or path can be directly provided to read changes from the WAL (this takes + priority over the aforementioned WAL lookup). Additionally, a specific checkpoint from the WAL can be applied. Args: fh: The path or file-like object to open a SQLite3 database on. 
From 36ec9daccb0cf8cbab2da707b1e827201da2fbd3 Mon Sep 17 00:00:00 2001 From: Pim <36573021+PimSanders@users.noreply.github.com> Date: Wed, 26 Nov 2025 13:05:06 +0100 Subject: [PATCH 07/28] Apply suggestions from code review Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- dissect/database/sqlite3/sqlite3.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 9cd2a8d..7af8339 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -63,7 +63,7 @@ class SQLite3: Args: fh: The path or file-like object to open a SQLite3 database on. wal: The path or file-like object to open a SQLite3 WAL file on. - checkpoint: The checkpoint to apply from the WAL file. Can be a Checkpoint object or an integer index. + checkpoint: The checkpoint to apply from the WAL file. Can be a :class:`Checkpoint` object or an integer index. Raises: InvalidDatabase: If the file-like object does not look like a SQLite3 database based on the header magic. @@ -87,8 +87,8 @@ def __init__( self.fh = fh self.path = path - self.checkpoint = checkpoint self.wal = None + self.checkpoint = None if wal: self.wal = WAL(wal) @@ -96,16 +96,17 @@ def __init__( # Check for common WAL sidecars next to the DB. for suffix in (".sqlite-wal", ".db-wal"): if (candidate := self.path.with_suffix(suffix)).exists(): - self.wal = WAL(candidate.open("rb")) + self.wal = WAL(candidate) break # If a checkpoint index was provided, resolve it to a Checkpoint object. 
- if self.wal and isinstance(self.checkpoint, int): + if self.wal and isinstance(checkpoint, int): checkpoint = self.checkpoint - checkpoints = self.wal.checkpoints - if checkpoint < 0 or checkpoint >= len(checkpoints): + if checkpoint < 0 or checkpoint >= len(self.wal.checkpoints): raise IndexError("WAL checkpoint index out of range") - self.checkpoint = checkpoints[checkpoint] + self.checkpoint = self.wal.checkpoints[checkpoint] + else: + self.checkpoint = checkpoint self.header = c_sqlite3.header(self.fh) if self.header.magic != SQLITE3_HEADER_MAGIC: From c7278416ef4b388f20a7071a21f572fb00514efb Mon Sep 17 00:00:00 2001 From: Pim <36573021+PimSanders@users.noreply.github.com> Date: Wed, 26 Nov 2025 14:40:33 +0100 Subject: [PATCH 08/28] Apply suggestions from code review Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- dissect/database/sqlite3/sqlite3.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 7af8339..4a29ebb 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -91,7 +91,7 @@ def __init__( self.checkpoint = None if wal: - self.wal = WAL(wal) + self.wal = WAL(wal) if not isinstance(wal, WAL) else wal elif path: # Check for common WAL sidecars next to the DB. for suffix in (".sqlite-wal", ".db-wal"): @@ -101,7 +101,6 @@ def __init__( # If a checkpoint index was provided, resolve it to a Checkpoint object. 
if self.wal and isinstance(checkpoint, int): - checkpoint = self.checkpoint if checkpoint < 0 or checkpoint >= len(self.wal.checkpoints): raise IndexError("WAL checkpoint index out of range") self.checkpoint = self.wal.checkpoints[checkpoint] @@ -131,7 +130,7 @@ def checkpoints(self) -> Iterator[SQLite3]: return for checkpoint in self.wal.commits: - yield SQLite3(self.fh, self.wal.fh, checkpoint) + yield SQLite3(self.fh, self.wal, checkpoint) def table(self, name: str) -> Table | None: name = name.lower() @@ -189,9 +188,6 @@ def raw_page(self, num: int) -> bytes: if num in self.checkpoint: frame = self.checkpoint.get(num) return frame.data - else: - # If the page is not present in the checkpoint, skip. - pass # Check if the latest valid instance of the page is committed (either the frame itself # is the commit frame or it is included in a commit's frames). If so, return that frame's data. From f1dafd832aa0cfb49c593ec9366c602aa0a82661 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 26 Nov 2025 14:41:58 +0100 Subject: [PATCH 09/28] Remove open_wal() function --- dissect/database/sqlite3/sqlite3.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 4a29ebb..dffe3e5 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -122,9 +122,6 @@ def __init__( self.page = lru_cache(256)(self.page) - def open_wal(self, fh: Path | BinaryIO) -> None: - self.wal = WAL(fh) - def checkpoints(self) -> Iterator[SQLite3]: if not self.wal: return From e16fde655d25f774b4c72d2c4f3e042e253ed1ae Mon Sep 17 00:00:00 2001 From: Pim <36573021+PimSanders@users.noreply.github.com> Date: Wed, 26 Nov 2025 14:57:06 +0100 Subject: [PATCH 10/28] Update dissect/database/sqlite3/sqlite3.py Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- dissect/database/sqlite3/sqlite3.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index dffe3e5..00b3122 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -126,7 +126,7 @@ def checkpoints(self) -> Iterator[SQLite3]: if not self.wal: return - for checkpoint in self.wal.commits: + for checkpoint in self.wal.checkpoints: yield SQLite3(self.fh, self.wal, checkpoint) def table(self, name: str) -> Table | None: From 5c1bdaf0d4aa223fd3ae8c91195b87a63322cba8 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 26 Nov 2025 16:20:30 +0100 Subject: [PATCH 11/28] Apply suggestions from code review --- dissect/database/sqlite3/sqlite3.py | 17 +++++------------ dissect/database/sqlite3/wal.py | 2 ++ 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 00b3122..624ebb5 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -177,6 +177,7 @@ def raw_page(self, num: int) -> bytes: if num == 1: # Page 1 is root self.fh.seek(len(c_sqlite3.header)) + return self.fh.read(self.header.page_size) else: self.fh.seek((num - 1) * self.page_size) @@ -189,18 +190,10 @@ def raw_page(self, num: int) -> bytes: # Check if the latest valid instance of the page is committed (either the frame itself # is the commit frame or it is included in a commit's frames). If so, return that frame's data. if self.wal: - frames = list(self.wal.frames()) - last_valid_frame = None - for f in frames: - if f.valid and f.page_number == num: - last_valid_frame = f - - if last_valid_frame is not None: - for commit in self.wal.commits: - # commit.frames contains all frames that were committed together; - # if our last valid frame is in one of those, it's part of that commit. 
- if last_valid_frame in commit.frames: - return last_valid_frame.data + for commit in self.wal.commits: + if num in commit: + frame = commit.get(num) + return frame.data return self.fh.read(self.header.page_size) diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index a2392da..de86cc4 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -78,6 +78,8 @@ def commits(self) -> list[Commit]: commits.append(Commit(self, frames)) frames = [] + #TODO There might be data stored in later frames after the commit? + return commits @cached_property From b134330ae31d29b2ee5b6d164349cdebd4e947d2 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 26 Nov 2025 16:28:17 +0100 Subject: [PATCH 12/28] Make linter happy again --- dissect/database/sqlite3/sqlite3.py | 11 +++++------ dissect/database/sqlite3/wal.py | 5 +---- tests/sqlite3/conftest.py | 5 ++++- tests/sqlite3/test_sqlite3.py | 9 +++++++-- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 624ebb5..9485ccb 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -71,6 +71,7 @@ class SQLite3: References: - https://sqlite.org/fileformat2.html """ + def __init__( self, fh: Path | BinaryIO, @@ -178,14 +179,12 @@ def raw_page(self, num: int) -> bytes: if num == 1: # Page 1 is root self.fh.seek(len(c_sqlite3.header)) return self.fh.read(self.header.page_size) - else: - self.fh.seek((num - 1) * self.page_size) + self.fh.seek((num - 1) * self.page_size) # If a specific WAL checkpoint was provided, use it instead of the on-disk page. 
- if self.wal and self.checkpoint is not None: - if num in self.checkpoint: - frame = self.checkpoint.get(num) - return frame.data + if self.wal and self.checkpoint is not None and num in self.checkpoint: + frame = self.checkpoint.get(num) + return frame.data # Check if the latest valid instance of the page is committed (either the frame itself # is the commit frame or it is included in a commit's frames). If so, return that frame's data. diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index de86cc4..6025c31 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -78,7 +78,7 @@ def commits(self) -> list[Commit]: commits.append(Commit(self, frames)) frames = [] - #TODO There might be data stored in later frames after the commit? + # TODO There might be data stored in later frames after the commit? return commits @@ -174,7 +174,6 @@ class Checkpoint(_FrameCollection): References: - https://sqlite.org/fileformat2.html#wal_file_format """ - pass class Commit(_FrameCollection): @@ -183,11 +182,9 @@ class Commit(_FrameCollection): References: - https://sqlite.org/fileformat2.html#wal_file_format """ - pass def checksum(buf: bytes, endian: str = ">") -> tuple[int, int]: - s0 = s1 = 0 num_ints = len(buf) // 4 arr = struct.unpack(f"{endian}{num_ints}I", buf) diff --git a/tests/sqlite3/conftest.py b/tests/sqlite3/conftest.py index d4aaba5..9f86947 100644 --- a/tests/sqlite3/conftest.py +++ b/tests/sqlite3/conftest.py @@ -1,11 +1,14 @@ from __future__ import annotations -from pathlib import Path +from typing import TYPE_CHECKING import pytest from tests._util import absolute_path +if TYPE_CHECKING: + from pathlib import Path + @pytest.fixture def sqlite_db() -> Path: diff --git a/tests/sqlite3/test_sqlite3.py b/tests/sqlite3/test_sqlite3.py index 400e18b..320f825 100644 --- a/tests/sqlite3/test_sqlite3.py +++ b/tests/sqlite3/test_sqlite3.py @@ -1,21 +1,26 @@ from __future__ import annotations from io import BytesIO 
-from pathlib import Path -from typing import Any, BinaryIO +from typing import TYPE_CHECKING, Any, BinaryIO import pytest from dissect.database.sqlite3 import sqlite3 +if TYPE_CHECKING: + from pathlib import Path + + def test_sqlite_binaryio(sqlite_db: Path) -> None: s = sqlite3.SQLite3(sqlite_db.open("rb")) _sqlite_read_data(s) + def test_sqlite_path(sqlite_db: Path) -> None: s = sqlite3.SQLite3(sqlite_db) _sqlite_read_data(s) + def _sqlite_read_data(db: sqlite3.SQLite3) -> None: assert db.header.magic == sqlite3.SQLITE3_HEADER_MAGIC From 95e9ec867b8ed962de2a880be4700a72448fa018 Mon Sep 17 00:00:00 2001 From: Pim <36573021+PimSanders@users.noreply.github.com> Date: Mon, 1 Dec 2025 19:21:10 +0100 Subject: [PATCH 13/28] Apply suggestions from code review Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- dissect/database/sqlite3/sqlite3.py | 4 ++-- dissect/database/sqlite3/wal.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 9485ccb..eb4dba1 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -179,10 +179,9 @@ def raw_page(self, num: int) -> bytes: if num == 1: # Page 1 is root self.fh.seek(len(c_sqlite3.header)) return self.fh.read(self.header.page_size) - self.fh.seek((num - 1) * self.page_size) # If a specific WAL checkpoint was provided, use it instead of the on-disk page. 
- if self.wal and self.checkpoint is not None and num in self.checkpoint: + if self.checkpoint is not None and num in self.checkpoint: frame = self.checkpoint.get(num) return frame.data @@ -194,6 +193,7 @@ def raw_page(self, num: int) -> bytes: frame = commit.get(num) return frame.data + self.fh.seek((num - 1) * self.page_size) return self.fh.read(self.header.page_size) def page(self, num: int) -> Page: diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index 6025c31..ceddae7 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -1,5 +1,7 @@ from __future__ import annotations +import logging +import os import struct from functools import cached_property, lru_cache from pathlib import Path @@ -12,6 +14,8 @@ if TYPE_CHECKING: from collections.abc import Iterator +log = logging.getLogger(__name__) +log.setLevel(os.getenv("DISSECT_LOG_SQLITE3", "CRITICAL")) # See https://sqlite.org/fileformat2.html#wal_file_format WAL_HEADER_MAGIC_LE = 0x377F0682 @@ -39,7 +43,6 @@ def __init__(self, fh: WAL | Path | BinaryIO): raise InvalidDatabase("Invalid header magic") self.checksum_endian = "<" if self.header.magic == WAL_HEADER_MAGIC_LE else ">" - self._checkpoints = None self.frame = lru_cache(1024)(self.frame) @@ -144,8 +147,8 @@ def page_count(self) -> int: return self.header.page_count -# Collection of frames that were committed together class _FrameCollection: + """Convenience class to keep track of a collection of frames that were committed together.""" def __init__(self, wal: WAL, frames: list[Frame]): self.wal = wal self.frames = frames From 527c8ec5e3b6579aaccaa1f52ededa61df32641d Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Mon, 1 Dec 2025 19:27:15 +0100 Subject: [PATCH 14/28] Move files --- tests/_data/sqlite3/generate_sqlite.py | 3 - tests/_tools/sqlite3/generate_sqlite.py | 86 ++++++++++++++ tests/sqlite3/test_sqlite3.py | 139 ---------------------- 
tests/sqlite3/test_wal.py | 147 ++++++++++++++++++++++++ 4 files changed, 233 insertions(+), 142 deletions(-) delete mode 100644 tests/_data/sqlite3/generate_sqlite.py create mode 100644 tests/_tools/sqlite3/generate_sqlite.py create mode 100644 tests/sqlite3/test_wal.py diff --git a/tests/_data/sqlite3/generate_sqlite.py b/tests/_data/sqlite3/generate_sqlite.py deleted file mode 100644 index aaaac31..0000000 --- a/tests/_data/sqlite3/generate_sqlite.py +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e903989231def93d3db8d50c61e4ec167a817f90618bf7b6c85f5264ffb4aaf6 -size 2328 diff --git a/tests/_tools/sqlite3/generate_sqlite.py b/tests/_tools/sqlite3/generate_sqlite.py new file mode 100644 index 0000000..f9df4ad --- /dev/null +++ b/tests/_tools/sqlite3/generate_sqlite.py @@ -0,0 +1,86 @@ +import sqlite3 +from pathlib import Path + +conn = sqlite3.connect("db.sqlite", isolation_level=None) + +# Set WAL mode +conn.execute("PRAGMA journal_mode=WAL;") + +# Disable automatic checkpoints to keep all data in WAL for testing +conn.execute("PRAGMA wal_autocheckpoint=-1;") + + +def create_table() -> None: + conn.execute( + """ + CREATE TABLE IF NOT EXISTS test ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + value INTEGER NOT NULL + ) + """ + ) + + +def insert_data(name: str, value: str | int) -> None: + conn.execute("INSERT INTO test (name, value) VALUES (?, ?)", (name, value)) + + +def delete_data(name: str, value: str | int) -> None: + conn.execute("DELETE FROM test WHERE name = ? AND value = ?", (name, value)) + + +def update_data(old_name: str, old_value: str | int, new_name: str, new_value: str | int) -> None: + conn.execute( + "UPDATE test SET name = ?, value = ? WHERE name = ? 
AND value = ?", + (new_name, new_value, old_name, old_value), + ) + + +def create_checkpoint() -> None: + conn.execute("PRAGMA wal_checkpoint(FULL);") + + +def move_files() -> None: + Path("db.sqlite").rename("test.sqlite") + Path("db.sqlite-wal").rename("test.sqlite-wal") + Path("db.sqlite-shm").rename("test.sqlite-shm") + + # Remove this line if the shm file is needed as well + Path("test.sqlite-shm").unlink() + + +if __name__ == "__main__": + create_table() + + # Initial data + insert_data("testing", 1337) + insert_data("omg", 7331) + insert_data("A" * 4100, 4100) + insert_data("B" * 4100, 4100) + insert_data("negative", -11644473429) + + create_checkpoint() + + # Insert extra data after the first checkpoint + insert_data("after checkpoint", 42) + insert_data("after checkpoint", 43) + insert_data("after checkpoint", 44) + insert_data("after checkpoint", 45) + + create_checkpoint() + + # More data after second checkpoint, fewer entries to ensure both checkpoints will be in WAL + insert_data("second checkpoint", 100) + insert_data("second checkpoint", 101) + + create_checkpoint() + + # Modify some data after third checkpoint + delete_data("after checkpoint", 43) + update_data("after checkpoint", 45, "wow", 1234) + + # Rename files to prevent automatic cleanup + move_files() + + conn.close() diff --git a/tests/sqlite3/test_sqlite3.py b/tests/sqlite3/test_sqlite3.py index 320f825..14caa72 100644 --- a/tests/sqlite3/test_sqlite3.py +++ b/tests/sqlite3/test_sqlite3.py @@ -57,145 +57,6 @@ def _sqlite_read_data(db: sqlite3.SQLite3) -> None: assert list(rows[0]) == [("id", 1), ("name", "testing"), ("value", 1337)] -def test_sqlite_wal_binaryio(sqlite_db: Path, sqlite_wal: Path) -> None: - s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=2) - _sqlite_read_checkpoint2(s) - - s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=1) - _sqlite_read_checkpoint1(s) - - s = sqlite3.SQLite3(sqlite_db.open("rb"), 
sqlite_wal.open("rb"), checkpoint=0) - _sqlite_read_checkpoint0(s) - - -def test_sqlite_wal_path(sqlite_db: Path, sqlite_wal: Path) -> None: - s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=2) - _sqlite_read_checkpoint2(s) - - s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=1) - _sqlite_read_checkpoint1(s) - - s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=0) - _sqlite_read_checkpoint0(s) - - -def _sqlite_read_checkpoint2(s: sqlite3.SQLite3) -> None: - # After the first checkpoint the "after checkpoint" entries are present - table = next(iter(s.tables())) - - rows = list(table.rows()) - assert len(rows) == 9 - - assert rows[0].id == 1 - assert rows[0].name == "testing" - assert rows[0].value == 1337 - assert rows[1].id == 2 - assert rows[1].name == "omg" - assert rows[1].value == 7331 - assert rows[2].id == 3 - assert rows[2].name == "A" * 4100 - assert rows[2].value == 4100 - assert rows[3].id == 4 - assert rows[3].name == "B" * 4100 - assert rows[3].value == 4100 - assert rows[4].id == 5 - assert rows[4].name == "negative" - assert rows[4].value == -11644473429 - assert rows[5].id == 6 - assert rows[5].name == "after checkpoint" - assert rows[5].value == 42 - assert rows[6].id == 7 - assert rows[6].name == "after checkpoint" - assert rows[6].value == 43 - assert rows[7].id == 8 - assert rows[7].name == "after checkpoint" - assert rows[7].value == 44 - assert rows[8].id == 9 - assert rows[8].name == "after checkpoint" - assert rows[8].value == 45 - - -def _sqlite_read_checkpoint1(s: sqlite3.SQLite3) -> None: - # After the second checkpoint two more entries are present ("second checkpoint") - table = next(iter(s.tables())) - - rows = list(table.rows()) - assert len(rows) == 11 - - assert rows[0].id == 1 - assert rows[0].name == "testing" - assert rows[0].value == 1337 - assert rows[1].id == 2 - assert rows[1].name == "omg" - assert rows[1].value == 7331 - assert rows[2].id == 3 - assert rows[2].name == "A" * 4100 - assert rows[2].value == 4100 - 
assert rows[3].id == 4 - assert rows[3].name == "B" * 4100 - assert rows[3].value == 4100 - assert rows[4].id == 5 - assert rows[4].name == "negative" - assert rows[4].value == -11644473429 - assert rows[5].id == 6 - assert rows[5].name == "after checkpoint" - assert rows[5].value == 42 - assert rows[6].id == 7 - assert rows[6].name == "after checkpoint" - assert rows[6].value == 43 - assert rows[7].id == 8 - assert rows[7].name == "after checkpoint" - assert rows[7].value == 44 - assert rows[8].id == 9 - assert rows[8].name == "after checkpoint" - assert rows[8].value == 45 - assert rows[9].id == 10 - assert rows[9].name == "second checkpoint" - assert rows[9].value == 100 - assert rows[10].id == 11 - assert rows[10].name == "second checkpoint" - assert rows[10].value == 101 - - -def _sqlite_read_checkpoint0(s: sqlite3.SQLite3) -> None: - # After the third checkpoint the deletion and update of one "after checkpoint" are reflected - table = next(iter(s.tables())) - rows = list(table.rows()) - - assert len(rows) == 10 - - assert rows[0].id == 1 - assert rows[0].name == "testing" - assert rows[0].value == 1337 - assert rows[1].id == 2 - assert rows[1].name == "omg" - assert rows[1].value == 7331 - assert rows[2].id == 3 - assert rows[2].name == "A" * 4100 - assert rows[2].value == 4100 - assert rows[3].id == 4 - assert rows[3].name == "B" * 4100 - assert rows[3].value == 4100 - assert rows[4].id == 5 - assert rows[4].name == "negative" - assert rows[4].value == -11644473429 - assert rows[5].id == 6 - assert rows[5].name == "after checkpoint" - assert rows[5].value == 42 - assert rows[6].id == 8 - assert rows[6].name == "after checkpoint" - assert rows[6].value == 44 - assert rows[7].id == 9 - assert rows[7].name == "wow" - assert rows[7].value == 1234 - assert rows[8].id == 10 - assert rows[8].name == "second checkpoint" - assert rows[8].value == 100 - assert rows[9].id == 11 - assert rows[9].name == "second checkpoint" - assert rows[9].value == 101 - - 
@pytest.mark.parametrize( ("input", "encoding", "expected_output"), [ diff --git a/tests/sqlite3/test_wal.py b/tests/sqlite3/test_wal.py new file mode 100644 index 0000000..ca2f5f3 --- /dev/null +++ b/tests/sqlite3/test_wal.py @@ -0,0 +1,147 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from dissect.database.sqlite3 import sqlite3 + +if TYPE_CHECKING: + from pathlib import Path + + +def test_sqlite_wal_binaryio(sqlite_db: Path, sqlite_wal: Path) -> None: + s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=2) + _sqlite_read_checkpoint2(s) + + s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=1) + _sqlite_read_checkpoint1(s) + + s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=0) + _sqlite_read_checkpoint0(s) + + +def test_sqlite_wal_path(sqlite_db: Path, sqlite_wal: Path) -> None: + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=2) + _sqlite_read_checkpoint2(s) + + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=1) + _sqlite_read_checkpoint1(s) + + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=0) + _sqlite_read_checkpoint0(s) + + +def _sqlite_read_checkpoint2(s: sqlite3.SQLite3) -> None: + # After the first checkpoint the "after checkpoint" entries are present + table = next(iter(s.tables())) + + rows = list(table.rows()) + assert len(rows) == 9 + + assert rows[0].id == 1 + assert rows[0].name == "testing" + assert rows[0].value == 1337 + assert rows[1].id == 2 + assert rows[1].name == "omg" + assert rows[1].value == 7331 + assert rows[2].id == 3 + assert rows[2].name == "A" * 4100 + assert rows[2].value == 4100 + assert rows[3].id == 4 + assert rows[3].name == "B" * 4100 + assert rows[3].value == 4100 + assert rows[4].id == 5 + assert rows[4].name == "negative" + assert rows[4].value == -11644473429 + assert rows[5].id == 6 + assert rows[5].name == "after checkpoint" + assert rows[5].value == 42 + assert rows[6].id == 7 + assert 
rows[6].name == "after checkpoint" + assert rows[6].value == 43 + assert rows[7].id == 8 + assert rows[7].name == "after checkpoint" + assert rows[7].value == 44 + assert rows[8].id == 9 + assert rows[8].name == "after checkpoint" + assert rows[8].value == 45 + + +def _sqlite_read_checkpoint1(s: sqlite3.SQLite3) -> None: + # After the second checkpoint two more entries are present ("second checkpoint") + table = next(iter(s.tables())) + + rows = list(table.rows()) + assert len(rows) == 11 + + assert rows[0].id == 1 + assert rows[0].name == "testing" + assert rows[0].value == 1337 + assert rows[1].id == 2 + assert rows[1].name == "omg" + assert rows[1].value == 7331 + assert rows[2].id == 3 + assert rows[2].name == "A" * 4100 + assert rows[2].value == 4100 + assert rows[3].id == 4 + assert rows[3].name == "B" * 4100 + assert rows[3].value == 4100 + assert rows[4].id == 5 + assert rows[4].name == "negative" + assert rows[4].value == -11644473429 + assert rows[5].id == 6 + assert rows[5].name == "after checkpoint" + assert rows[5].value == 42 + assert rows[6].id == 7 + assert rows[6].name == "after checkpoint" + assert rows[6].value == 43 + assert rows[7].id == 8 + assert rows[7].name == "after checkpoint" + assert rows[7].value == 44 + assert rows[8].id == 9 + assert rows[8].name == "after checkpoint" + assert rows[8].value == 45 + assert rows[9].id == 10 + assert rows[9].name == "second checkpoint" + assert rows[9].value == 100 + assert rows[10].id == 11 + assert rows[10].name == "second checkpoint" + assert rows[10].value == 101 + + +def _sqlite_read_checkpoint0(s: sqlite3.SQLite3) -> None: + # After the third checkpoint the deletion and update of one "after checkpoint" are reflected + table = next(iter(s.tables())) + rows = list(table.rows()) + + assert len(rows) == 10 + + assert rows[0].id == 1 + assert rows[0].name == "testing" + assert rows[0].value == 1337 + assert rows[1].id == 2 + assert rows[1].name == "omg" + assert rows[1].value == 7331 + assert 
rows[2].id == 3 + assert rows[2].name == "A" * 4100 + assert rows[2].value == 4100 + assert rows[3].id == 4 + assert rows[3].name == "B" * 4100 + assert rows[3].value == 4100 + assert rows[4].id == 5 + assert rows[4].name == "negative" + assert rows[4].value == -11644473429 + assert rows[5].id == 6 + assert rows[5].name == "after checkpoint" + assert rows[5].value == 42 + assert rows[6].id == 8 + assert rows[6].name == "after checkpoint" + assert rows[6].value == 44 + assert rows[7].id == 9 + assert rows[7].name == "wow" + assert rows[7].value == 1234 + assert rows[8].id == 10 + assert rows[8].name == "second checkpoint" + assert rows[8].value == 100 + assert rows[9].id == 11 + assert rows[9].name == "second checkpoint" + assert rows[9].value == 101 From 323116c6800aa9c63a00f867eafb6a57a690abd8 Mon Sep 17 00:00:00 2001 From: Pim <36573021+PimSanders@users.noreply.github.com> Date: Mon, 1 Dec 2025 19:31:31 +0100 Subject: [PATCH 15/28] Update dissect/database/sqlite3/wal.py Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- dissect/database/sqlite3/wal.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index ceddae7..9eb8e68 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -81,7 +81,9 @@ def commits(self) -> list[Commit]: commits.append(Commit(self, frames)) frames = [] - # TODO There might be data stored in later frames after the commit? + if frames: + # TODO: Do we want to track these somewhere? 
+ log.warning("Found leftover %d frames after the last WAL commit", len(frames)) return commits From 307404dd176caf011d3d44445520f9034b3d26a6 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Mon, 1 Dec 2025 19:38:51 +0100 Subject: [PATCH 16/28] Invert checkpoint order to follow order of commits and frames --- dissect/database/sqlite3/wal.py | 2 -- tests/sqlite3/test_wal.py | 26 +++++++++++++------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index 9eb8e68..39b7ddd 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -110,8 +110,6 @@ def checkpoints(self) -> list[Checkpoint]: return sorted( checkpoints_map.values(), key=lambda c: c.frames[0].header.salt1, - # TODO Should this be reverse? Reverse means: cp0 is the latest, cpN the oldest - reverse=True, ) diff --git a/tests/sqlite3/test_wal.py b/tests/sqlite3/test_wal.py index ca2f5f3..02297cb 100644 --- a/tests/sqlite3/test_wal.py +++ b/tests/sqlite3/test_wal.py @@ -9,28 +9,28 @@ def test_sqlite_wal_binaryio(sqlite_db: Path, sqlite_wal: Path) -> None: - s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=2) - _sqlite_read_checkpoint2(s) - s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=1) _sqlite_read_checkpoint1(s) - s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=0) - _sqlite_read_checkpoint0(s) + s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=2) + _sqlite_read_checkpoint2(s) + s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=3) + _sqlite_read_checkpoint3(s) -def test_sqlite_wal_path(sqlite_db: Path, sqlite_wal: Path) -> None: - s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=2) - _sqlite_read_checkpoint2(s) +def test_sqlite_wal_path(sqlite_db: Path, sqlite_wal: Path) -> None: s = sqlite3.SQLite3(sqlite_db, 
sqlite_wal, checkpoint=1) _sqlite_read_checkpoint1(s) - s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=0) - _sqlite_read_checkpoint0(s) + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=2) + _sqlite_read_checkpoint2(s) + s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=3) + _sqlite_read_checkpoint3(s) -def _sqlite_read_checkpoint2(s: sqlite3.SQLite3) -> None: + +def _sqlite_read_checkpoint1(s: sqlite3.SQLite3) -> None: # After the first checkpoint the "after checkpoint" entries are present table = next(iter(s.tables())) @@ -66,7 +66,7 @@ def _sqlite_read_checkpoint2(s: sqlite3.SQLite3) -> None: assert rows[8].value == 45 -def _sqlite_read_checkpoint1(s: sqlite3.SQLite3) -> None: +def _sqlite_read_checkpoint2(s: sqlite3.SQLite3) -> None: # After the second checkpoint two more entries are present ("second checkpoint") table = next(iter(s.tables())) @@ -108,7 +108,7 @@ def _sqlite_read_checkpoint1(s: sqlite3.SQLite3) -> None: assert rows[10].value == 101 -def _sqlite_read_checkpoint0(s: sqlite3.SQLite3) -> None: +def _sqlite_read_checkpoint3(s: sqlite3.SQLite3) -> None: # After the third checkpoint the deletion and update of one "after checkpoint" are reflected table = next(iter(s.tables())) rows = list(table.rows()) From ae43fb7d27212aeb40e51a6261fe1a454ea2e584 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 3 Dec 2025 08:23:32 +0100 Subject: [PATCH 17/28] Improve WAL file detection --- dissect/database/sqlite3/sqlite3.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index eb4dba1..0353f52 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -94,11 +94,10 @@ def __init__( if wal: self.wal = WAL(wal) if not isinstance(wal, WAL) else wal elif path: - # Check for common WAL sidecars next to the DB. 
- for suffix in (".sqlite-wal", ".db-wal"): - if (candidate := self.path.with_suffix(suffix)).exists(): - self.wal = WAL(candidate) - break + # Check for WAL sidecar next to the DB. + wal_path = path.with_name(f"{path.name}-wal") + if wal_path.exists(): + self.wal = WAL(wal_path) # If a checkpoint index was provided, resolve it to a Checkpoint object. if self.wal and isinstance(checkpoint, int): From 2dfb115cfd77778ba913a1242da6c19b16d8cfeb Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Wed, 3 Dec 2025 09:17:24 +0100 Subject: [PATCH 18/28] Add tests for WAL file detection --- tests/sqlite3/test_wal.py | 40 ++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/tests/sqlite3/test_wal.py b/tests/sqlite3/test_wal.py index 02297cb..aff7021 100644 --- a/tests/sqlite3/test_wal.py +++ b/tests/sqlite3/test_wal.py @@ -10,27 +10,49 @@ def test_sqlite_wal_binaryio(sqlite_db: Path, sqlite_wal: Path) -> None: s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=1) - _sqlite_read_checkpoint1(s) + _sqlite_read_checkpoint_1(s) s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=2) - _sqlite_read_checkpoint2(s) + _sqlite_read_checkpoint_2(s) s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=3) - _sqlite_read_checkpoint3(s) + _sqlite_read_checkpoint_3(s) + + +def test_sqlite_wal_auto_detect_binaryio(sqlite_db: Path) -> None: + s = sqlite3.SQLite3(sqlite_db.open("rb"), checkpoint=1) + _sqlite_read_checkpoint_1(s) + + s = sqlite3.SQLite3(sqlite_db.open("rb"), checkpoint=2) + _sqlite_read_checkpoint_2(s) + + s = sqlite3.SQLite3(sqlite_db.open("rb"), checkpoint=3) + _sqlite_read_checkpoint_3(s) def test_sqlite_wal_path(sqlite_db: Path, sqlite_wal: Path) -> None: s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=1) - _sqlite_read_checkpoint1(s) + _sqlite_read_checkpoint_1(s) s = sqlite3.SQLite3(sqlite_db, sqlite_wal, 
checkpoint=2) - _sqlite_read_checkpoint2(s) + _sqlite_read_checkpoint_2(s) s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=3) - _sqlite_read_checkpoint3(s) + _sqlite_read_checkpoint_3(s) + + +def test_sqlite_wal_auto_detect_path(sqlite_db: Path) -> None: + s = sqlite3.SQLite3(sqlite_db, checkpoint=1) + _sqlite_read_checkpoint_1(s) + + s = sqlite3.SQLite3(sqlite_db, checkpoint=2) + _sqlite_read_checkpoint_2(s) + + s = sqlite3.SQLite3(sqlite_db, checkpoint=3) + _sqlite_read_checkpoint_3(s) -def _sqlite_read_checkpoint1(s: sqlite3.SQLite3) -> None: +def _sqlite_read_checkpoint_1(s: sqlite3.SQLite3) -> None: # After the first checkpoint the "after checkpoint" entries are present table = next(iter(s.tables())) @@ -66,7 +88,7 @@ def _sqlite_read_checkpoint1(s: sqlite3.SQLite3) -> None: assert rows[8].value == 45 -def _sqlite_read_checkpoint2(s: sqlite3.SQLite3) -> None: +def _sqlite_read_checkpoint_2(s: sqlite3.SQLite3) -> None: # After the second checkpoint two more entries are present ("second checkpoint") table = next(iter(s.tables())) @@ -108,7 +130,7 @@ def _sqlite_read_checkpoint2(s: sqlite3.SQLite3) -> None: assert rows[10].value == 101 -def _sqlite_read_checkpoint3(s: sqlite3.SQLite3) -> None: +def _sqlite_read_checkpoint_3(s: sqlite3.SQLite3) -> None: # After the third checkpoint the deletion and update of one "after checkpoint" are reflected table = next(iter(s.tables())) rows = list(table.rows()) From f38c19da093ddbcb5764d6d647b96e68b8e6bdc2 Mon Sep 17 00:00:00 2001 From: Pim <36573021+PimSanders@users.noreply.github.com> Date: Thu, 4 Dec 2025 11:03:06 +0100 Subject: [PATCH 19/28] Apply suggestions from code review Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- dissect/database/sqlite3/sqlite3.py | 8 ++++---- dissect/database/sqlite3/wal.py | 11 +++++------ tests/sqlite3/test_wal.py | 6 +++--- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py 
b/dissect/database/sqlite3/sqlite3.py index 0353f52..5dbd384 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -123,6 +123,7 @@ def __init__( self.page = lru_cache(256)(self.page) def checkpoints(self) -> Iterator[SQLite3]: + """Yield instances of the database at all available checkpoints in the WAL file, if applicable.""" if not self.wal: return @@ -180,18 +181,17 @@ def raw_page(self, num: int) -> bytes: return self.fh.read(self.header.page_size) # If a specific WAL checkpoint was provided, use it instead of the on-disk page. - if self.checkpoint is not None and num in self.checkpoint: - frame = self.checkpoint.get(num) + if self.checkpoint is not None and (frame := self.checkpoint.get(num)): return frame.data # Check if the latest valid instance of the page is committed (either the frame itself # is the commit frame or it is included in a commit's frames). If so, return that frame's data. if self.wal: for commit in self.wal.commits: - if num in commit: - frame = commit.get(num) + if (frame := commit.get(num)): return frame.data + # Else we read the page from the database file self.fh.seek((num - 1) * self.page_size) return self.fh.read(self.header.page_size) diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index 39b7ddd..85d02a6 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -8,12 +8,11 @@ from typing import TYPE_CHECKING, Any, BinaryIO from dissect.database.sqlite3.c_sqlite3 import c_sqlite3 -from dissect.database.sqlite3.exception import ( - InvalidDatabase, -) +from dissect.database.sqlite3.exception import InvalidDatabase if TYPE_CHECKING: from collections.abc import Iterator + log = logging.getLogger(__name__) log.setLevel(os.getenv("DISSECT_LOG_SQLITE3", "CRITICAL")) @@ -40,7 +39,7 @@ def __init__(self, fh: WAL | Path | BinaryIO): self.header = c_sqlite3.wal_header(fh) if self.header.magic not in WAL_HEADER_MAGIC: - raise 
InvalidDatabase("Invalid header magic") + raise InvalidDatabase("Invalid WAL header magic") self.checksum_endian = "<" if self.header.magic == WAL_HEADER_MAGIC_LE else ">" @@ -91,9 +90,9 @@ def commits(self) -> list[Commit]: def checkpoints(self) -> list[Checkpoint]: """Return deduplicated WAL commits (checkpoints), newest first. - Deduplicate commits by the salt1 value of their first frame. Later + Deduplicate commits by the ``salt1`` value of their first frame. Later commits overwrite earlier ones so the returned list contains the most - recent commit for each salt1, sorted descending. + recent commit for each ``salt1``, sorted descending. References: - https://sqlite.org/fileformat2.html#wal_file_format diff --git a/tests/sqlite3/test_wal.py b/tests/sqlite3/test_wal.py index aff7021..c58298b 100644 --- a/tests/sqlite3/test_wal.py +++ b/tests/sqlite3/test_wal.py @@ -52,7 +52,7 @@ def test_sqlite_wal_auto_detect_path(sqlite_db: Path) -> None: _sqlite_read_checkpoint_3(s) -def _sqlite_read_checkpoint_1(s: sqlite3.SQLite3) -> None: +def _assert_checkpoint_1(s: sqlite3.SQLite3) -> None: # After the first checkpoint the "after checkpoint" entries are present table = next(iter(s.tables())) @@ -88,7 +88,7 @@ def _sqlite_read_checkpoint_1(s: sqlite3.SQLite3) -> None: assert rows[8].value == 45 -def _sqlite_read_checkpoint_2(s: sqlite3.SQLite3) -> None: +def _assert_checkpoint_2(s: sqlite3.SQLite3) -> None: # After the second checkpoint two more entries are present ("second checkpoint") table = next(iter(s.tables())) @@ -130,7 +130,7 @@ def _sqlite_read_checkpoint_2(s: sqlite3.SQLite3) -> None: assert rows[10].value == 101 -def _sqlite_read_checkpoint_3(s: sqlite3.SQLite3) -> None: +def _assert_checkpoint_3(s: sqlite3.SQLite3) -> None: # After the third checkpoint the deletion and update of one "after checkpoint" are reflected table = next(iter(s.tables())) rows = list(table.rows()) From c29c27880c3c76e4c991ee18909bc3b5b337f537 Mon Sep 17 00:00:00 2001 From: Pim Sanders 
<36573021+PimSanders@users.noreply.github.com> Date: Thu, 4 Dec 2025 11:27:53 +0100 Subject: [PATCH 20/28] Apply suggestions from code review --- dissect/database/sqlite3/sqlite3.py | 28 ++++++++++++++-------------- dissect/database/sqlite3/wal.py | 4 ++-- tests/sqlite3/test_wal.py | 24 ++++++++++++------------ 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index 5dbd384..f20566e 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -91,6 +91,19 @@ def __init__( self.wal = None self.checkpoint = None + self.header = c_sqlite3.header(self.fh) + if self.header.magic != SQLITE3_HEADER_MAGIC: + raise InvalidDatabase("Invalid header magic") + + self.encoding = ENCODING.get(self.header.text_encoding, "utf-8") + self.page_size = self.header.page_size + if self.page_size == 1: + self.page_size = 65536 + + self.usable_page_size = self.page_size - self.header.reserved_size + if self.usable_page_size < 480: + raise InvalidDatabase("Usable page size is too small") + if wal: self.wal = WAL(wal) if not isinstance(wal, WAL) else wal elif path: @@ -107,23 +120,10 @@ def __init__( else: self.checkpoint = checkpoint - self.header = c_sqlite3.header(self.fh) - if self.header.magic != SQLITE3_HEADER_MAGIC: - raise InvalidDatabase("Invalid header magic") - - self.encoding = ENCODING.get(self.header.text_encoding, "utf-8") - self.page_size = self.header.page_size - if self.page_size == 1: - self.page_size = 65536 - - self.usable_page_size = self.page_size - self.header.reserved_size - if self.usable_page_size < 480: - raise InvalidDatabase("Usable page size is too small") - self.page = lru_cache(256)(self.page) def checkpoints(self) -> Iterator[SQLite3]: - """Yield instances of the database at all available checkpoints in the WAL file, if applicable.""" + """Yield instances of the database at all available checkpoints in the WAL file, if applicable.""" if 
not self.wal: return diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index 85d02a6..802e730 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -88,11 +88,11 @@ def commits(self) -> list[Commit]: @cached_property def checkpoints(self) -> list[Checkpoint]: - """Return deduplicated WAL commits (checkpoints), newest first. + """Return deduplicated checkpoints, oldest first. Deduplicate commits by the ``salt1`` value of their first frame. Later commits overwrite earlier ones so the returned list contains the most - recent commit for each ``salt1``, sorted descending. + recent commit for each ``salt1``, sorted ascending. References: - https://sqlite.org/fileformat2.html#wal_file_format diff --git a/tests/sqlite3/test_wal.py b/tests/sqlite3/test_wal.py index c58298b..b4b15b9 100644 --- a/tests/sqlite3/test_wal.py +++ b/tests/sqlite3/test_wal.py @@ -10,46 +10,46 @@ def test_sqlite_wal_binaryio(sqlite_db: Path, sqlite_wal: Path) -> None: s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=1) - _sqlite_read_checkpoint_1(s) + _assert_checkpoint_1(s) s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=2) - _sqlite_read_checkpoint_2(s) + _assert_checkpoint_2(s) s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=3) - _sqlite_read_checkpoint_3(s) + _assert_checkpoint_3(s) def test_sqlite_wal_auto_detect_binaryio(sqlite_db: Path) -> None: s = sqlite3.SQLite3(sqlite_db.open("rb"), checkpoint=1) - _sqlite_read_checkpoint_1(s) + _assert_checkpoint_1(s) s = sqlite3.SQLite3(sqlite_db.open("rb"), checkpoint=2) - _sqlite_read_checkpoint_2(s) + _assert_checkpoint_2(s) s = sqlite3.SQLite3(sqlite_db.open("rb"), checkpoint=3) - _sqlite_read_checkpoint_3(s) + _assert_checkpoint_3(s) def test_sqlite_wal_path(sqlite_db: Path, sqlite_wal: Path) -> None: s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=1) - _sqlite_read_checkpoint_1(s) + 
_assert_checkpoint_1(s) s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=2) - _sqlite_read_checkpoint_2(s) + _assert_checkpoint_2(s) s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=3) - _sqlite_read_checkpoint_3(s) + _assert_checkpoint_3(s) def test_sqlite_wal_auto_detect_path(sqlite_db: Path) -> None: s = sqlite3.SQLite3(sqlite_db, checkpoint=1) - _sqlite_read_checkpoint_1(s) + _assert_checkpoint_1(s) s = sqlite3.SQLite3(sqlite_db, checkpoint=2) - _sqlite_read_checkpoint_2(s) + _assert_checkpoint_2(s) s = sqlite3.SQLite3(sqlite_db, checkpoint=3) - _sqlite_read_checkpoint_3(s) + _assert_checkpoint_3(s) def _assert_checkpoint_1(s: sqlite3.SQLite3) -> None: From f79d4397b5a31a62e9d850de3abfb5154bd45b3d Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Thu, 4 Dec 2025 12:19:48 +0100 Subject: [PATCH 21/28] Improve tests --- tests/sqlite3/test_sqlite3.py | 32 +++++++++++------- tests/sqlite3/test_wal.py | 62 +++++++++++------------------------ 2 files changed, 40 insertions(+), 54 deletions(-) diff --git a/tests/sqlite3/test_sqlite3.py b/tests/sqlite3/test_sqlite3.py index 14caa72..fdc0666 100644 --- a/tests/sqlite3/test_sqlite3.py +++ b/tests/sqlite3/test_sqlite3.py @@ -10,18 +10,13 @@ if TYPE_CHECKING: from pathlib import Path +@pytest.mark.parametrize( + ("db_as_path"), + [pytest.param(True, id="db_as_path"), pytest.param(False, id="db_as_fh")], +) +def test_sqlite(sqlite_db: Path, db_as_path: bool) -> None: + db = sqlite3.SQLite3(sqlite_db) if db_as_path else sqlite3.SQLite3(sqlite_db.open("rb")) -def test_sqlite_binaryio(sqlite_db: Path) -> None: - s = sqlite3.SQLite3(sqlite_db.open("rb")) - _sqlite_read_data(s) - - -def test_sqlite_path(sqlite_db: Path) -> None: - s = sqlite3.SQLite3(sqlite_db) - _sqlite_read_data(s) - - -def _sqlite_read_data(db: sqlite3.SQLite3) -> None: assert db.header.magic == sqlite3.SQLITE3_HEADER_MAGIC tables = list(db.tables()) @@ -51,6 +46,21 @@ def _sqlite_read_data(db: 
sqlite3.SQLite3) -> None: assert rows[4].id == 5 assert rows[4].name == "negative" assert rows[4].value == -11644473429 + assert rows[5].id == 6 + assert rows[5].name == "after checkpoint" + assert rows[5].value == 42 + assert rows[6].id == 8 + assert rows[6].name == "after checkpoint" + assert rows[6].value == 44 + assert rows[7].id == 9 + assert rows[7].name == "wow" + assert rows[7].value == 1234 + assert rows[8].id == 10 + assert rows[8].name == "second checkpoint" + assert rows[8].value == 100 + assert rows[9].id == 11 + assert rows[9].name == "second checkpoint" + assert rows[9].value == 101 assert len(rows) == len(list(table)) assert table.row(0).__dict__ == rows[0].__dict__ diff --git a/tests/sqlite3/test_wal.py b/tests/sqlite3/test_wal.py index b4b15b9..2fc21fe 100644 --- a/tests/sqlite3/test_wal.py +++ b/tests/sqlite3/test_wal.py @@ -2,54 +2,30 @@ from typing import TYPE_CHECKING +import pytest + from dissect.database.sqlite3 import sqlite3 if TYPE_CHECKING: from pathlib import Path - -def test_sqlite_wal_binaryio(sqlite_db: Path, sqlite_wal: Path) -> None: - s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=1) - _assert_checkpoint_1(s) - - s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=2) - _assert_checkpoint_2(s) - - s = sqlite3.SQLite3(sqlite_db.open("rb"), sqlite_wal.open("rb"), checkpoint=3) - _assert_checkpoint_3(s) - - -def test_sqlite_wal_auto_detect_binaryio(sqlite_db: Path) -> None: - s = sqlite3.SQLite3(sqlite_db.open("rb"), checkpoint=1) - _assert_checkpoint_1(s) - - s = sqlite3.SQLite3(sqlite_db.open("rb"), checkpoint=2) - _assert_checkpoint_2(s) - - s = sqlite3.SQLite3(sqlite_db.open("rb"), checkpoint=3) - _assert_checkpoint_3(s) - - -def test_sqlite_wal_path(sqlite_db: Path, sqlite_wal: Path) -> None: - s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=1) - _assert_checkpoint_1(s) - - s = sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=2) - _assert_checkpoint_2(s) - - s = 
sqlite3.SQLite3(sqlite_db, sqlite_wal, checkpoint=3) - _assert_checkpoint_3(s) - - -def test_sqlite_wal_auto_detect_path(sqlite_db: Path) -> None: - s = sqlite3.SQLite3(sqlite_db, checkpoint=1) - _assert_checkpoint_1(s) - - s = sqlite3.SQLite3(sqlite_db, checkpoint=2) - _assert_checkpoint_2(s) - - s = sqlite3.SQLite3(sqlite_db, checkpoint=3) - _assert_checkpoint_3(s) +@pytest.mark.parametrize( + ("db_as_path"), + [pytest.param(True, id="db_as_path"), pytest.param(False, id="db_as_fh")], +) +@pytest.mark.parametrize( + ("wal_as_path"), + [pytest.param(True, id="wal_as_path"), pytest.param(False, id="wal_as_fh")], +) +def test_sqlite_wal(sqlite_db: Path, sqlite_wal: Path, db_as_path: bool, wal_as_path: bool) -> None: + db = sqlite3.SQLite3(sqlite_db if db_as_path else sqlite_db.open("rb"), sqlite_wal if wal_as_path else sqlite_wal.open("rb"), checkpoint=1) + _assert_checkpoint_1(db) + + db = sqlite3.SQLite3(sqlite_db if db_as_path else sqlite_db.open("rb"), sqlite_wal if wal_as_path else sqlite_wal.open("rb"), checkpoint=2) + _assert_checkpoint_2(db) + + db = sqlite3.SQLite3(sqlite_db if db_as_path else sqlite_db.open("rb"), sqlite_wal if wal_as_path else sqlite_wal.open("rb"), checkpoint=3) + _assert_checkpoint_3(db) def _assert_checkpoint_1(s: sqlite3.SQLite3) -> None: From 3383f7591a9853c2629e8b831a8f239ab0bf74f8 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Thu, 4 Dec 2025 12:34:30 +0100 Subject: [PATCH 22/28] Fix missing latest transaction when reading from WAL --- dissect/database/sqlite3/sqlite3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index f20566e..faf0aff 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -187,8 +187,8 @@ def raw_page(self, num: int) -> bytes: # Check if the latest valid instance of the page is committed (either the frame itself # is the commit 
frame or it is included in a commit's frames). If so, return that frame's data. if self.wal: - for commit in self.wal.commits: - if (frame := commit.get(num)): + for commit in reversed(self.wal.commits): + if (frame := commit.get(num)) and frame.valid: return frame.data # Else we read the page from the database file From 438b69d70f7202b2cdca88bc934a7c736f3bb30a Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:01:21 +0100 Subject: [PATCH 23/28] Move page 1 seek --- dissect/database/sqlite3/sqlite3.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index faf0aff..ba8ba57 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -176,10 +176,6 @@ def raw_page(self, num: int) -> bytes: if (num < 1 or num > self.header.page_count) and self.header.page_count > 0: raise InvalidPageNumber("Page number exceeds boundaries") - if num == 1: # Page 1 is root - self.fh.seek(len(c_sqlite3.header)) - return self.fh.read(self.header.page_size) - # If a specific WAL checkpoint was provided, use it instead of the on-disk page. 
if self.checkpoint is not None and (frame := self.checkpoint.get(num)): return frame.data @@ -191,6 +187,10 @@ def raw_page(self, num: int) -> bytes: if (frame := commit.get(num)) and frame.valid: return frame.data + if num == 1: # Page 1 is root + self.fh.seek(len(c_sqlite3.header)) + return self.fh.read(self.header.page_size) + # Else we read the page from the database file self.fh.seek((num - 1) * self.page_size) return self.fh.read(self.header.page_size) From 5d709a016636fd0507a0feba9399f793d2a7dac5 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:40:42 +0100 Subject: [PATCH 24/28] Revert db seek to first version --- dissect/database/sqlite3/sqlite3.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dissect/database/sqlite3/sqlite3.py b/dissect/database/sqlite3/sqlite3.py index ba8ba57..25b32e6 100644 --- a/dissect/database/sqlite3/sqlite3.py +++ b/dissect/database/sqlite3/sqlite3.py @@ -187,12 +187,11 @@ def raw_page(self, num: int) -> bytes: if (frame := commit.get(num)) and frame.valid: return frame.data + # Else we read the page from the database file. 
if num == 1: # Page 1 is root self.fh.seek(len(c_sqlite3.header)) - return self.fh.read(self.header.page_size) - - # Else we read the page from the database file - self.fh.seek((num - 1) * self.page_size) + else: + self.fh.seek((num - 1) * self.page_size) return self.fh.read(self.header.page_size) def page(self, num: int) -> Page: From 07bde24b73051de8becfb01f6020b9acb44de7d0 Mon Sep 17 00:00:00 2001 From: Pim <36573021+PimSanders@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:01:54 +0100 Subject: [PATCH 25/28] Apply suggestions from code review Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- dissect/database/sqlite3/wal.py | 5 +---- tests/_tools/sqlite3/generate_sqlite.py | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index 802e730..e31284e 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -106,10 +106,7 @@ def checkpoints(self) -> list[Checkpoint]: # Keep the most recent commit for each salt1 (later commits overwrite). 
checkpoints_map[salt1] = commit - return sorted( - checkpoints_map.values(), - key=lambda c: c.frames[0].header.salt1, - ) + return [checkpoints_map[salt] for salt in sorted(checkpoints_map.keys()] class Frame: diff --git a/tests/_tools/sqlite3/generate_sqlite.py b/tests/_tools/sqlite3/generate_sqlite.py index f9df4ad..74f6bf0 100644 --- a/tests/_tools/sqlite3/generate_sqlite.py +++ b/tests/_tools/sqlite3/generate_sqlite.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import sqlite3 from pathlib import Path From 43d1909bb4ac49f14aa126e12b82af0395d1f76a Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:09:27 +0100 Subject: [PATCH 26/28] Apply suggestions from code review --- dissect/database/sqlite3/wal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index e31284e..65f5e30 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -106,7 +106,7 @@ def checkpoints(self) -> list[Checkpoint]: # Keep the most recent commit for each salt1 (later commits overwrite). 
checkpoints_map[salt1] = commit - return [checkpoints_map[salt] for salt in sorted(checkpoints_map.keys()] + return [checkpoints_map[salt] for salt in sorted(checkpoints_map.keys())] class Frame: From fe1de4930fd99d591bac35ff19ddaf35ed444225 Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:10:19 +0100 Subject: [PATCH 27/28] Write test data to correct directory --- tests/_tools/sqlite3/generate_sqlite.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/_tools/sqlite3/generate_sqlite.py b/tests/_tools/sqlite3/generate_sqlite.py index 74f6bf0..7019a00 100644 --- a/tests/_tools/sqlite3/generate_sqlite.py +++ b/tests/_tools/sqlite3/generate_sqlite.py @@ -44,12 +44,14 @@ def create_checkpoint() -> None: def move_files() -> None: - Path("db.sqlite").rename("test.sqlite") - Path("db.sqlite-wal").rename("test.sqlite-wal") - Path("db.sqlite-shm").rename("test.sqlite-shm") + destination_dir = (Path(__file__).parent / "../../_data/sqlite3/").resolve() + + Path("db.sqlite").rename(destination_dir / "test.sqlite") + Path("db.sqlite-wal").rename(destination_dir / "test.sqlite-wal") + Path("db.sqlite-shm").rename(destination_dir / "test.sqlite-shm") # Remove this line if the shm file is needed as well - Path("test.sqlite-shm").unlink() + Path(destination_dir / "test.sqlite-shm").unlink() if __name__ == "__main__": From e12b54bf208e0f1b06db48ba019fd70463e9109f Mon Sep 17 00:00:00 2001 From: Pim Sanders <36573021+PimSanders@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:13:42 +0100 Subject: [PATCH 28/28] Make linter happy once more --- dissect/database/sqlite3/wal.py | 1 + tests/_tools/sqlite3/__init__.py | 0 tests/sqlite3/test_sqlite3.py | 1 + tests/sqlite3/test_wal.py | 19 ++++++++++++++++--- 4 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 tests/_tools/sqlite3/__init__.py diff --git a/dissect/database/sqlite3/wal.py b/dissect/database/sqlite3/wal.py index 
65f5e30..309f156 100644 --- a/dissect/database/sqlite3/wal.py +++ b/dissect/database/sqlite3/wal.py @@ -145,6 +145,7 @@ def page_count(self) -> int: class _FrameCollection: """Convenience class to keep track of a collection of frames that were committed together.""" + def __init__(self, wal: WAL, frames: list[Frame]): self.wal = wal self.frames = frames diff --git a/tests/_tools/sqlite3/__init__.py b/tests/_tools/sqlite3/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/sqlite3/test_sqlite3.py b/tests/sqlite3/test_sqlite3.py index fdc0666..a0ce473 100644 --- a/tests/sqlite3/test_sqlite3.py +++ b/tests/sqlite3/test_sqlite3.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: from pathlib import Path + @pytest.mark.parametrize( ("db_as_path"), [pytest.param(True, id="db_as_path"), pytest.param(False, id="db_as_fh")], diff --git a/tests/sqlite3/test_wal.py b/tests/sqlite3/test_wal.py index 2fc21fe..cc01925 100644 --- a/tests/sqlite3/test_wal.py +++ b/tests/sqlite3/test_wal.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from pathlib import Path + @pytest.mark.parametrize( ("db_as_path"), [pytest.param(True, id="db_as_path"), pytest.param(False, id="db_as_fh")], @@ -18,13 +19,25 @@ [pytest.param(True, id="wal_as_path"), pytest.param(False, id="wal_as_fh")], ) def test_sqlite_wal(sqlite_db: Path, sqlite_wal: Path, db_as_path: bool, wal_as_path: bool) -> None: - db = sqlite3.SQLite3(sqlite_db if db_as_path else sqlite_db.open("rb"), sqlite_wal if wal_as_path else sqlite_wal.open("rb"), checkpoint=1) + db = sqlite3.SQLite3( + sqlite_db if db_as_path else sqlite_db.open("rb"), + sqlite_wal if wal_as_path else sqlite_wal.open("rb"), + checkpoint=1, + ) _assert_checkpoint_1(db) - db = sqlite3.SQLite3(sqlite_db if db_as_path else sqlite_db.open("rb"), sqlite_wal if wal_as_path else sqlite_wal.open("rb"), checkpoint=2) + db = sqlite3.SQLite3( + sqlite_db if db_as_path else sqlite_db.open("rb"), + sqlite_wal if wal_as_path else sqlite_wal.open("rb"), + checkpoint=2, + ) 
_assert_checkpoint_2(db) - db = sqlite3.SQLite3(sqlite_db if db_as_path else sqlite_db.open("rb"), sqlite_wal if wal_as_path else sqlite_wal.open("rb"), checkpoint=3) + db = sqlite3.SQLite3( + sqlite_db if db_as_path else sqlite_db.open("rb"), + sqlite_wal if wal_as_path else sqlite_wal.open("rb"), + checkpoint=3, + ) _assert_checkpoint_3(db)