diff --git a/dissect/target/filesystem.py b/dissect/target/filesystem.py index bbdeb37157..f616972f3e 100644 --- a/dissect/target/filesystem.py +++ b/dissect/target/filesystem.py @@ -1087,7 +1087,7 @@ def get(self, path: str) -> FilesystemEntry: def scandir(self) -> Iterator[DirEntry]: raise NotADirectoryError(f"'{self.path}' is not a directory") - def open(self) -> BinaryIO: + def open(self, *args, **kwargs) -> BinaryIO: return VirtualFileHandle(self.entry) def stat(self, follow_symlinks: bool = True) -> fsutil.stat_result: @@ -1147,7 +1147,7 @@ def __init__(self, fs: Filesystem, path: str, entry: str, algo: str = "gzip"): if self._compressor is None: raise ValueError(f"Unsupported compression algorithm {algo}") - def open(self) -> BinaryIO: + def open(self, *args, **kwargs) -> BinaryIO: return self._compressor.open(self.entry, "rb") diff --git a/dissect/target/loader.py b/dissect/target/loader.py index c57cd1bdb4..e23db1e2d0 100644 --- a/dissect/target/loader.py +++ b/dissect/target/loader.py @@ -153,6 +153,42 @@ def map(self, target: Target) -> None: raise NotImplementedError +class MiddlewareLoader(Loader): + """A base class for preparing arbitrary data to be used by other :class:`Loader`s. + + Instead of mapping data directly to a :class:`Target <dissect.target.target.Target>`, loaders of this type + prepare data in some way and make it available for other :class:`Loader`s to use. + + Subclasses should implement the :meth:`detect` method like any other loader, and return a path to the prepared + data in the :meth:`prepare` method. The loading mechanism will then use that path to find other loaders to map + the prepared data into the target. + + Feels like forever since I've heard the term "middleware", I'm bringing it back baby! 
+ """ + + def __init__(self, path: Path, *, fallbacks: list[type[Loader]] | None = None, **kwargs): + super().__init__(path, **kwargs) + # This will be the loader that successfully mapped the prepared path + self.loader = None + + @staticmethod + def detect(path: Path) -> bool: + raise NotImplementedError + + def prepare(self, target: Target) -> Path: + raise NotImplementedError + + def map(self, target: Target) -> None: + path = self.prepare(target) + + if (loader := find_loader(path, fallbacks=[DirLoader, RawLoader])) is not None: + ldr = loader(path) + ldr.map(target) + + # Store a reference to the loader if we successfully mapped + self.loader = ldr + + def register(module_name: str, class_name: str, internal: bool = True) -> None: """Registers a ``Loader`` class inside ``LOADERS``. @@ -290,4 +326,5 @@ def open(path: str | Path, *, fallbacks: list[type[Loader]] | None = None, **kwa register("log", "LogLoader") register("remote", "RemoteLoader") register("mqtt", "MqttLoader") +register("compression", "CompressionLoader") register("multiraw", "MultiRawLoader") # Should be last diff --git a/dissect/target/loaders/compression.py b/dissect/target/loaders/compression.py new file mode 100644 index 0000000000..5bb397b5eb --- /dev/null +++ b/dissect/target/loaders/compression.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from dissect.target.filesystem import VirtualFilesystem +from dissect.target.helpers import fsutil +from dissect.target.helpers.logging import get_logger +from dissect.target.loader import MiddlewareLoader + +if TYPE_CHECKING: + from pathlib import Path + + from dissect.target import target + +log = get_logger(__name__) + +COMPRESSION_EXT = (".gz", ".lzma", ".bz2", ".zst") + + +class CompressionLoader(MiddlewareLoader): + """Allow loading compressed files. + This does impact performance, so it's recommended to uncompress the file before passing it to Dissect. 
+ """ + + def __init__(self, path: Path, **kwargs): + super().__init__(path, **kwargs) + + log.warning( + "file %r is compressed, which will affect performance. " + "Consider uncompressing the archive before passing the file to Dissect.", + path, + ) + + @staticmethod + def detect(path: Path) -> bool: + return path.name.lower().endswith(COMPRESSION_EXT) or is_compressed_magic(path) + + def prepare(self, target: target.Target) -> Path: + filename = self.path.stem if self.path.suffix.lower() in COMPRESSION_EXT else self.path.name + vfs = VirtualFilesystem() + vfs.map_file_fh(filename, fsutil.open_decompress(self.path)) + + return vfs.path(filename) + + +def is_compressed_magic(path: Path) -> bool: + """Check if this is a compressed file based on the magic. + Based on the magic check from fsutil.open_decompress. + """ + if not path.is_file(): + return False + with path.open("rb") as file: + magic = file.read(5) + + # Gzip + if magic[:2] == b"\x1f\x8b": + return True + + # XZ (the legacy LZMA-alone header is not matched here) + if magic[:5] == b"\xfd7zXZ": + return True + + # BZ2 + if len(magic) > 3 and magic[:3] == b"BZh" and 0x31 <= magic[3] <= 0x39: + return True + + # ZSTD + return magic[:4] in [b"\xfd\x2f\xb5\x28", b"\x28\xb5\x2f\xfd"] diff --git a/dissect/target/loaders/tar.py b/dissect/target/loaders/tar.py index 3791bb0222..dd73cf704d 100644 --- a/dissect/target/loaders/tar.py +++ b/dissect/target/loaders/tar.py @@ -20,43 +20,9 @@ from dissect.target import target - log = get_logger(__name__) -TAR_EXT_COMP = ( - ".tar.gz", - ".tar.xz", - ".tar.bz", - ".tar.bz2", - ".tar.lzma", - ".tar.lz", - ".tgz", - ".txz", - ".tbz", - ".tbz2", - ".tlz", - ".tlzma", -) TAR_EXT = (".tar",) - -TAR_MAGIC_COMP = ( - # gzip - b"\x1f\x8b", - # bzip2 - b"\x42\x5a\x68", - # xz - b"\xfd\x37\x7a\x58\x5a\x00", - # lzma - b"\x5d\x00\x00\x01\x00", - b"\x5d\x00\x00\x10\x00", - b"\x5d\x00\x00\x08\x00", - b"\x5d\x00\x00\x10\x00", - b"\x5d\x00\x00\x20\x00", - b"\x5d\x00\x00\x40\x00", - b"\x5d\x00\x00\x80\x00", - b"\x5d\x00\x00\x00\x01", - b"\x5d\x00\x00\x00\x02", -) TAR_MAGIC = (tf.GNU_MAGIC, tf.POSIX_MAGIC) WINDOWS_MEMBERS = ( @@ 
-146,20 +112,13 @@ class TarLoader(Loader): def __init__(self, path: Path, **kwargs): super().__init__(path, **kwargs) - if is_compressed(path): - log.warning( - "Tar file %r is compressed, which will affect performance. " - "Consider uncompressing the archive before passing the tar file to Dissect.", - path, - ) - self.fh = path.open("rb") self.tar = tf.open(mode="r:*", fileobj=self.fh) # noqa: SIM115 self.subloader = None @staticmethod def detect(path: Path) -> bool: - return path.name.lower().endswith(TAR_EXT + TAR_EXT_COMP) or is_tar_magic(path, TAR_MAGIC + TAR_MAGIC_COMP) + return path.name.lower().endswith(TAR_EXT) or is_tar_magic(path, TAR_MAGIC) def map(self, target: target.Target) -> None: for candidate in self.__subloaders__: @@ -192,7 +151,3 @@ def is_tar_magic(path: Path, magics: Iterable[bytes]) -> bool: continue return True return False - - -def is_compressed(path: Path) -> bool: - return path.name.lower().endswith(TAR_EXT_COMP) or is_tar_magic(path, TAR_MAGIC_COMP) diff --git a/dissect/target/loaders/vbk.py b/dissect/target/loaders/vbk.py index 13d6f6c203..0e911bf537 100644 --- a/dissect/target/loaders/vbk.py +++ b/dissect/target/loaders/vbk.py @@ -7,8 +7,7 @@ from dissect.target.exceptions import LoaderError from dissect.target.filesystem import VirtualFilesystem from dissect.target.filesystems.vbk import VbkFilesystem -from dissect.target.loader import Loader, find_loader -from dissect.target.loaders.raw import RawLoader +from dissect.target.loader import MiddlewareLoader if TYPE_CHECKING: from pathlib import Path @@ -19,7 +18,7 @@ RE_RAW_DISK = re.compile(r"(?:[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})|(?:DEV__.+)") -class VbkLoader(Loader): +class VbkLoader(MiddlewareLoader): """Load Veaam Backup (VBK) files. 
References: @@ -35,7 +34,7 @@ def __init__(self, path: Path, **kwargs): def detect(path: Path) -> bool: return path.suffix.lower() == ".vbk" - def map(self, target: Target) -> None: + def prepare(self, target: Target) -> Path: # We haven't really researched any of the VBK metadata yet, so just try some common formats root = self.vbkfs.path("/") if (base := next(root.glob("*"), None)) is None: @@ -51,24 +50,19 @@ def map(self, target: Target) -> None: candidates.append(root.joinpath("+".join(map(str, disks)))) - # Try to find a loader - for candidate in candidates: - if candidate.suffix.lower() == ".vmcx": - # For VMCX files we need to massage the file layout a bit - vfs = VirtualFilesystem() - vfs.map_file_entry(candidate.name, candidate) + # We need exactly one candidate at this point (an empty list would raise IndexError below) + if len(candidates) != 1: + raise LoaderError("Unsupported VBK structure, use `-L raw` to manually inspect the VBK") - for entry in chain(base.glob("Ide*/*"), base.glob("Scsi*/*")): - vfs.map_file_entry(entry.name, entry) + candidate = candidates[0] + if candidate.suffix.lower() == ".vmcx": + # For VMCX files we need to massage the file layout a bit + vfs = VirtualFilesystem() + vfs.map_file_entry(candidate.name, candidate) - candidate = vfs.path(candidate.name) + for entry in chain(base.glob("Ide*/*"), base.glob("Scsi*/*")): + vfs.map_file_entry(entry.name, entry) - if (loader := find_loader(candidate, fallbacks=[RawLoader])) is not None: - ldr = loader(candidate) - ldr.map(target) + candidate = vfs.path(candidate.name) - # Store a reference to the loader if we successfully mapped - self.loader = ldr - break - else: - raise LoaderError("Unsupported VBK structure, use `-L raw` to manually inspect the VBK") + return candidate diff --git a/tests/loaders/test_compression.py b/tests/loaders/test_compression.py new file mode 100644 index 0000000000..49b8fa6def --- /dev/null +++ b/tests/loaders/test_compression.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from typing 
import TYPE_CHECKING + +import pytest + +from dissect.target.loaders.compression import CompressionLoader +from dissect.target.target import Target +from tests._utils import absolute_path + +if TYPE_CHECKING: + from pytest_benchmark.fixture import BenchmarkFixture + + +@pytest.mark.parametrize( + "archive", + [ + "_data/loaders/tar/test-archive.tar.gz", + ], +) +@pytest.mark.benchmark +def test_benchmark(benchmark: BenchmarkFixture, archive: str) -> None: + """Benchmark the loading of archives.""" + file = absolute_path(archive) + + benchmark(lambda: CompressionLoader(file).map(Target())) diff --git a/tests/loaders/test_tar.py b/tests/loaders/test_tar.py index d008d3c772..4a3947755d 100644 --- a/tests/loaders/test_tar.py +++ b/tests/loaders/test_tar.py @@ -8,6 +8,7 @@ import pytest from dissect.target.loader import open as loader_open +from dissect.target.loaders.compression import CompressionLoader from dissect.target.loaders.tar import GenericTarSubLoader, TarLoader from dissect.target.plugins.os.windows._os import WindowsPlugin from dissect.target.target import Target @@ -18,6 +19,8 @@ from collections.abc import Callable from pathlib import Path + from pytest_benchmark.fixture import BenchmarkFixture + @pytest.mark.parametrize( ("opener"), @@ -41,12 +44,13 @@ def test_compressed_tar_file(caplog: pytest.LogCaptureFixture) -> None: with caplog.at_level(logging.WARNING): loader = loader_open(path) - assert isinstance(loader, TarLoader) + assert isinstance(loader, CompressionLoader) assert "is compressed" in caplog.text t = Target() loader.map(t) - assert isinstance(loader.subloader, GenericTarSubLoader) + assert isinstance(loader.loader, TarLoader) + assert isinstance(loader.loader.subloader, GenericTarSubLoader) assert len(t.filesystems) == 1 @@ -61,11 +65,12 @@ def test_compressed_tar_file_with_empty_dir() -> None: path = absolute_path("_data/loaders/tar/test-archive-empty-folder.tgz") loader = loader_open(path) - assert isinstance(loader, TarLoader) + assert 
isinstance(loader, CompressionLoader) t = Target() loader.map(t) - assert isinstance(loader.subloader, GenericTarSubLoader) + assert isinstance(loader.loader, TarLoader) + assert isinstance(loader.loader.subloader, GenericTarSubLoader) assert len(t.filesystems) == 1 @@ -87,11 +92,12 @@ def test_case_sensitivity_windows(tmp_path: Path) -> None: _mkdir(tf, "Windows/System32") loader = loader_open(path) - assert isinstance(loader, TarLoader) + assert isinstance(loader, CompressionLoader) t = Target() loader.map(t) - assert isinstance(loader.subloader, GenericTarSubLoader) + assert isinstance(loader.loader, TarLoader) + assert isinstance(loader.loader.subloader, GenericTarSubLoader) # Make sure the case sensitiveness is changed to False and make sure we detect the target as Windows. assert not t.filesystems[0].case_sensitive @@ -107,11 +113,12 @@ def test_case_sensitivity_linux(tmp_path: Path) -> None: _mkdir(tf, "opt") loader = loader_open(path) - assert isinstance(loader, TarLoader) + assert isinstance(loader, CompressionLoader) t = Target() loader.map(t) - assert isinstance(loader.subloader, GenericTarSubLoader) + assert isinstance(loader.loader, TarLoader) + assert isinstance(loader.loader.subloader, GenericTarSubLoader) assert t.filesystems[0].case_sensitive @@ -121,17 +128,6 @@ def test_case_sensitivity_linux(tmp_path: Path) -> None: [ # regular tar file (True, "file.tar", ""), - # gzip tar file - (True, "file.tar.gz", ""), - (True, "file.tgz", ""), - # bzip2 tar file - (True, "file.tar.bz2", ""), - (True, "file.tar.bz", ""), - (True, "file.tbz", ""), - (True, "file.tbz2", ""), - # xz tar file - (True, "file.tar.xz", ""), - (True, "file.txz", ""), # some things it should not detect (False, "file", "00010203"), (False, "file.zip", "504b0304"), @@ -150,10 +146,6 @@ def test_detect_extension(should_detect: bool, filename: str, buffer: str, tmp_p "file", [ "small.tar", - "small.tar.bz2", - "small.tar.gz", - "small.tar.lz", - "small.tar.xz", ], ) def 
test_detect_buffer(file: str, tmp_path: Path) -> None: @@ -168,3 +160,17 @@ def test_detect_buffer(file: str, tmp_path: Path) -> None: tmp_tar.write_bytes(small_file.read_bytes()) assert TarLoader.detect(tmp_tar) + + +@pytest.mark.parametrize( + "archive", + [ + "_data/loaders/tar/test-archive.tar", + ], +) +@pytest.mark.benchmark +def test_benchmark(benchmark: BenchmarkFixture, archive: str) -> None: + """Benchmark the loading of archives.""" + file = absolute_path(archive) + + benchmark(lambda: TarLoader(file).map(Target()))