diff --git a/dissect/hypervisor/disk/c_qcow2.py b/dissect/hypervisor/disk/c_qcow2.py index a98664c..d218e74 100644 --- a/dissect/hypervisor/disk/c_qcow2.py +++ b/dissect/hypervisor/disk/c_qcow2.py @@ -171,9 +171,6 @@ ) -def ctz(value: int, size: int = 32) -> int: +def ctz(value: int) -> int: - """Count the number of zero bits in an integer of a given size.""" + """Count the number of trailing zero bits in ``value`` (0 if ``value`` is 0).""" - for i in range(size): - if value & (1 << i): - return i - return 0 + return (value & -value).bit_length() - 1 if value else 0 diff --git a/dissect/hypervisor/disk/c_qcow2.pyi b/dissect/hypervisor/disk/c_qcow2.pyi new file mode 100644 index 0000000..51db133 --- /dev/null +++ b/dissect/hypervisor/disk/c_qcow2.pyi @@ -0,0 +1,190 @@ +# Generated by cstruct-stubgen +from typing import BinaryIO, Literal, overload + +import dissect.cstruct as __cs__ +from typing_extensions import TypeAlias + +class _c_qcow2(__cs__.cstruct): + MIN_CLUSTER_BITS: Literal[9] = ... + MAX_CLUSTER_BITS: Literal[21] = ... + QCOW2_COMPRESSED_SECTOR_SIZE: Literal[512] = ... + QCOW2_COMPRESSION_TYPE_ZLIB: Literal[0] = ... + QCOW2_COMPRESSION_TYPE_ZSTD: Literal[1] = ... + L1E_SIZE: Literal[8] = ... + L2E_SIZE_NORMAL: Literal[8] = ... + L2E_SIZE_EXTENDED: Literal[16] = ... + L1E_OFFSET_MASK: Literal[72057594037927424] = ... + L2E_OFFSET_MASK: Literal[72057594037927424] = ... + L2E_COMPRESSED_OFFSET_SIZE_MASK: Literal[4611686018427387903] = ... + QCOW_OFLAG_COPIED: Literal[9223372036854775808] = ... + QCOW_OFLAG_COMPRESSED: Literal[4611686018427387904] = ... + QCOW_OFLAG_ZERO: Literal[1] = ... + QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER: Literal[32] = ... + QCOW2_INCOMPAT_DIRTY_BITNR: Literal[0] = ... + QCOW2_INCOMPAT_CORRUPT_BITNR: Literal[1] = ... + QCOW2_INCOMPAT_DATA_FILE_BITNR: Literal[2] = ... + QCOW2_INCOMPAT_COMPRESSION_BITNR: Literal[3] = ... + QCOW2_INCOMPAT_EXTL2_BITNR: Literal[4] = ... + QCOW2_INCOMPAT_DIRTY: Literal[1] = ... + QCOW2_INCOMPAT_CORRUPT: Literal[2] = ... + QCOW2_INCOMPAT_DATA_FILE: Literal[4] = ... 
+ QCOW2_INCOMPAT_COMPRESSION: Literal[8] = ... + QCOW2_INCOMPAT_EXTL2: Literal[16] = ... + QCOW2_EXT_MAGIC_END: Literal[0] = ... + QCOW2_EXT_MAGIC_BACKING_FORMAT: Literal[3799591626] = ... + QCOW2_EXT_MAGIC_FEATURE_TABLE: Literal[1745090647] = ... + QCOW2_EXT_MAGIC_CRYPTO_HEADER: Literal[87539319] = ... + QCOW2_EXT_MAGIC_BITMAPS: Literal[595929205] = ... + QCOW2_EXT_MAGIC_DATA_FILE: Literal[1145132097] = ... + class QCowHeader(__cs__.Structure): + magic: _c_qcow2.uint32 + version: _c_qcow2.uint32 + backing_file_offset: _c_qcow2.uint64 + backing_file_size: _c_qcow2.uint32 + cluster_bits: _c_qcow2.uint32 + size: _c_qcow2.uint64 + crypt_method: _c_qcow2.uint32 + l1_size: _c_qcow2.uint32 + l1_table_offset: _c_qcow2.uint64 + refcount_table_offset: _c_qcow2.uint64 + refcount_table_clusters: _c_qcow2.uint32 + nb_snapshots: _c_qcow2.uint32 + snapshots_offset: _c_qcow2.uint64 + incompatible_features: _c_qcow2.uint64 + compatible_features: _c_qcow2.uint64 + autoclear_features: _c_qcow2.uint64 + refcount_order: _c_qcow2.uint32 + header_length: _c_qcow2.uint32 + compression_type: _c_qcow2.uint8 + padding: __cs__.Array[_c_qcow2.uint8] + @overload + def __init__( + self, + magic: _c_qcow2.uint32 | None = ..., + version: _c_qcow2.uint32 | None = ..., + backing_file_offset: _c_qcow2.uint64 | None = ..., + backing_file_size: _c_qcow2.uint32 | None = ..., + cluster_bits: _c_qcow2.uint32 | None = ..., + size: _c_qcow2.uint64 | None = ..., + crypt_method: _c_qcow2.uint32 | None = ..., + l1_size: _c_qcow2.uint32 | None = ..., + l1_table_offset: _c_qcow2.uint64 | None = ..., + refcount_table_offset: _c_qcow2.uint64 | None = ..., + refcount_table_clusters: _c_qcow2.uint32 | None = ..., + nb_snapshots: _c_qcow2.uint32 | None = ..., + snapshots_offset: _c_qcow2.uint64 | None = ..., + incompatible_features: _c_qcow2.uint64 | None = ..., + compatible_features: _c_qcow2.uint64 | None = ..., + autoclear_features: _c_qcow2.uint64 | None = ..., + refcount_order: _c_qcow2.uint32 | None = ..., + 
header_length: _c_qcow2.uint32 | None = ..., + compression_type: _c_qcow2.uint8 | None = ..., + padding: __cs__.Array[_c_qcow2.uint8] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class QCowExtension(__cs__.Structure): + magic: _c_qcow2.uint32 + len: _c_qcow2.uint32 + @overload + def __init__(self, magic: _c_qcow2.uint32 | None = ..., len: _c_qcow2.uint32 | None = ...): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class Qcow2CryptoHeaderExtension(__cs__.Structure): + offset: _c_qcow2.uint64 + length: _c_qcow2.uint64 + @overload + def __init__(self, offset: _c_qcow2.uint64 | None = ..., length: _c_qcow2.uint64 | None = ...): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class Qcow2BitmapHeaderExt(__cs__.Structure): + nb_bitmaps: _c_qcow2.uint32 + reserved32: _c_qcow2.uint32 + bitmap_directory_size: _c_qcow2.uint64 + bitmap_directory_offset: _c_qcow2.uint64 + @overload + def __init__( + self, + nb_bitmaps: _c_qcow2.uint32 | None = ..., + reserved32: _c_qcow2.uint32 | None = ..., + bitmap_directory_size: _c_qcow2.uint64 | None = ..., + bitmap_directory_offset: _c_qcow2.uint64 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
+ + class QCowSnapshotHeader(__cs__.Structure): + l1_table_offset: _c_qcow2.uint64 + l1_size: _c_qcow2.uint32 + id_str_size: _c_qcow2.uint16 + name_size: _c_qcow2.uint16 + date_sec: _c_qcow2.uint32 + date_nsec: _c_qcow2.uint32 + vm_clock_nsec: _c_qcow2.uint64 + vm_state_size: _c_qcow2.uint32 + extra_data_size: _c_qcow2.uint32 + @overload + def __init__( + self, + l1_table_offset: _c_qcow2.uint64 | None = ..., + l1_size: _c_qcow2.uint32 | None = ..., + id_str_size: _c_qcow2.uint16 | None = ..., + name_size: _c_qcow2.uint16 | None = ..., + date_sec: _c_qcow2.uint32 | None = ..., + date_nsec: _c_qcow2.uint32 | None = ..., + vm_clock_nsec: _c_qcow2.uint64 | None = ..., + vm_state_size: _c_qcow2.uint32 | None = ..., + extra_data_size: _c_qcow2.uint32 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class QCowSnapshotExtraData(__cs__.Structure): + vm_state_size_large: _c_qcow2.uint64 + disk_size: _c_qcow2.uint64 + icount: _c_qcow2.uint64 + @overload + def __init__( + self, + vm_state_size_large: _c_qcow2.uint64 | None = ..., + disk_size: _c_qcow2.uint64 | None = ..., + icount: _c_qcow2.uint64 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class QCow2ClusterType(__cs__.Enum): + QCOW2_CLUSTER_UNALLOCATED = ... + QCOW2_CLUSTER_ZERO_PLAIN = ... + QCOW2_CLUSTER_ZERO_ALLOC = ... + QCOW2_CLUSTER_NORMAL = ... + QCOW2_CLUSTER_COMPRESSED = ... + + class QCow2SubclusterType(__cs__.Enum): + QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN = ... + QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC = ... + QCOW2_SUBCLUSTER_ZERO_PLAIN = ... + QCOW2_SUBCLUSTER_ZERO_ALLOC = ... + QCOW2_SUBCLUSTER_NORMAL = ... + QCOW2_SUBCLUSTER_COMPRESSED = ... + QCOW2_SUBCLUSTER_INVALID = ... 
+ +# Technically `c_qcow2` is an instance of `_c_qcow2`, but then we can't use it in type hints +c_qcow2: TypeAlias = _c_qcow2 + +QCOW2_MAGIC: int +QCOW2_MAGIC_BYTES: bytes + +QCOW2_INCOMPAT_MASK: int + +QCow2ClusterType: TypeAlias = _c_qcow2.QCow2ClusterType +QCow2SubclusterType: TypeAlias = _c_qcow2.QCow2SubclusterType + +NORMAL_SUBCLUSTER_TYPES: tuple[QCow2SubclusterType, ...] +ZERO_SUBCLUSTER_TYPES: tuple[QCow2SubclusterType, ...] +UNALLOCATED_SUBCLUSTER_TYPES: tuple[QCow2SubclusterType, ...] + +def ctz(value: int) -> int: ... diff --git a/dissect/hypervisor/disk/qcow2.py b/dissect/hypervisor/disk/qcow2.py index 3fceed5..e869f71 100644 --- a/dissect/hypervisor/disk/qcow2.py +++ b/dissect/hypervisor/disk/qcow2.py @@ -3,7 +3,6 @@ # - https://github.com/qemu/qemu/blob/master/docs/interop/qcow2.txt from __future__ import annotations -import copy import zlib from functools import cached_property, lru_cache from io import BytesIO @@ -15,6 +14,7 @@ from dissect.hypervisor.disk.c_qcow2 import ( NORMAL_SUBCLUSTER_TYPES, QCOW2_MAGIC, + QCOW2_MAGIC_BYTES, UNALLOCATED_SUBCLUSTER_TYPES, ZERO_SUBCLUSTER_TYPES, QCow2ClusterType, @@ -35,28 +35,42 @@ HAS_ZSTD = False -ALLOW_NO_BACKING_FILE = 1 - - -class QCow2(AlignedStream): +class QCow2: """QCOW2 virtual disk implementation. - Supports both data-file and backing-file, but must be manually given as arguments. + If a data-file is required and ``fh`` is not a ``Path``, it's required to manually pass a file like object + in the `data_file` argument. Otherwise, the data file will be automatically opened if it exists in the same directory. + It's possible to defer opening the data file by passing ``allow_no_data_file=True``. - If a data-file is required, it's required to manually pass a file like object - as the `data_file` argument. + The same applies to the backing-file. This too can be deferred by passing ``allow_no_backing_file=True``. 
- A backing-file can optionally be skipped if `qcow2.ALLOW_NO_BACKING_FILE` is passed - as the `backing_file` argument. In this case, any reads from a backing file will result - in all null bytes being read. - """ + Args: + fh: File handle or path to the QCOW2 file. + data_file: Optional file handle for the data file. If not provided and ``fh`` is a ``Path``, it will try to open it automatically. + backing_file: Optional file handle for the backing file. If not provided and ``fh`` is a ``Path``, it will try to open it automatically. + allow_no_data_file: If True, allows the QCOW2 file to be opened without a data file. + allow_no_backing_file: If True, allows the QCOW2 file to be opened without a backing file. + """ # noqa: E501 def __init__( - self, fh: BinaryIO | Path, data_file: BinaryIO | None = None, backing_file: BinaryIO | int | None = None + self, + fh: BinaryIO | Path, + data_file: BinaryIO | None = None, + backing_file: BinaryIO | None = None, + *, + allow_no_data_file: bool = False, + allow_no_backing_file: bool = False, ): - self.fh = fh.open("rb") if isinstance(fh, Path) else fh + if isinstance(fh, Path): + self.path = fh + self.fh = self.path.open("rb") + else: + self.path = None + self.fh = fh - self.header = c_qcow2.QCowHeader(fh) + self.fh.seek(0) + + self.header = c_qcow2.QCowHeader(self.fh) if self.header.magic != QCOW2_MAGIC: raise InvalidHeaderError("Invalid qcow2 header magic") @@ -66,14 +80,16 @@ def __init__( if self.header.cluster_bits < c_qcow2.MIN_CLUSTER_BITS or self.header.cluster_bits > c_qcow2.MAX_CLUSTER_BITS: raise InvalidHeaderError(f"Unsupported cluster size: 2**{self.header.cluster_bits}") + self.size = self.header.size + self.cluster_bits = self.header.cluster_bits self.cluster_size = 1 << self.cluster_bits self.subclusters_per_cluster = c_qcow2.QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER if self.has_subclusters else 1 self.subcluster_size = self.cluster_size // self.subclusters_per_cluster - self.subcluster_bits = 
ctz(self.subcluster_size, 32) + self.subcluster_bits = ctz(self.subcluster_size) self._l2_entry_size = c_qcow2.L2E_SIZE_EXTENDED if self.has_subclusters else c_qcow2.L2E_SIZE_NORMAL - self.l2_bits = self.cluster_bits - ctz(self._l2_entry_size, 32) + self.l2_bits = self.cluster_bits - ctz(self._l2_entry_size) self.l2_size = 1 << self.l2_bits # 104 = byte offset of compression_type @@ -103,10 +119,8 @@ def __init__( self.unknown_extensions = [] self._read_extensions() - if self.header.incompatible_features & c_qcow2.QCOW2_INCOMPAT_DATA_FILE: - if data_file is None: - raise Error(f"data-file required but not provided (image_data_file = {self.image_data_file})") - self.data_file = data_file + if self.needs_data_file: + self.data_file = self._open_data_file(data_file, allow_no_data_file) else: self.data_file = self.fh @@ -118,24 +132,10 @@ def __init__( self.auto_backing_file = self.fh.read(self.header.backing_file_size).decode() self.image_backing_file = self.auto_backing_file.upper() - if backing_file is None: - if not isinstance(fh, Path): - raise Error( - f"backing-file required but not provided (auto_backing_file = {self.auto_backing_file})" - ) - if not (candidate_path := fh.parent.joinpath(self.auto_backing_file)).exists(): - raise Error( - f"backing-file '{candidate_path}' not found (auto_backing_file = '{self.auto_backing_file}')" - ) - backing_file = candidate_path.open("rb") - - if backing_file != ALLOW_NO_BACKING_FILE: - self.backing_file = backing_file + self.backing_file = self._open_backing_file(backing_file, allow_no_backing_file) self.l2_table = lru_cache(128)(self.l2_table) - super().__init__(self.header.size) - def _read_extensions(self) -> None: start_offset = self.header.header_length end_offset = self.header.backing_file_offset or 1 << self.cluster_bits @@ -169,14 +169,54 @@ def _read_extensions(self) -> None: # Align to nearest 8 byte boundary offset += (ext.len + 7) & 0xFFFFFFF8 + def _open_data_file(self, data_file: BinaryIO | None, 
allow_no_data_file: bool = False) -> BinaryIO | None: + if data_file is not None: + return data_file + + if self.path: + if (data_file_path := self.path.with_name(self.image_data_file)).exists(): + return data_file_path.open("rb") + + if not allow_no_data_file: + raise Error(f"data-file {str(data_file_path)!r} not found (image_data_file = {self.image_data_file!r})") + elif allow_no_data_file: + return None + + raise Error(f"data-file required but not provided (image_data_file = {self.image_data_file!r})") + + def _open_backing_file(self, backing_file: BinaryIO | None, allow_no_backing_file: bool = False) -> BinaryIO | None: + backing_file_path = None + if backing_file is None: + if self.path: + if (backing_file_path := self.path.with_name(self.auto_backing_file)).exists(): + backing_file = backing_file_path.open("rb") + elif not allow_no_backing_file: + raise Error( + f"backing-file {str(backing_file_path)!r} not found (auto_backing_file = {self.auto_backing_file!r})" # noqa: E501 + ) + elif not allow_no_backing_file: + raise Error(f"backing-file required but not provided (auto_backing_file = {self.auto_backing_file!r})") + + if backing_file: + if backing_file.read(4) == QCOW2_MAGIC_BYTES: + if backing_file_path: + backing_file.close() + backing_file = QCow2(backing_file_path).open() + else: + backing_file = QCow2(backing_file).open() + else: + backing_file.seek(0) + + return backing_file + @cached_property def snapshots(self) -> list[QCow2Snapshot]: - snapshots = [] + snapshots: list[QCow2Snapshot] = [] offset = self.header.snapshots_offset for _ in range(self.header.nb_snapshots): snapshots.append(QCow2Snapshot(self, offset)) - offset += snapshots[-1].entry_size + offset += (snapshots[-1].entry_size + 7) & ~7 # Round up to 8 bytes return snapshots @@ -193,65 +233,72 @@ def l2_table(self, l2_offset: int) -> L2Table: def has_backing_file(self) -> bool: return self.backing_file is not None + @property + def needs_backing_file(self) -> bool: + return 
self.header.backing_file_offset != 0 + @property def has_data_file(self) -> bool: - return self.data_file != self.fh + return self.data_file is not None and self.data_file != self.fh + + @property + def needs_data_file(self) -> bool: + return bool(self.header.incompatible_features & c_qcow2.QCOW2_INCOMPAT_DATA_FILE) @property def has_subclusters(self) -> bool: return bool(self.header.incompatible_features & c_qcow2.QCOW2_INCOMPAT_EXTL2) - def _read(self, offset: int, length: int) -> bytes: - result = [] + def open(self) -> QCow2Stream: + """Open the QCow2 file for reading.""" + if self.needs_data_file and not self.has_data_file: + raise Error(f"data-file required but not provided (image_data_file = {self.image_data_file!r})") + if self.needs_backing_file and not self.has_backing_file: + raise Error(f"backing-file required but not provided (auto_backing_file = {self.auto_backing_file!r})") + return QCow2Stream(self) - for sc_type, read_offset, run_offset, run_length in self._yield_runs(offset, length): - unalloc_zeroed = sc_type in UNALLOCATED_SUBCLUSTER_TYPES and not self.has_backing_file - if sc_type in ZERO_SUBCLUSTER_TYPES or unalloc_zeroed: - result.append(b"\x00" * run_length) - elif sc_type in UNALLOCATED_SUBCLUSTER_TYPES and self.has_backing_file: - self.backing_file.seek(read_offset) - result.append(self.backing_file.read(run_length)) - elif sc_type == QCow2SubclusterType.QCOW2_SUBCLUSTER_COMPRESSED: - result.append(self._read_compressed(run_offset, read_offset, run_length)) - elif sc_type == QCow2SubclusterType.QCOW2_SUBCLUSTER_NORMAL: - self.data_file.seek(run_offset) - result.append(self.data_file.read(run_length)) +class QCow2Snapshot: + """Wrapper class for snapshot table entries.""" - return b"".join(result) + def __init__(self, qcow2: QCow2, offset: int): + self.qcow2 = qcow2 + self.offset = offset - def _read_compressed(self, cluster_descriptor: int, offset: int, length: int) -> bytes: - offset_in_cluster = offset_into_cluster(self, offset) - 
coffset = cluster_descriptor & self.cluster_offset_mask - nb_csectors = ((cluster_descriptor >> self.csize_shift) & self.csize_mask) + 1 - # Original source uses the mask ~(~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL)) - # However bit inversion is weird in Python, and this evaluates to 511, so we use that value instead. - csize = nb_csectors * c_qcow2.QCOW2_COMPRESSED_SECTOR_SIZE - (coffset & 511) + self.qcow2.fh.seek(offset) + self.header = c_qcow2.QCowSnapshotHeader(self.qcow2.fh) - self.fh.seek(coffset) - buf = self.fh.read(csize) - decompressed = self._decompress(buf) + # Older versions may not have all the extra data fields + # Instead of reading them manually, just pad the extra data to fit our struct + extra_data = self.qcow2.fh.read(self.header.extra_data_size) + self.extra = c_qcow2.QCowSnapshotExtraData(extra_data.ljust(len(c_qcow2.QCowSnapshotExtraData), b"\x00")) - return decompressed[offset_in_cluster : offset_in_cluster + length] + unknown_extra_size = self.header.extra_data_size - len(c_qcow2.QCowSnapshotExtraData) + self.unknown_extra = self.qcow2.fh.read(unknown_extra_size) if unknown_extra_size > 0 else None - def _decompress(self, buf: bytes) -> bytes: - if self.compression_type == c_qcow2.QCOW2_COMPRESSION_TYPE_ZLIB: - dctx = zlib.decompressobj(-12) - return dctx.decompress(buf, self.cluster_size) + self.id = self.qcow2.fh.read(self.header.id_str_size).decode() + self.name = self.qcow2.fh.read(self.header.name_size).decode() - if self.compression_type == c_qcow2.QCOW2_COMPRESSION_TYPE_ZSTD: - result = [] + self.entry_size = self.qcow2.fh.tell() - offset - dctx = zstd.ZstdDecompressor() - reader = dctx.stream_reader(BytesIO(buf)) - while reader.tell() < self.cluster_size: - chunk = reader.read(self.cluster_size - reader.tell()) - if not chunk: - break - result.append(chunk) - return b"".join(result) + def open(self) -> QCow2Stream: + """Open the snapshot for reading.""" + return QCow2Stream(self.qcow2, self.l1_table) + + @cached_property + def 
l1_table(self) -> list[int]: + # L1 table is usually relatively small, it can be at most 32MB on PB or EB size disks + self.qcow2.fh.seek(self.header.l1_table_offset) + return c_qcow2.uint64[self.header.l1_size](self.qcow2.fh) - raise Error(f"Invalid compression type: {self.compression_type}") + +class QCow2Stream(AlignedStream): + """Aligned stream for reading QCow2 files.""" + + def __init__(self, qcow2: QCow2, l1_table: list[int] | None = None): + super().__init__(qcow2.header.size, align=qcow2.cluster_size) + self.qcow2 = qcow2 + self.l1_table = l1_table or qcow2.l1_table def _yield_runs(self, offset: int, length: int) -> Iterator[tuple[QCow2SubclusterType, int, int, int]]: # reference: qcow2_get_host_offset @@ -260,19 +307,19 @@ def _yield_runs(self, offset: int, length: int) -> Iterator[tuple[QCow2Subcluste host_offset = 0 read_count = 0 - l1_index = offset_to_l1_index(self, offset) - l2_index = offset_to_l2_index(self, offset) - sc_index = offset_to_sc_index(self, offset) + l1_index = offset_to_l1_index(self.qcow2, offset) + l2_index = offset_to_l2_index(self.qcow2, offset) + sc_index = offset_to_sc_index(self.qcow2, offset) - offset_in_cluster = offset_into_cluster(self, offset) + offset_in_cluster = offset_into_cluster(self.qcow2, offset) bytes_needed = length + offset_in_cluster # at the time being we just use the entire l2 table and not cached slices # this is actually the bytes available/remaining in this l2 table - bytes_available = (self.l2_size - l2_index) << self.cluster_bits + bytes_available = (self.qcow2.l2_size - l2_index) << self.qcow2.cluster_bits bytes_needed = min(bytes_needed, bytes_available) - if l1_index > self.header.l1_size: + if l1_index > self.qcow2.header.l1_size: # bytes_needed is already the smaller value here read_count = bytes_needed - offset_in_cluster @@ -293,11 +340,11 @@ def _yield_runs(self, offset: int, length: int) -> Iterator[tuple[QCow2Subcluste offset += read_count continue - l2_table = self.l2_table(l2_offset) + 
l2_table = self.qcow2.l2_table(l2_offset) l2_entry = l2_table.entry(l2_index) l2_bitmap = l2_table.bitmap(l2_index) - sc_type = get_subcluster_type(self, l2_entry, l2_bitmap, sc_index) + sc_type = get_subcluster_type(self.qcow2, l2_entry, l2_bitmap, sc_index) if sc_type == QCow2SubclusterType.QCOW2_SUBCLUSTER_COMPRESSED: host_offset = l2_entry & c_qcow2.L2E_COMPRESSED_OFFSET_SIZE_MASK @@ -305,10 +352,10 @@ def _yield_runs(self, offset: int, length: int) -> Iterator[tuple[QCow2Subcluste host_cluster_offset = l2_entry & c_qcow2.L2E_OFFSET_MASK host_offset = host_cluster_offset + offset_in_cluster - nb_clusters = size_to_clusters(self, bytes_needed) - sc_count = count_contiguous_subclusters(self, nb_clusters, sc_index, l2_table, l2_index) + nb_clusters = size_to_clusters(self.qcow2, bytes_needed) + sc_count = count_contiguous_subclusters(self.qcow2, nb_clusters, sc_index, l2_table, l2_index) # this is the amount of contiguous bytes available of the same subcluster type - bytes_available = (sc_count + sc_index) << self.subcluster_bits + bytes_available = (sc_count + sc_index) << self.qcow2.subcluster_bits # account for the offset in the cluster read_count = min(bytes_available, bytes_needed) - offset_in_cluster @@ -317,6 +364,58 @@ def _yield_runs(self, offset: int, length: int) -> Iterator[tuple[QCow2Subcluste length -= read_count offset += read_count + def _read_compressed(self, cluster_descriptor: int, offset: int, length: int) -> bytes: + offset_in_cluster = offset_into_cluster(self.qcow2, offset) + coffset = cluster_descriptor & self.qcow2.cluster_offset_mask + nb_csectors = ((cluster_descriptor >> self.qcow2.csize_shift) & self.qcow2.csize_mask) + 1 + # Original source uses the mask ~(~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL)) + # However bit inversion is weird in Python, and this evaluates to 511, so we use that value instead. 
+ csize = nb_csectors * c_qcow2.QCOW2_COMPRESSED_SECTOR_SIZE - (coffset & 511) + + self.qcow2.fh.seek(coffset) + buf = self.qcow2.fh.read(csize) + decompressed = self._decompress(buf) + + return decompressed[offset_in_cluster : offset_in_cluster + length] + + def _decompress(self, buf: bytes) -> bytes: + if self.qcow2.compression_type == c_qcow2.QCOW2_COMPRESSION_TYPE_ZLIB: + dctx = zlib.decompressobj(-12) + return dctx.decompress(buf, self.qcow2.cluster_size) + + if self.qcow2.compression_type == c_qcow2.QCOW2_COMPRESSION_TYPE_ZSTD: + result = [] + + dctx = zstd.ZstdDecompressor() + reader = dctx.stream_reader(BytesIO(buf)) + while reader.tell() < self.qcow2.cluster_size: + chunk = reader.read(self.qcow2.cluster_size - reader.tell()) + if not chunk: + break + result.append(chunk) + return b"".join(result) + + raise Error(f"Invalid compression type: {self.qcow2.compression_type}") + + def _read(self, offset: int, length: int) -> bytes: + result = [] + + for sc_type, read_offset, run_offset, run_length in self._yield_runs(offset, length): + unalloc_zeroed = sc_type in UNALLOCATED_SUBCLUSTER_TYPES and not self.qcow2.has_backing_file + + if sc_type in ZERO_SUBCLUSTER_TYPES or unalloc_zeroed: + result.append(b"\x00" * run_length) + elif sc_type in UNALLOCATED_SUBCLUSTER_TYPES and self.qcow2.has_backing_file: + self.qcow2.backing_file.seek(read_offset) + result.append(self.qcow2.backing_file.read(run_length)) + elif sc_type == QCow2SubclusterType.QCOW2_SUBCLUSTER_COMPRESSED: + result.append(self._read_compressed(run_offset, read_offset, run_length)) + elif sc_type == QCow2SubclusterType.QCOW2_SUBCLUSTER_NORMAL: + self.qcow2.data_file.seek(run_offset) + result.append(self.qcow2.data_file.read(run_length)) + + return b"".join(result) + class L2Table: """Convenience class for accessing the L2 table.""" @@ -338,42 +437,6 @@ def bitmap(self, idx: int) -> int: return 0 -class QCow2Snapshot: - """Wrapper class for snapshot table entries.""" - - def __init__(self, qcow2: QCow2, 
offset: int): - self.qcow2 = qcow2 - self.offset = offset - - self.qcow2.fh.seek(offset) - self.header = c_qcow2.QCowSnapshotHeader(self.qcow2.fh) - - # Older versions may not have all the extra data fields - # Instead of reading them manually, just pad the extra data to fit our struct - extra_data = self.qcow2.fh.read(self.header.extra_data_size) - self.extra = c_qcow2.QCowSnapshotExtraData(extra_data.ljust(len(c_qcow2.QCowSnapshotExtraData), b"\x00")) - - unknown_extra_size = self.header.extra_data_size - len(c_qcow2.QCowSnapshotExtraData) - self.unknown_extra = self.qcow2.fh.read(unknown_extra_size) if unknown_extra_size > 0 else None - - self.id_str = self.qcow2.fh.read(self.header.id_str_size).decode() - self.name = self.qcow2.fh.read(self.header.name_size).decode() - - self.entry_size = self.qcow2.fh.tell() - offset - - def open(self) -> QCow2: - disk = copy.copy(self.qcow2) - disk.l1_table = self.l1_table - disk.seek(0) - return disk - - @cached_property - def l1_table(self) -> list[int]: - # L1 table is usually relatively small, it can be at most 32MB on PB or EB size disks - self.qcow2.fh.seek(self.header.l1_table_offset) - return c_qcow2.uint64[self.header.l1_size](self.qcow2.fh) - - def offset_into_cluster(qcow2: QCow2, offset: int) -> int: return offset & (qcow2.cluster_size - 1) @@ -471,16 +534,16 @@ def get_subcluster_range_type( sc_mask = (1 << sc_from) - 1 if sc_type == QCow2SubclusterType.QCOW2_SUBCLUSTER_NORMAL: val = l2_bitmap | sc_mask # QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from) - return sc_type, ctz(val, 32) - sc_from + return sc_type, ctz(val) - sc_from if sc_type in ZERO_SUBCLUSTER_TYPES: val = (l2_bitmap | sc_mask) >> 32 # QCOW_OFLAG_SUB_ZERO_RANGE(0, sc_from) - return sc_type, ctz(val, 32) - sc_from + return sc_type, ctz(val) - sc_from if sc_type in UNALLOCATED_SUBCLUSTER_TYPES: # We need to mask it with a 64bit mask because Python flips the sign bit inv_mask = ~sc_mask & ((1 << 64) - 1) # ~QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from) val = 
((l2_bitmap >> 32) | l2_bitmap) & inv_mask - return sc_type, ctz(val, 32) - sc_from + return sc_type, ctz(val) - sc_from raise Error(f"Invalid subcluster type: {sc_type}") diff --git a/pyproject.toml b/pyproject.toml index 9aa0854..bc127a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,7 @@ ignore = ["E203", "B904", "UP024", "ANN002", "ANN003", "ANN204", "ANN401", "SIM1 [tool.ruff.lint.per-file-ignores] "tests/_docs/**" = ["INP001"] +"**/*.pyi" = ["PYI042", "PYI047", "PYI054"] [tool.ruff.lint.isort] known-first-party = ["dissect.hypervisor"] diff --git a/tests/_data/disk/qcow2/backing-chain-1.qcow2.gz b/tests/_data/disk/qcow2/backing-chain-1.qcow2.gz new file mode 100644 index 0000000..dfd1cbc Binary files /dev/null and b/tests/_data/disk/qcow2/backing-chain-1.qcow2.gz differ diff --git a/tests/_data/disk/qcow2/backing-chain-2.qcow2.gz b/tests/_data/disk/qcow2/backing-chain-2.qcow2.gz new file mode 100644 index 0000000..7ac1acf Binary files /dev/null and b/tests/_data/disk/qcow2/backing-chain-2.qcow2.gz differ diff --git a/tests/_data/disk/qcow2/backing-chain-3.qcow2.gz b/tests/_data/disk/qcow2/backing-chain-3.qcow2.gz new file mode 100644 index 0000000..1ba55da Binary files /dev/null and b/tests/_data/disk/qcow2/backing-chain-3.qcow2.gz differ diff --git a/tests/_data/disk/qcow2/basic.qcow2.gz b/tests/_data/disk/qcow2/basic.qcow2.gz new file mode 100644 index 0000000..f77ee8a Binary files /dev/null and b/tests/_data/disk/qcow2/basic.qcow2.gz differ diff --git a/tests/_data/disk/qcow2/data-file.bin.gz b/tests/_data/disk/qcow2/data-file.bin.gz new file mode 100644 index 0000000..617e387 Binary files /dev/null and b/tests/_data/disk/qcow2/data-file.bin.gz differ diff --git a/tests/_data/disk/qcow2/data-file.qcow2.gz b/tests/_data/disk/qcow2/data-file.qcow2.gz new file mode 100644 index 0000000..a3fc3fa Binary files /dev/null and b/tests/_data/disk/qcow2/data-file.qcow2.gz differ diff --git a/tests/_data/disk/qcow2/snapshot.qcow2.gz 
b/tests/_data/disk/qcow2/snapshot.qcow2.gz new file mode 100644 index 0000000..f62e130 Binary files /dev/null and b/tests/_data/disk/qcow2/snapshot.qcow2.gz differ diff --git a/tests/conftest.py b/tests/conftest.py index 7ae0635..6a7895e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -70,20 +70,44 @@ def sesparse_vmdk() -> Iterator[BinaryIO]: @pytest.fixture -def plain_hdd() -> Iterator[str]: +def plain_hdd() -> Path: return absolute_path("_data/disk/hdd/plain.hdd") @pytest.fixture -def expanding_hdd() -> Iterator[str]: +def expanding_hdd() -> Path: return absolute_path("_data/disk/hdd/expanding.hdd") @pytest.fixture -def split_hdd() -> Iterator[str]: +def split_hdd() -> Path: return absolute_path("_data/disk/hdd/split.hdd") +@pytest.fixture +def basic_qcow2() -> Iterator[BinaryIO]: + yield from open_file_gz("_data/disk/qcow2/basic.qcow2.gz") + + +@pytest.fixture +def data_file_qcow2() -> Path: + return absolute_path("_data/disk/qcow2/data-file.qcow2.gz") + + +@pytest.fixture +def backing_chain_qcow2() -> tuple[Path, Path, Path]: + return ( + absolute_path("_data/disk/qcow2/backing-chain-1.qcow2.gz"), + absolute_path("_data/disk/qcow2/backing-chain-2.qcow2.gz"), + absolute_path("_data/disk/qcow2/backing-chain-3.qcow2.gz"), + ) + + +@pytest.fixture +def snapshot_qcow2() -> Iterator[BinaryIO]: + yield from open_file_gz("_data/disk/qcow2/snapshot.qcow2.gz") + + @pytest.fixture def envelope() -> Iterator[BinaryIO]: yield from open_file("_data/util/envelope/local.tgz.ve") diff --git a/tests/disk/test_hdd.py b/tests/disk/test_hdd.py index 98861f7..cdfc5ad 100644 --- a/tests/disk/test_hdd.py +++ b/tests/disk/test_hdd.py @@ -17,8 +17,8 @@ def mock_open_gz(self: Path, *args, **kwargs) -> BinaryIO: return gzip.open(self.with_suffix(self.suffix + ".gz")) -def test_plain_hdd(plain_hdd: str) -> None: - hdd = HDD(Path(plain_hdd)) +def test_plain_hdd(plain_hdd: Path) -> None: + hdd = HDD(plain_hdd) storages = hdd.descriptor.storage_data.storages assert len(storages) == 
1 @@ -31,11 +31,11 @@ def test_plain_hdd(plain_hdd: str) -> None: stream = hdd.open() for i in range(100): - assert stream.read(1024 * 1024).strip(bytes([i])) == b"" + assert stream.read(1024 * 1024).strip(bytes([i])) == b"", f"Mismatch at offset {i * 1024 * 1024:#x}" -def test_expanding_hdd(expanding_hdd: str) -> None: - hdd = HDD(Path(expanding_hdd)) +def test_expanding_hdd(expanding_hdd: Path) -> None: + hdd = HDD(expanding_hdd) storages = hdd.descriptor.storage_data.storages assert len(storages) == 1 @@ -48,11 +48,11 @@ def test_expanding_hdd(expanding_hdd: str) -> None: stream = hdd.open() for i in range(100): - assert stream.read(1024 * 1024).strip(bytes([i])) == b"" + assert stream.read(1024 * 1024).strip(bytes([i])) == b"", f"Mismatch at offset {i * 1024 * 1024:#x}" -def test_split_hdd(split_hdd: str) -> None: - hdd = HDD(Path(split_hdd)) +def test_split_hdd(split_hdd: Path) -> None: + hdd = HDD(split_hdd) storages = hdd.descriptor.storage_data.storages assert len(storages) == 6 @@ -86,8 +86,8 @@ def test_split_hdd(split_hdd: str) -> None: assert buf == bytes([i + 1] * 512) -def test_file_use_parent(plain_hdd: str) -> None: - hdd = HDD(Path(plain_hdd).joinpath("plain.hdd")) +def test_file_use_parent(plain_hdd: Path) -> None: + hdd = HDD(plain_hdd.joinpath("plain.hdd")) storages = hdd.descriptor.storage_data.storages assert len(storages) == 1 diff --git a/tests/disk/test_qcow2.py b/tests/disk/test_qcow2.py new file mode 100644 index 0000000..9821600 --- /dev/null +++ b/tests/disk/test_qcow2.py @@ -0,0 +1,147 @@ +from __future__ import annotations + +import gzip +import hashlib +from pathlib import Path +from typing import BinaryIO +from unittest.mock import patch + +import pytest + +from dissect.hypervisor.disk.qcow2 import QCow2, QCow2Stream +from dissect.hypervisor.exceptions import Error + + +def mock_open_gz(self: Path, *args, **kwargs) -> BinaryIO: + return gzip.open(self if self.suffix.lower() == ".gz" else self.with_suffix(self.suffix + ".gz")) + + 
+def test_basic(basic_qcow2: BinaryIO) -> None: + qcow2 = QCow2(basic_qcow2) + + assert qcow2.backing_file is None + assert qcow2.data_file is qcow2.fh + assert qcow2.size == 536870912 + + with qcow2.open() as stream: + for i in range(255): + assert stream.read(1024 * 1024).strip(bytes([i])) == b"", f"Mismatch at offset {i * 1024 * 1024:#x}" + + +def test_data_file(data_file_qcow2: Path) -> None: + # Test with file handle + with gzip.open(data_file_qcow2, "rb") as fh: + with pytest.raises(Error, match="data-file required but not provided \\(image_data_file = 'data-file.bin'\\)"): + QCow2(fh) + + with gzip.open(data_file_qcow2.with_name("data-file.bin.gz"), "rb") as fh_bin: + qcow2 = QCow2(fh, data_file=fh_bin) + + assert qcow2.backing_file is None + assert qcow2.data_file is fh_bin + + with qcow2.open() as stream: + for i in range(255): + assert stream.read(1024 * 1024).strip(bytes([i])) == b"", f"Mismatch at offset {i * 1024 * 1024:#x}" + + # Test with allow_no_data_file + qcow2 = QCow2(fh, allow_no_data_file=True) + assert qcow2.data_file is None + with pytest.raises(Error, match="data-file required but not provided \\(image_data_file = 'data-file.bin'\\)"): + qcow2.open() + + # Test with Path + with patch.object(Path, "open", mock_open_gz), patch.object(Path, "exists", return_value=True): + qcow2 = QCow2(data_file_qcow2) + + assert qcow2.backing_file is None + assert qcow2.data_file is not qcow2.fh + + with qcow2.open() as stream: + for i in range(255): + assert stream.read(1024 * 1024).strip(bytes([i])) == b"", f"Mismatch at offset {i * 1024 * 1024:#x}" + + +def test_backing_file(backing_chain_qcow2: tuple[Path, Path, Path]) -> None: + file1, file2, file3 = backing_chain_qcow2 + + # Test with file handle + with gzip.open(file1, "rb") as fh1, gzip.open(file2, "rb") as fh2, gzip.open(file3, "rb") as fh3: + with pytest.raises( + Error, match="backing-file required but not provided \\(auto_backing_file = 'backing-chain-2.qcow2'\\)" + ): + QCow2(fh1) + + with 
pytest.raises( + Error, match="backing-file required but not provided \\(auto_backing_file = 'backing-chain-3.qcow2'\\)" + ): + QCow2(fh1, backing_file=fh2) + + backing2 = QCow2(fh2, backing_file=fh3) + assert isinstance(backing2.backing_file, QCow2Stream) + + qcow2 = QCow2(fh1, backing_file=backing2.open()) + assert isinstance(qcow2.backing_file, QCow2Stream) + + # Test with allow_no_backing_file + qcow2 = QCow2(fh1, allow_no_backing_file=True) + assert qcow2.backing_file is None + with pytest.raises( + Error, match="backing-file required but not provided \\(auto_backing_file = 'backing-chain-2.qcow2'\\)" + ): + qcow2.open() + + # Test with Path + with patch.object(Path, "open", mock_open_gz), patch.object(Path, "exists", return_value=True): + qcow2 = QCow2(file1) + + assert isinstance(qcow2.backing_file, QCow2Stream) + assert qcow2.backing_file.qcow2.fh.name == str(file2) + + assert isinstance(qcow2.backing_file.qcow2.backing_file, QCow2Stream) + assert qcow2.backing_file.qcow2.backing_file.qcow2.fh.name == str(file3) + + # Test reading through the backing chain + with QCow2(file3).open() as stream: + assert stream.read(1024 * 1024).strip(b"\x00") == b"Something here" + assert stream.read(1024 * 1024).strip(b"\x00") == b"Something here too" + assert stream.read(1024 * 1024).strip(b"\x00") == b"Something here three" + assert stream.read(1024 * 1024).strip(b"\x00") == b"" + assert stream.read(1024 * 1024).strip(b"\x00") == b"" + + with QCow2(file2).open() as stream: + assert stream.read(1024 * 1024).strip(b"\x00") == b"Nothing here" + assert stream.read(1024 * 1024).strip(b"\x00") == b"Something here too" + assert stream.read(1024 * 1024).strip(b"\x00") == b"Something here three" + assert stream.read(1024 * 1024).strip(b"\x00") == b"Something here four" + assert stream.read(1024 * 1024).strip(b"\x00") == b"" + + with QCow2(file1).open() as stream: + assert stream.read(1024 * 1024).strip(b"\x00") == b"Nothing here" + assert stream.read(1024 * 1024).strip(b"\x00") == 
b"Nothing here two" + assert stream.read(1024 * 1024).strip(b"\x00") == b"Something here three" + assert stream.read(1024 * 1024).strip(b"\x00") == b"Something here four" + assert stream.read(1024 * 1024).strip(b"\x00") == b"Something here five" + + +def test_snapshot(snapshot_qcow2: BinaryIO) -> None: + qcow2 = QCow2(snapshot_qcow2) + + assert qcow2.backing_file is None + assert qcow2.data_file is qcow2.fh + assert qcow2.size == 536870912 + + with qcow2.open() as stream: + assert stream.read(4 * 1024 * 1024).strip(b"\x00") == b"" + + assert len(qcow2.snapshots) == 2 + assert qcow2.snapshots[0].id == "1" + assert qcow2.snapshots[0].name == "you can't see me" + assert qcow2.snapshots[1].id == "2" + assert qcow2.snapshots[1].name == "confused" + + with qcow2.snapshots[1].open() as stream: + assert hashlib.sha1(stream.read(813857)).hexdigest() == "c97f53aece77ea49099d15e5f53af3af5f62fb54" + + with qcow2.snapshots[0].open() as stream: + assert hashlib.sha1(stream.read(2261577)).hexdigest() == "2c7a6b5f6b5c4739f6d24c11e86c764bdf86096f"