diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..510120e --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +tests/_data/** filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a5a2673 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +coverage.xml +.coverage +dist/ +.eggs/ +*.egg-info/ +*.pyc +__pycache__/ +.pytest_cache/ +tests/_docs/api +tests/_docs/build +.tox/ \ No newline at end of file diff --git a/COPYRIGHT b/COPYRIGHT new file mode 100644 index 0000000..9dd882e --- /dev/null +++ b/COPYRIGHT @@ -0,0 +1,5 @@ +`dissect.erofs` is developed and released as open source by the Joint Sigint Cyber Unit () in collaboration with Fox-IT () part of NCC Group Plc () + +Made available at https://github.com/fox-it/dissect.erofs + +License terms: Apache 2.0 () \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c1a81fe --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f58ff7d --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +# dissect.erofs + +A Dissect module implementing a parser for Enhanced Read-Only File System (EROFS). For more +information, please see [the documentation](https://docs.dissect.tools/en/latest/projects/dissect.erofs/index.html). + +## Requirements + +This project is part of the Dissect framework and requires Python. + +Information on the supported Python versions can be found in the Getting Started section of [the documentation](https://docs.dissect.tools/en/latest/index.html#getting-started). + +## Installation + +`dissect.erofs` is available on [PyPI](https://pypi.org/project/dissect.erofs/). + +```bash +pip install dissect.erofs +``` + +This module is also automatically installed if you install the `dissect` package. + +## Build and test instructions + +This project uses `tox` to build source and wheel distributions. Run the following command from the root folder to build +these: + +```bash +tox -e build +``` + +The build artifacts can be found in the `dist/` directory. + +`tox` is also used to run linting and unit tests in a self-contained environment. 
# On-disk structure definitions for EROFS, in the C-like syntax understood by dissect.cstruct.
# Field names and layout follow the Linux kernel header referenced at the top of the definition.
erofs_def = """
// https://github.com/torvalds/linux/blob/master/fs/erofs/erofs_fs.h

#define EROFS_SUPER_OFFSET      1024
#define EROFS_SUPER_MAGIC_V1    0xE0F5E1E2

/* 32 bytes on-disk inode */
#define EROFS_INODE_LAYOUT_V1   0
/* 64 bytes on-disk inode */
#define EROFS_INODE_LAYOUT_V2   1

struct erofs_super_block {
    uint32  magic;                  /* file system magic number */
    uint32  checksum;               /* crc32c to avoid unexpected on-disk overlap */
    uint32  feature_compat;
    uint8   blkszbits;              /* filesystem block size in bit shift */
    uint8   sb_extslots;            /* superblock size = 128 + sb_extslots * 16 */
    uint16  rootnid_2b;             /* nid of root directory */
    uint64  inos;                   /* total valid ino # (== f_files - f_favail) */
    uint64  epoch;                  /* base seconds used for compact inodes */
    uint32  fixed_nsec;             /* fixed nanoseconds for compact inodes */
    uint32  blocks_lo;              /* blocks count LSB */
    uint32  meta_blkaddr;           /* start block address of metadata area */
    uint32  xattr_blkaddr;          /* start block address of shared xattr area */
    char    uuid[16];               /* 128-bit uuid for volume */
    char    volume_name[16];        /* volume name */
    uint32  feature_incompat;
    uint16  available_compr_algs;
    uint16  extra_devices;          /* # of devices besides the primary device */
    uint16  devt_slotoff;           /* startoff = devt_slotoff * devt_slotsize */
    uint8   dirblkbits;             /* directory block size in bit shift */
    uint8   xattr_prefix_count;     /* # of long xattr name prefixes */
    uint32  xattr_prefix_start;     /* start of long xattr prefixes */
    uint64  packed_nid;             /* nid of the special packed inode */
    uint8   xattr_filter_reserved;  /* reserved for xattr name filter */
    uint8   reserved[3];
    uint32  build_time;             /* seconds added to epoch for mkfs time */
    uint64  rootnid_8b;             /* (48BIT on) nid of root directory */
    uint8   reserved2[8];
};

struct erofs_i_format {
    uint16  inode_version:1;
    uint16  inode_data_layout:3;
};

struct erofs_inode {
    erofs_i_format  i_advise;
    uint16  i_xattr_icount;
    uint16  i_mode;
    uint16  i_nlink;
    uint32  i_size;
    uint32  reserved;
    uint32  i_u;
    uint32  i_ino;
    uint16  i_uid;
    uint16  i_gid;
    uint32  i_checksum;
};

struct erofs_inode_extended {
    erofs_i_format  i_advise;
    uint16  i_xattr_icount;
    uint16  i_mode;
    uint16  i_reserved;
    uint64  i_size;
    uint32  i_u;
    uint32  i_ino;
    uint32  i_uid;
    uint32  i_gid;
    uint64  i_mtime;
    uint32  i_mtime_nsec;
    uint32  i_nlink;
    char    i_reserved2[16];
};

struct erofs_dir_entry {
    uint64  inode;
    int16   name_offset;
    uint8   file_type;
    uint8   reserved;
};

enum EROFS_DATA_MAPPING {
    EROFS_INODE_FLAT_PLAIN,
    EROFS_INODE_COMPRESSED_FULL,
    EROFS_INODE_FLAT_INLINE,
    EROFS_INODE_COMPRESSED_COMPACT,
    EROFS_INODE_CHUNK_BASED,
    EROFS_INODE_DATALAYOUT_MAX
};

/* available compression algorithm types (for h_algorithmtype) */
enum EROFS_COMPRESSION_ALGORITHM {
    Z_EROFS_COMPRESSION_LZ4,
    Z_EROFS_COMPRESSION_LZMA,
    Z_EROFS_COMPRESSION_DEFLATE,
    Z_EROFS_COMPRESSION_ZSTD,
    Z_EROFS_COMPRESSION_MAX
};

enum Z_EROFS_LCLUSTER_TYPE {
    Z_EROFS_LCLUSTER_TYPE_PLAIN,
    Z_EROFS_LCLUSTER_TYPE_HEAD1,
    Z_EROFS_LCLUSTER_TYPE_NONHEAD,
    Z_EROFS_LCLUSTER_TYPE_HEAD2,
    Z_EROFS_LCLUSTER_TYPE_MAX
};

#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS    2
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT     0

/* Name indexes */
#define EROFS_XATTR_INDEX_USER              1
#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS  2
#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
#define EROFS_XATTR_INDEX_TRUSTED           4
#define EROFS_XATTR_INDEX_LUSTRE            5
#define EROFS_XATTR_INDEX_SECURITY          6

#define XATTR_HURD_PREFIX           "gnu."
#define XATTR_SECURITY_PREFIX       "security."
#define XATTR_SYSTEM_PREFIX         "system."
#define XATTR_TRUSTED_PREFIX        "trusted."
#define XATTR_USER_PREFIX           "user."
#define XATTR_ENCRYPTION_PREFIX     "encryption."
#define XATTR_POSIX_ACL_ACCESS      "posix_acl_access"
#define XATTR_POSIX_ACL_DEFAULT     "posix_acl_default"

struct erofs_xattr_ibody_header {
    uint32  h_name_filter;
    uint8   h_shared_count;
    uint8   h_reserved2[7];
    uint32  h_shared_xattrs[h_shared_count];
};

/* xattr entry (for both inline & shared xattrs) */
struct erofs_xattr_entry {
    uint8   e_name_len;
    uint8   e_name_index;
    uint16  e_value_size;
    char    e_name[e_name_len];
    char    e_value[e_value_size];
};

struct z_erofs_map_header {
    uint32  h_reserved1;
    uint16  h_advise;
    /*
     * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
     * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
     */
    uint8   h_algorithmtype_head_1:4;
    uint8   h_algorithmtype_head_2:4;
    /*
     * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
     * bit 3-4 : (physical - logical) cluster bits of head 1:
     *       For example, if logical clustersize = 4096, 1 for 8192.
     * bit 5-7 : (physical - logical) cluster bits of head 2.
     */
    uint8   h_clusterbits_lclusterbits:3;
    uint8   h_pclusterbits_head1:2;
    uint8   h_pclusterbits_head2:3;
};

#define Z_EROFS_ADVISE_COMPACTED_2B_BIT     0x0000
#define Z_EROFS_ADVISE_COMPACTED_2B         0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1       0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2       0x0004
#define Z_EROFS_ADVISE_INLINE_PCLUSTER      0x0008
#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER  0x0010
#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER    0x0020

/*
 * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
 * compressed block count of a compressed extent (in logical clusters, aka.
 * block count of a pcluster).
*/
#define Z_EROFS_LI_D0_CBLKCNT   (1 << 11)

struct z_erofs_vle_decompressed_index {
    uint16  di_advise;
    /* where to decompress in the head cluster */
    uint16  di_clusterofs;

    union {
        /* for the head cluster */
        uint32  blkaddr;
        /*
         * for the rest clusters
         * eg. for 4k page-sized cluster, maximum 4K*64k = 256M)
         * [0] - pointing to the head cluster
         * [1] - pointing to the tail cluster
         */
        uint16  delta[2];
    } di_u;
};

#define Z_EROFS_VLE_LEGACY_HEADER_PADDING   8
#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001
"""

# Parsed definitions; structures, enums and #define constants are exposed as attributes.
c_erofs = cstruct().load(erofs_def)

# Maps an xattr entry's ``e_name_index`` to the string that precedes the stored ``e_name``.
# The POSIX ACL indexes denote complete attribute names in the ``system.`` namespace
# (``system.posix_acl_access`` / ``system.posix_acl_default``), so the namespace prefix is
# prepended here rather than exposing the bare suffix.
XATTR_NAME_MAP = {
    c_erofs.EROFS_XATTR_INDEX_USER: c_erofs.XATTR_USER_PREFIX,
    c_erofs.EROFS_XATTR_INDEX_POSIX_ACL_ACCESS: c_erofs.XATTR_SYSTEM_PREFIX + c_erofs.XATTR_POSIX_ACL_ACCESS,
    c_erofs.EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT: c_erofs.XATTR_SYSTEM_PREFIX + c_erofs.XATTR_POSIX_ACL_DEFAULT,
    c_erofs.EROFS_XATTR_INDEX_TRUSTED: c_erofs.XATTR_TRUSTED_PREFIX,
    c_erofs.EROFS_XATTR_INDEX_SECURITY: c_erofs.XATTR_SECURITY_PREFIX,
}

# Byte sizes of the fixed-size on-disk structures, computed once at import time.
COMPACT_INODE_SIZE = len(c_erofs.erofs_inode)
EXTENDED_INODE_SIZE = len(c_erofs.erofs_inode_extended)
DIR_ENTRY_SIZE = len(c_erofs.erofs_dir_entry)
COMPRESSION_MAP_SIZE = len(c_erofs.z_erofs_map_header)
+ XATTR_TRUSTED_PREFIX: Literal["trusted."] = ... + XATTR_USER_PREFIX: Literal["user."] = ... + XATTR_ENCRYPTION_PREFIX: Literal["encryption."] = ... + XATTR_POSIX_ACL_ACCESS: Literal["posix_acl_access"] = ... + XATTR_POSIX_ACL_DEFAULT: Literal["posix_acl_default"] = ... + Z_EROFS_ADVISE_COMPACTED_2B_BIT: Literal[0] = ... + Z_EROFS_ADVISE_COMPACTED_2B: Literal[1] = ... + Z_EROFS_ADVISE_BIG_PCLUSTER_1: Literal[2] = ... + Z_EROFS_ADVISE_BIG_PCLUSTER_2: Literal[4] = ... + Z_EROFS_ADVISE_INLINE_PCLUSTER: Literal[8] = ... + Z_EROFS_ADVISE_INTERLACED_PCLUSTER: Literal[16] = ... + Z_EROFS_ADVISE_FRAGMENT_PCLUSTER: Literal[32] = ... + Z_EROFS_LI_D0_CBLKCNT: Literal[2048] = ... + Z_EROFS_VLE_LEGACY_HEADER_PADDING: Literal[8] = ... + EROFS_FEATURE_INCOMPAT_LZ4_0PADDING: Literal[1] = ... + class erofs_super_block(__cs__.Structure): + magic: _c_erofs.uint32 + checksum: _c_erofs.uint32 + feature_compat: _c_erofs.uint32 + blkszbits: _c_erofs.uint8 + sb_extslots: _c_erofs.uint8 + rootnid_2b: _c_erofs.uint16 + inos: _c_erofs.uint64 + epoch: _c_erofs.uint64 + fixed_nsec: _c_erofs.uint32 + blocks_lo: _c_erofs.uint32 + meta_blkaddr: _c_erofs.uint32 + xattr_blkaddr: _c_erofs.uint32 + uuid: __cs__.CharArray + volume_name: __cs__.CharArray + feature_incompat: _c_erofs.uint32 + available_compr_algs: _c_erofs.uint16 + extra_devices: _c_erofs.uint16 + devt_slotoff: _c_erofs.uint16 + dirblkbits: _c_erofs.uint8 + xattr_prefix_count: _c_erofs.uint8 + xattr_prefix_start: _c_erofs.uint32 + packed_nid: _c_erofs.uint64 + xattr_filter_reserved: _c_erofs.uint8 + reserved: __cs__.Array[_c_erofs.uint8] + build_time: _c_erofs.uint32 + rootnid_8b: _c_erofs.uint64 + reserved2: __cs__.Array[_c_erofs.uint8] + @overload + def __init__( + self, + magic: _c_erofs.uint32 | None = ..., + checksum: _c_erofs.uint32 | None = ..., + feature_compat: _c_erofs.uint32 | None = ..., + blkszbits: _c_erofs.uint8 | None = ..., + sb_extslots: _c_erofs.uint8 | None = ..., + rootnid_2b: _c_erofs.uint16 | None = ..., + inos: 
_c_erofs.uint64 | None = ..., + epoch: _c_erofs.uint64 | None = ..., + fixed_nsec: _c_erofs.uint32 | None = ..., + blocks_lo: _c_erofs.uint32 | None = ..., + meta_blkaddr: _c_erofs.uint32 | None = ..., + xattr_blkaddr: _c_erofs.uint32 | None = ..., + uuid: __cs__.CharArray | None = ..., + volume_name: __cs__.CharArray | None = ..., + feature_incompat: _c_erofs.uint32 | None = ..., + available_compr_algs: _c_erofs.uint16 | None = ..., + extra_devices: _c_erofs.uint16 | None = ..., + devt_slotoff: _c_erofs.uint16 | None = ..., + dirblkbits: _c_erofs.uint8 | None = ..., + xattr_prefix_count: _c_erofs.uint8 | None = ..., + xattr_prefix_start: _c_erofs.uint32 | None = ..., + packed_nid: _c_erofs.uint64 | None = ..., + xattr_filter_reserved: _c_erofs.uint8 | None = ..., + reserved: __cs__.Array[_c_erofs.uint8] | None = ..., + build_time: _c_erofs.uint32 | None = ..., + rootnid_8b: _c_erofs.uint64 | None = ..., + reserved2: __cs__.Array[_c_erofs.uint8] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class erofs_i_format(__cs__.Structure): + inode_version: _c_erofs.uint16 + inode_data_layout: _c_erofs.uint16 + @overload + def __init__( + self, inode_version: _c_erofs.uint16 | None = ..., inode_data_layout: _c_erofs.uint16 | None = ... + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
+ + class erofs_inode(__cs__.Structure): + i_advise: _c_erofs.erofs_i_format + i_xattr_icount: _c_erofs.uint16 + i_mode: _c_erofs.uint16 + i_nlink: _c_erofs.uint16 + i_size: _c_erofs.uint32 + reserved: _c_erofs.uint32 + i_u: _c_erofs.uint32 + i_ino: _c_erofs.uint32 + i_uid: _c_erofs.uint16 + i_gid: _c_erofs.uint16 + i_checksum: _c_erofs.uint32 + @overload + def __init__( + self, + i_advise: _c_erofs.erofs_i_format | None = ..., + i_xattr_icount: _c_erofs.uint16 | None = ..., + i_mode: _c_erofs.uint16 | None = ..., + i_nlink: _c_erofs.uint16 | None = ..., + i_size: _c_erofs.uint32 | None = ..., + reserved: _c_erofs.uint32 | None = ..., + i_u: _c_erofs.uint32 | None = ..., + i_ino: _c_erofs.uint32 | None = ..., + i_uid: _c_erofs.uint16 | None = ..., + i_gid: _c_erofs.uint16 | None = ..., + i_checksum: _c_erofs.uint32 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class erofs_inode_extended(__cs__.Structure): + i_advise: _c_erofs.erofs_i_format + i_xattr_icount: _c_erofs.uint16 + i_mode: _c_erofs.uint16 + i_reserved: _c_erofs.uint16 + i_size: _c_erofs.uint64 + i_u: _c_erofs.uint32 + i_ino: _c_erofs.uint32 + i_uid: _c_erofs.uint32 + i_gid: _c_erofs.uint32 + i_mtime: _c_erofs.uint64 + i_mtime_nsec: _c_erofs.uint32 + i_nlink: _c_erofs.uint32 + i_reserved2: __cs__.CharArray + @overload + def __init__( + self, + i_advise: _c_erofs.erofs_i_format | None = ..., + i_xattr_icount: _c_erofs.uint16 | None = ..., + i_mode: _c_erofs.uint16 | None = ..., + i_reserved: _c_erofs.uint16 | None = ..., + i_size: _c_erofs.uint64 | None = ..., + i_u: _c_erofs.uint32 | None = ..., + i_ino: _c_erofs.uint32 | None = ..., + i_uid: _c_erofs.uint32 | None = ..., + i_gid: _c_erofs.uint32 | None = ..., + i_mtime: _c_erofs.uint64 | None = ..., + i_mtime_nsec: _c_erofs.uint32 | None = ..., + i_nlink: _c_erofs.uint32 | None = ..., + i_reserved2: __cs__.CharArray | None = ..., + ): ... 
+ @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class erofs_dir_entry(__cs__.Structure): + inode: _c_erofs.uint64 + name_offset: _c_erofs.int16 + file_type: _c_erofs.uint8 + reserved: _c_erofs.uint8 + @overload + def __init__( + self, + inode: _c_erofs.uint64 | None = ..., + name_offset: _c_erofs.int16 | None = ..., + file_type: _c_erofs.uint8 | None = ..., + reserved: _c_erofs.uint8 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class EROFS_DATA_MAPPING(__cs__.Enum): + EROFS_INODE_FLAT_PLAIN = ... + EROFS_INODE_COMPRESSED_FULL = ... + EROFS_INODE_FLAT_INLINE = ... + EROFS_INODE_COMPRESSED_COMPACT = ... + EROFS_INODE_CHUNK_BASED = ... + EROFS_INODE_DATALAYOUT_MAX = ... + + class EROFS_COMPRESSION_ALGORITHM(__cs__.Enum): + Z_EROFS_COMPRESSION_LZ4 = ... + Z_EROFS_COMPRESSION_LZMA = ... + Z_EROFS_COMPRESSION_DEFLATE = ... + Z_EROFS_COMPRESSION_ZSTD = ... + Z_EROFS_COMPRESSION_MAX = ... + + class Z_EROFS_LCLUSTER_TYPE(__cs__.Enum): + Z_EROFS_LCLUSTER_TYPE_PLAIN = ... + Z_EROFS_LCLUSTER_TYPE_HEAD1 = ... + Z_EROFS_LCLUSTER_TYPE_NONHEAD = ... + Z_EROFS_LCLUSTER_TYPE_HEAD2 = ... + Z_EROFS_LCLUSTER_TYPE_MAX = ... + + class erofs_xattr_ibody_header(__cs__.Structure): + h_name_filter: _c_erofs.uint32 + h_shared_count: _c_erofs.uint8 + h_reserved2: __cs__.Array[_c_erofs.uint8] + h_shared_xattrs: __cs__.Array[_c_erofs.uint32] + @overload + def __init__( + self, + h_name_filter: _c_erofs.uint32 | None = ..., + h_shared_count: _c_erofs.uint8 | None = ..., + h_reserved2: __cs__.Array[_c_erofs.uint8] | None = ..., + h_shared_xattrs: __cs__.Array[_c_erofs.uint32] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
+ + class erofs_xattr_entry(__cs__.Structure): + e_name_len: _c_erofs.uint8 + e_name_index: _c_erofs.uint8 + e_value_size: _c_erofs.uint16 + e_name: __cs__.CharArray + e_value: __cs__.CharArray + @overload + def __init__( + self, + e_name_len: _c_erofs.uint8 | None = ..., + e_name_index: _c_erofs.uint8 | None = ..., + e_value_size: _c_erofs.uint16 | None = ..., + e_name: __cs__.CharArray | None = ..., + e_value: __cs__.CharArray | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class z_erofs_map_header(__cs__.Structure): + h_reserved1: _c_erofs.uint32 + h_advise: _c_erofs.uint16 + h_algorithmtype_head_1: _c_erofs.uint8 + h_algorithmtype_head_2: _c_erofs.uint8 + h_clusterbits_lclusterbits: _c_erofs.uint8 + h_pclusterbits_head1: _c_erofs.uint8 + h_pclusterbits_head2: _c_erofs.uint8 + @overload + def __init__( + self, + h_reserved1: _c_erofs.uint32 | None = ..., + h_advise: _c_erofs.uint16 | None = ..., + h_algorithmtype_head_1: _c_erofs.uint8 | None = ..., + h_algorithmtype_head_2: _c_erofs.uint8 | None = ..., + h_clusterbits_lclusterbits: _c_erofs.uint8 | None = ..., + h_pclusterbits_head1: _c_erofs.uint8 | None = ..., + h_pclusterbits_head2: _c_erofs.uint8 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class z_erofs_vle_decompressed_index(__cs__.Structure): + di_advise: _c_erofs.uint16 + di_clusterofs: _c_erofs.uint16 + class __anonymous_0__(__cs__.Union): + blkaddr: _c_erofs.uint32 + delta: __cs__.Array[_c_erofs.uint16] + @overload + def __init__( + self, blkaddr: _c_erofs.uint32 | None = ..., delta: __cs__.Array[_c_erofs.uint16] | None = ... + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + di_u: __anonymous_0__ + @overload + def __init__( + self, + di_advise: _c_erofs.uint16 | None = ..., + di_clusterofs: _c_erofs.uint16 | None = ..., + di_u: __anonymous_0__ | None = ..., + ): ... 
class EROFS:
    """Parser for an Enhanced Read-Only File System (EROFS) image.

    Args:
        fh: A file-like object of the volume containing the filesystem.

    Raises:
        ValueError: If the EROFS magic is not present at the superblock offset.
        NotImplementedError: If the filesystem does not use 4K blocks.

    References:
        - https://www.kernel.org/doc/Documentation/filesystems/erofs.txt
        - https://www.usenix.org/system/files/atc19-gao.pdf
        - https://erofs.docs.kernel.org/en/latest/core_ondisk.html

    References specific to compacted clusters:
        - https://android.googlesource.com/kernel/common/+/ed9be64eefe2/fs/erofs/zmap.c
        - https://review.lineageos.org/c/LineageOS/android_kernel_xiaomi_surya/+/390644
    """

    def __init__(self, fh: BinaryIO):
        if not self.detect_erofs(fh):
            raise ValueError("Not a valid EROFS filesystem")

        self.fh = fh
        fh.seek(c_erofs.EROFS_SUPER_OFFSET)
        self.super_block = c_erofs.erofs_super_block(fh)

        block_bits = self.super_block.blkszbits
        self.block_size = 1 << block_bits
        if block_bits != EROFS_LOGICAL_CLUSTER_BITS:
            raise NotImplementedError("Only 4K block size is currently supported")

        # Wrap the bound methods so every filesystem instance gets its own bounded cache.
        self._decompress_block = lru_cache(1024)(self._decompress_block)
        self._shared_xattr = lru_cache(1024)(self._shared_xattr)
        self.root = INode(self, self.super_block.rootnid_2b)

    @staticmethod
    def detect_erofs(fh: BinaryIO) -> bool:
        """Return whether the given file-like object contains an EROFS filesystem.

        The stream is left positioned just after the four magic bytes.
        """
        fh.seek(c_erofs.EROFS_SUPER_OFFSET)
        found = fh.read(4)
        expected = c_erofs.EROFS_SUPER_MAGIC_V1.to_bytes(4, "little")
        return found == expected

    def get(self, path: str, node: INode | None = None) -> INode:
        """Return the inode for ``path``, resolved from ``node`` (the root inode when omitted).

        Empty path components (leading, trailing or doubled slashes) are ignored.

        Raises:
            FileNotFoundError: If a path component does not exist in its parent directory.
        """
        current = node or self.root
        for component in filter(None, path.split("/")):
            found = next((child for child in current.iterdir() if child.name == component), None)
            if found is None:
                raise FileNotFoundError(f"File not found: {path}")
            current = found

        return current

    def _shared_xattr(self, xattr_id: int) -> tuple[str, bytes]:
        """Return the ``(name, value)`` pair of a shared xattr by its ID.

        Shared xattrs are addressed in 4-byte units from the start of the xattr block.
        """
        position = self.super_block.xattr_blkaddr * self.block_size + 4 * xattr_id
        self.fh.seek(position)
        entry = c_erofs.erofs_xattr_entry(self.fh)
        prefix = XATTR_NAME_MAP.get(entry.e_name_index, "unknown")
        return f"{prefix}{entry.e_name.decode()}", entry.e_value

    def _decompress_block(
        self,
        block_address: int,
        num_blocks: int,
        algorithm: c_erofs.EROFS_COMPRESSION_ALGORITHM,
        legacy_lz4: bool = False,
    ) -> bytes:
        """Read ``num_blocks`` blocks starting at ``block_address`` and decompress them.

        Raises:
            NotImplementedError: If ``algorithm`` is anything other than LZ4.
        """
        self.fh.seek(block_address * self.block_size)
        compressed = self.fh.read(self.block_size * num_blocks)

        if algorithm != c_erofs.EROFS_COMPRESSION_ALGORITHM.Z_EROFS_COMPRESSION_LZ4:
            raise NotImplementedError(f"No decompression support for algorithm type {algorithm.name}")

        # With the 0PADDING feature the compressed data is preceded by zero bytes inside its block(s).
        if self.super_block.feature_incompat & c_erofs.EROFS_FEATURE_INCOMPAT_LZ4_0PADDING:
            compressed = compressed.lstrip(b"\0")

        # Older versions of erofs-utils seemed to use a lz4_decompress_partial functionality that was never
        # intended (https://github.com/lz4/lz4/issues/783 is mentioned in the readme). A copy of that behaviour
        # (thanks srlabs) is kept for legacy images; it can possibly be replaced by digging deeper into lz4.
        if legacy_lz4:
            return legacy_lz4_decompress(compressed)
        return lz4.decompress(compressed)
class INode:
    """A single EROFS inode, parsed lazily from the metadata area.

    Args:
        erofs: The filesystem this inode belongs to.
        inode_number: The inode number, used to locate the on-disk inode.
        name: The directory entry name through which this inode was found, if any.
    """

    def __init__(self, erofs: EROFS, inode_number: int, name: str | None = None):
        self.erofs = erofs
        self.inode_number = inode_number
        self.name = name

        # Inode numbers index the metadata area in units of the compact inode size.
        self._offset = (
            self.erofs.super_block.meta_blkaddr * self.erofs.block_size + COMPACT_INODE_SIZE * self.inode_number
        )

    def __repr__(self) -> str:
        return f"<INode inode_number={self.inode_number} name={self.name!r}>"

    @cached_property
    def _xattr_offset(self) -> int:
        """Return the offset of the xattr data for this inode, i.e. the first byte after the on-disk inode.

        Raises:
            ValueError: If the inode uses an unknown layout version.
        """
        self.erofs.fh.seek(self._offset)
        i_format = c_erofs.erofs_i_format(self.erofs.fh)
        if i_format.inode_version == c_erofs.EROFS_INODE_LAYOUT_V1:
            inode_size = COMPACT_INODE_SIZE
        elif i_format.inode_version == c_erofs.EROFS_INODE_LAYOUT_V2:
            inode_size = EXTENDED_INODE_SIZE
        else:
            # Previously fell through and failed with an UnboundLocalError on ``inode_size``.
            raise ValueError(f"Unsupported Inode layout {i_format.inode_version}")
        return self._offset + inode_size

    @cached_property
    def inode(self) -> c_erofs.erofs_inode:
        """Return the raw inode structure (compact or extended, depending on the layout version).

        Raises:
            ValueError: If the inode uses an unknown layout version.
        """
        self.erofs.fh.seek(self._offset)
        inode = c_erofs.erofs_inode(self.erofs.fh)
        if inode.i_advise.inode_version != c_erofs.EROFS_INODE_LAYOUT_V1:
            if inode.i_advise.inode_version == c_erofs.EROFS_INODE_LAYOUT_V2:
                # Re-parse from the same offset using the larger extended structure.
                self.erofs.fh.seek(self._offset)
                inode = c_erofs.erofs_inode_extended(self.erofs.fh)
            else:
                raise ValueError(f"Unsupported Inode layout {inode.i_advise.inode_version}")

        return inode

    @property
    def gid(self) -> int:
        """Return the Group-ID of this inode."""
        return self.inode.i_gid

    @property
    def uid(self) -> int:
        """Return the User-ID of this inode."""
        return self.inode.i_uid

    @property
    def mode(self) -> int:
        """Return the mode of this inode."""
        return self.inode.i_mode

    @property
    def size(self) -> int:
        """Return the size of this inode in bytes."""
        return self.inode.i_size

    @property
    def nlink(self) -> int:
        """Return the number of hard links to this inode."""
        return self.inode.i_nlink

    @cached_property
    def xattr(self) -> dict[str, bytes]:
        """Return the xattr key-value pairs for this inode (shared xattrs first, then inline ones)."""
        xattr = {}
        if self.xattr_size == 0:
            # No xattr area at all: whatever follows the inode is file data or a map header, not an xattr header.
            return xattr

        self.erofs.fh.seek(self._xattr_offset)

        x_attr_header = c_erofs.erofs_xattr_ibody_header(self.erofs.fh)

        for xattr_id in x_attr_header.h_shared_xattrs:
            key, value = self.erofs._shared_xattr(xattr_id)
            xattr[key] = value
        bytes_read = len(x_attr_header)

        if bytes_read < self.xattr_size:
            # Due to the shared_xattr lookups we might have seeked away from our inodes' xattr_offset
            self.erofs.fh.seek(self._xattr_offset + len(x_attr_header))

        while bytes_read < self.xattr_size:
            xattr_entry = c_erofs.erofs_xattr_entry(self.erofs.fh)

            prefix = XATTR_NAME_MAP.get(xattr_entry.e_name_index, "unknown")
            key, value = f"{prefix}{xattr_entry.e_name.decode()}", xattr_entry.e_value
            xattr[key] = value

            bytes_read += len(xattr_entry)

            # Align on 4 byte boundary
            misalignment = (-self.erofs.fh.tell()) % 4
            self.erofs.fh.seek(misalignment, SEEK_CUR)
            bytes_read += misalignment
        return xattr

    @cached_property
    def xattr_size(self) -> int:
        """Return the size of the xattr data for this inode (0 when the inode has no xattr area)."""
        return 0 if self.inode.i_xattr_icount == 0 else 12 + (self.inode.i_xattr_icount - 1) * 4

    def is_dir(self) -> bool:
        """Return whether this inode is a directory."""
        return S_IFMT(self.inode.i_mode) == S_IFDIR

    def is_symlink(self) -> bool:
        """Return whether this inode is a symbolic link."""
        return S_IFMT(self.inode.i_mode) == S_IFLNK

    def is_file(self) -> bool:
        """Return whether this inode is a regular file."""
        return S_IFMT(self.inode.i_mode) == S_IFREG

    @cached_property
    def link(self) -> str:
        """Return the target of this symlink.

        Raises:
            NotASymlinkError: If this inode is not a symbolic link.
        """
        if not self.is_symlink():
            raise NotASymlinkError
        return self.open().read().decode("utf-8")

    def listdir(self) -> dict[str, INode]:
        """Return a directory listing as a mapping of entry name to inode."""
        return {inode.name: inode for inode in self.iterdir()}

    dirlist = listdir

    def iterdir(self) -> Iterator[INode]:
        """Iterate directory contents.

        A symlink is followed and the contents of its target are yielded instead.

        Raises:
            NotADirectoryError: If this inode is neither a directory nor a symlink.
        """
        if self.is_symlink():
            # NOTE(review): the target is resolved from the filesystem root, also for relative links.
            yield from self.erofs.get(self.link).iterdir()
            # Without this return the generator went on to raise NotADirectoryError for the symlink itself.
            return

        if not self.is_dir():
            raise NotADirectoryError

        dir_data = self.open()
        block_size = self.erofs.block_size
        block = 0
        while True:
            block_start = block * block_size
            # Names of a directory block never extend past that block or past the end of the directory data.
            block_end = min(block_start + block_size, self.size)
            dir_data.seek(block_start)

            # Each block starts with its entry table, followed by the names in the same order.
            first_dir_entry = c_erofs.erofs_dir_entry(dir_data)

            # The first name starts right after the entry table, so its offset gives the number of entries.
            num_dir_entries = first_dir_entry.name_offset // DIR_ENTRY_SIZE

            dir_entries = [first_dir_entry]
            dir_entries.extend(c_erofs.erofs_dir_entry(dir_data) for _ in range(1, num_dir_entries))

            for i in range(num_dir_entries):
                is_last = i + 1 == num_dir_entries
                if is_last:
                    # Read until the end of this block only, not the end of the stream.
                    read_length = block_end - (block_start + dir_entries[i].name_offset)
                else:
                    # Read until offset of the next one
                    read_length = dir_entries[i + 1].name_offset - dir_entries[i].name_offset

                name = dir_data.read(read_length)

                # The last filename entry can contain trailing null bytes
                if is_last and b"\0" in name:
                    name = name[: name.find(b"\0")]

                yield INode(self.erofs, dir_entries[i].inode, name.decode("utf-8"))

            block += 1
            if block * block_size >= self.size:
                break

    def open(self) -> AlignedStream:
        """Return a file-like object for reading the data of this inode.

        Raises:
            NotImplementedError: If the inode uses an unsupported data layout (e.g. chunk based).
        """
        if self.inode.i_advise.inode_data_layout == c_erofs.EROFS_DATA_MAPPING.EROFS_INODE_FLAT_INLINE:
            # Inline data directly follows the inode and its xattrs.
            offset = self._xattr_offset + self.xattr_size
            max_data_size_inode = self.erofs.block_size - (offset % self.erofs.block_size)
            if self.size <= max_data_size_inode:
                # All data fits within the inode
                return RangeStream(self.erofs.fh, offset, self.size)

            # Leading full blocks live at the block address, the remaining tail is stored inline.
            blocks_needed = math.ceil((self.size - max_data_size_inode) / self.erofs.block_size)
            head_data = RangeStream(
                self.erofs.fh, self.inode.i_u * self.erofs.block_size, blocks_needed * self.erofs.block_size
            )
            tail_data = RangeStream(self.erofs.fh, offset, self.size - head_data.size)
            out = MappingStream()
            out.add(0, head_data.size, head_data)
            out.add(head_data.size, tail_data.size, tail_data)
            return out
        if self.inode.i_advise.inode_data_layout == c_erofs.EROFS_DATA_MAPPING.EROFS_INODE_FLAT_PLAIN:
            return RangeStream(self.erofs.fh, self.inode.i_u * self.erofs.block_size, self.inode.i_size)
        if self.inode.i_advise.inode_data_layout == c_erofs.EROFS_DATA_MAPPING.EROFS_INODE_COMPRESSED_FULL:
            return DecompressionStream(self, compacted_clusters=False)
        if self.inode.i_advise.inode_data_layout == c_erofs.EROFS_DATA_MAPPING.EROFS_INODE_COMPRESSED_COMPACT:
            return DecompressionStream(self, compacted_clusters=True)
        # TODO Unsupported: Chunk Based
        raise NotImplementedError(f"Unsupported inode data layout {self.inode.i_advise.inode_data_layout}")
@dataclass
class LogicalCluster:
    """A logical cluster helps mapping a logical address to a physical one. For HEAD / PLAIN clusters, you combine a
    block address with a cluster offset, which determines how many bytes you still need from the previous physical
    block. In this dataclass we abstract away from differences between legacy and compacted clusters.
    """

    # Cluster type (HEAD1, PLAIN or NONHEAD).
    type: c_erofs.Z_EROFS_LCLUSTER_TYPE
    # Offset within the logical cluster at which the data of this extent starts.
    di_clusterofs: int
    # Physical block address of the extent (HEAD / PLAIN clusters).
    di_u_blkaddr: int
    # Delta values of NONHEAD clusters; only delta[0] is used by this module.
    di_u_delta: list[int]


class DecompressionStream(AlignedStream):
    """A stream that decompresses from a compressed EROFS inode. Supports both legacy and compacted clusters.

    Args:
        inode: The compressed inode to read from.
        compacted_clusters: Whether the inode uses the compacted (bit-packed) cluster index instead of the legacy one.
    """

    def __init__(self, inode: INode, compacted_clusters: bool):
        self.inode = inode
        self.compacted_clusters = compacted_clusters
        # Per-stream bounded caches: the same index entries are decoded over and over while reading.
        self._load_lcluster = lru_cache(1024)(self._load_lcluster)
        self._decode_compacted_bits = lru_cache(1024)(self._decode_compacted_bits)
        super().__init__(inode.size, inode.erofs.block_size)

    @cached_property
    def _compression_map_offset(self) -> int:
        """Offset of the compression map header: the end of the inode's xattr area, rounded up to 8 bytes."""
        offset = self.inode._xattr_offset + self.inode.xattr_size
        return ((offset + 7) // 8) * 8

    @cached_property
    def compression_map(self) -> c_erofs.z_erofs_map_header:
        """The compression map records the compression algorithm used and other metadata about the compressed inode."""
        self.inode.erofs.fh.seek(self._compression_map_offset)
        return c_erofs.z_erofs_map_header(self.inode.erofs.fh)

    @cached_property
    def compression_algorithm(self) -> c_erofs.EROFS_COMPRESSION_ALGORITHM:
        """The compression algorithm recorded in the first algorithm field of the compression map."""
        return c_erofs.EROFS_COMPRESSION_ALGORITHM(self.compression_map.h_algorithmtype_head_1)

    def _load_legacy_lcluster(self, logical_cluster_index: int) -> LogicalCluster:
        """A legacy logical cluster is straightforward: it follows after the compression map and padding."""
        offset = self._compression_map_offset
        offset += COMPRESSION_MAP_SIZE + c_erofs.Z_EROFS_VLE_LEGACY_HEADER_PADDING
        offset += len(c_erofs.z_erofs_vle_decompressed_index) * logical_cluster_index
        self.inode.erofs.fh.seek(offset)

        lcluster = c_erofs.z_erofs_vle_decompressed_index(self.inode.erofs.fh)

        # The cluster type is a small bit field inside di_advise.
        type_int = (lcluster.di_advise >> c_erofs.Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & (
            (1 << c_erofs.Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1
        )
        lcluster_type = c_erofs.Z_EROFS_LCLUSTER_TYPE(type_int)

        return LogicalCluster(lcluster_type, lcluster.di_clusterofs, lcluster.di_u.blkaddr, lcluster.di_u.delta)

    def _decode_compacted_bits(self, offset: int, pos: int) -> tuple[int, c_erofs.Z_EROFS_LCLUSTER_TYPE]:
        """Several logical clusters are recorded at the same offset. This function decodes the bits for one.

        Args:
            offset: Byte offset of the pack of compacted clusters.
            pos: Bit position of the wanted cluster within that pack.

        Returns:
            A tuple of the 12-bit ``lo`` value and the cluster type.
        """
        self.inode.erofs.fh.seek(offset + pos // 8)
        v = int.from_bytes(self.inode.erofs.fh.read(4), "little") >> (pos & 7)
        lo = v & 4095  # Lomask is always 4095 for 4K clusters
        lcluster_type = c_erofs.Z_EROFS_LCLUSTER_TYPE((v >> EROFS_LOGICAL_CLUSTER_BITS) & 3)
        return lo, lcluster_type

    def _load_compacted_lcluster(self, logical_cluster_index: int) -> LogicalCluster:
        """Loads a compacted logical cluster, which requires some decoding of the bit-packed format. It's a very rough
        copy of the erofs-utils implementation and can probably be optimized further.

        Raises:
            NotImplementedError: For non-4K logical clusters, or HEAD clusters without the big pcluster advise.
            ValueError: If the index contains an impossible NONHEAD delta.
        """
        if self.compression_map.h_clusterbits_lclusterbits != 0:
            raise NotImplementedError("Only 4K clusters are supported for compacted clusters")

        offset = self._compression_map_offset + COMPRESSION_MAP_SIZE

        # The index starts with 4-byte entries until a 32-byte boundary, optionally followed by packs of sixteen
        # 2-byte entries, and ends with 4-byte entries again.
        compacted_4b_initial = (32 - offset % 32) // 4
        if compacted_4b_initial == 32 // 4:
            compacted_4b_initial = 0

        # Without the COMPACTED_2B advise there is no 2-byte region at all. This used to be left unassigned,
        # which raised an UnboundLocalError below for such inodes.
        compacted_2b = 0
        if self.compression_map.h_advise & c_erofs.Z_EROFS_ADVISE_COMPACTED_2B:
            total_blocks = math.ceil(self.inode.inode.i_size / self.align)
            if compacted_4b_initial < total_blocks:
                compacted_2b = 16 * ((total_blocks - compacted_4b_initial) // 16)

        vcnt = 2
        amortized_shift = 2
        if logical_cluster_index >= compacted_4b_initial:
            offset += compacted_4b_initial * 4
            logical_cluster_index -= compacted_4b_initial

            if logical_cluster_index < compacted_2b:
                amortized_shift = 1
                vcnt = 16
            else:
                offset += compacted_2b * 2
                logical_cluster_index -= compacted_2b

        vcnt_shifted = vcnt << amortized_shift
        # Every pack ends with a 4-byte block address; the remaining bits are divided over its vcnt clusters.
        encodebits = (vcnt_shifted - 4) * 8 // vcnt

        offset += logical_cluster_index * (2 * amortized_shift)
        offset_in_block = offset % self.align
        cluster_base = (offset_in_block // vcnt_shifted) * vcnt_shifted
        cluster_pos = (offset // self.align) * self.align
        cluster_pos += cluster_base
        i = (offset_in_block - cluster_base) >> amortized_shift

        lo, cluster_type = self._decode_compacted_bits(cluster_pos, i * encodebits)

        if cluster_type == c_erofs.Z_EROFS_LCLUSTER_TYPE.Z_EROFS_LCLUSTER_TYPE_NONHEAD:
            if lo & c_erofs.Z_EROFS_LI_D0_CBLKCNT:
                # First non-head cluster of a compressed extent
                compressed_blocks = lo & ~c_erofs.Z_EROFS_LI_D0_CBLKCNT
                return LogicalCluster(cluster_type, lo, 0, [compressed_blocks])

            if i + 1 == vcnt:
                # The last cluster of a pack does not hold delta[0] itself, derive it from the previous cluster.
                lo, previous_lcluster_type = self._decode_compacted_bits(cluster_pos, (i - 1) * encodebits)
                if previous_lcluster_type != c_erofs.Z_EROFS_LCLUSTER_TYPE.Z_EROFS_LCLUSTER_TYPE_NONHEAD:
                    lo = 0
                lo += 1

            # Non-head cluster, the only thing we care about is delta 0
            return LogicalCluster(cluster_type, 0, 0, [lo])

        big_pcluster = self.compression_map.h_advise & c_erofs.Z_EROFS_ADVISE_BIG_PCLUSTER_1
        if not big_pcluster:
            raise NotImplementedError

        # Count the physical blocks used by the earlier extents in this pack; the block address stored at the end
        # of the pack is relative to the first of them.
        nblk = 0
        while i > 0:
            i -= 1
            prev_cluster_lo, prev_cluster_type = self._decode_compacted_bits(cluster_pos, i * encodebits)
            if prev_cluster_type == c_erofs.Z_EROFS_LCLUSTER_TYPE.Z_EROFS_LCLUSTER_TYPE_NONHEAD:
                if prev_cluster_lo & c_erofs.Z_EROFS_LI_D0_CBLKCNT:
                    i -= 1
                    nblk += prev_cluster_lo & ~c_erofs.Z_EROFS_LI_D0_CBLKCNT
                else:
                    if prev_cluster_lo <= 1:
                        # A delta this small would move the scan forward again and never terminate.
                        raise ValueError("Corrupt compacted cluster index")
                    i -= prev_cluster_lo - 2
                continue
            nblk += 1

        self.inode.erofs.fh.seek(cluster_pos + (vcnt_shifted - 4))
        physical_block = int.from_bytes(self.inode.erofs.fh.read(4), "little") + nblk
        return LogicalCluster(cluster_type, lo, physical_block, [])

    def _load_lcluster(self, logical_cluster_index: int) -> LogicalCluster:
        """Loads a logical cluster, selecting the right format."""
        return (
            self._load_compacted_lcluster(logical_cluster_index)
            if self.compacted_clusters
            else self._load_legacy_lcluster(logical_cluster_index)
        )

    def _previous_block_remaining_bytes(self, current_block_idx: int) -> bytes:
        """Given a block index, we want to start reading from the previous block's remaining bytes.

        Walks backwards to the nearest HEAD1 or PLAIN cluster and returns the part of its data that belongs to
        ``current_block_idx`` and later.

        Raises:
            ValueError: If no HEAD1 or PLAIN cluster precedes ``current_block_idx``.
        """
        needle_idx = current_block_idx - 1
        nonhead_distance_covered = 0

        while needle_idx >= 0:
            previous_lcluster = self._load_lcluster(needle_idx)
            if previous_lcluster.type == c_erofs.Z_EROFS_LCLUSTER_TYPE.Z_EROFS_LCLUSTER_TYPE_HEAD1:
                num_blocks = self._get_extent_compressed_length(needle_idx)
                physical_block = self.inode.erofs._decompress_block(
                    previous_lcluster.di_u_blkaddr,
                    num_blocks,
                    self.compression_algorithm,
                    legacy_lz4=not self.compacted_clusters,
                )
                # Skip what the extent contributed to its own logical block and to the NONHEAD blocks in between.
                reserved_for_this_logical_block = self.align - previous_lcluster.di_clusterofs
                return physical_block[reserved_for_this_logical_block + nonhead_distance_covered :]

            if previous_lcluster.type == c_erofs.Z_EROFS_LCLUSTER_TYPE.Z_EROFS_LCLUSTER_TYPE_PLAIN:
                self.inode.erofs.fh.seek(self.align * previous_lcluster.di_u_blkaddr)
                physical_block = self.inode.erofs.fh.read(self.align)
                reserved_for_this_logical_block = self.align - previous_lcluster.di_clusterofs
                return physical_block[reserved_for_this_logical_block + nonhead_distance_covered :]

            # NONHEAD cluster: a full logical block is covered by the extent we are looking for.
            nonhead_distance_covered += self.align
            needle_idx -= 1

        raise ValueError("No previous data block found")

    def _get_extent_compressed_length(self, logical_cluster_index: int) -> int:
        """Determine how many compressed blocks are used by checking the delta[0] of the next logical cluster, if the
        next cluster is a NONHEAD cluster."""
        total_clusters = (self.inode.size + self.align - 1) // self.align
        if logical_cluster_index + 1 >= total_clusters:
            # Last block. Comparing against ``size // align`` missed this case for sizes that are an exact
            # multiple of the block size and went on to load an index entry past the end of the table.
            return 1
        next_lcluster = self._load_lcluster(logical_cluster_index + 1)
        if next_lcluster.type != c_erofs.Z_EROFS_LCLUSTER_TYPE.Z_EROFS_LCLUSTER_TYPE_NONHEAD:
            # The next block is not NONHEAD and will have its own block address, so this must be a single block extent
            return 1

        return next_lcluster.di_u_delta[0]

    def _read(self, offset: int, length: int) -> bytes:
        """Read and decompress data from the compressed inode for a given offset and length.

        NOTE(review): ``offset`` and ``length`` are presumably multiples of the block size, as only whole logical
        blocks are produced here -- confirm against the AlignedStream contract.
        """
        block_start = offset // self.align
        block_end = (offset + length) // self.align
        blocks_out = []
        for block_idx in range(block_start, block_end):
            lcluster = self._load_lcluster(block_idx)

            # The final logical block of the file may be shorter than a full block.
            wanted_bytes = min(self.align, self.inode.size - (self.align * block_idx))
            block_out = b""
            if lcluster.type == c_erofs.Z_EROFS_LCLUSTER_TYPE.Z_EROFS_LCLUSTER_TYPE_HEAD1:
                if lcluster.di_clusterofs == 0:
                    # This HEAD cluster is fully contained in one block
                    num_blocks = self._get_extent_compressed_length(block_idx)
                    block_out += self.inode.erofs._decompress_block(
                        lcluster.di_u_blkaddr,
                        num_blocks,
                        self.compression_algorithm,
                        legacy_lz4=not self.compacted_clusters,
                    )[: self.align]
                else:
                    # This HEAD cluster is split between two blocks
                    block_out += self._previous_block_remaining_bytes(block_idx)[:wanted_bytes]
                    if len(block_out) < wanted_bytes:
                        num_blocks = self._get_extent_compressed_length(block_idx)
                        new_block = self.inode.erofs._decompress_block(
                            lcluster.di_u_blkaddr,
                            num_blocks,
                            self.compression_algorithm,
                            legacy_lz4=not self.compacted_clusters,
                        )
                        block_out += new_block[: self.align - lcluster.di_clusterofs]

            elif lcluster.type == c_erofs.Z_EROFS_LCLUSTER_TYPE.Z_EROFS_LCLUSTER_TYPE_NONHEAD:
                # The whole logical block is a continuation of an earlier extent.
                block_out += self._previous_block_remaining_bytes(block_idx)[:wanted_bytes]

            elif lcluster.type == c_erofs.Z_EROFS_LCLUSTER_TYPE.Z_EROFS_LCLUSTER_TYPE_PLAIN:
                if lcluster.di_clusterofs != 0:
                    # PLAIN cluster starts in the previous block
                    block_out += self._previous_block_remaining_bytes(block_idx)[:wanted_bytes]
                if len(block_out) < wanted_bytes:
                    self.inode.erofs.fh.seek(self.align * lcluster.di_u_blkaddr)
                    new_block = self.inode.erofs.fh.read(self.align)
                    block_out += new_block[: self.align - lcluster.di_clusterofs]
            else:
                raise NotImplementedError(f"Not supported cluster type {lcluster.type}")

            blocks_out.append(block_out)

        return b"".join(blocks_out)
def _read_lz4_length(buf: bytes, pos: int, length: int) -> tuple[int, int]:
    """Extend a 4-bit LZ4 length with its extension bytes; stops quietly when the input ends."""
    if length == 0xF:
        while pos < len(buf):
            length_byte = buf[pos]
            pos += 1
            length += length_byte
            if length_byte != 0xFF:
                break
    return length, pos


# Based on https://github.com/srlabs/extractor/blob/main/erofs_tool.py
def legacy_lz4_decompress(buf: bytes) -> bytes:
    """Decompress a raw LZ4 block, tolerating input that ends in the middle of a sequence.

    See https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md for the block format.

    Args:
        buf: Compressed buffer, raw LZ4 without framing or length header.

    Returns:
        The decompressed bytes. When the input is truncated, everything decoded up to that point is returned.
    """
    size = len(buf)
    # A bytearray lets matches be copied from the output in place, instead of snapshotting the complete output
    # for every copy step.
    out = bytearray()
    pos = 0
    while pos < size:
        token_byte = buf[pos]
        pos += 1

        # Literals: length in the high nibble of the token, followed by the literal bytes themselves
        literal_length, pos = _read_lz4_length(buf, pos, token_byte >> 4)
        out += buf[pos : pos + literal_length]
        pos += literal_length

        # The final sequence consists of literals only. Also stop when fewer than two bytes remain for the match
        # offset, or when a truncated literal run moved us past the end of the input.
        if pos >= size - 1:
            break

        offset = buf[pos] + 256 * buf[pos + 1]
        pos += 2
        if offset == 0:
            # Not a usable back-reference (presumably zero padding): skip the match part of this sequence
            continue

        # Match: length in the low nibble of the token, with an implicit minimum of 4
        matchlength, pos = _read_lz4_length(buf, pos, token_byte & 0x0F)
        matchlength += 4
        match_pos = len(out) - offset
        while matchlength > 0:
            # A match may overlap the bytes it produces, so copy at most what is available on every step
            copylen = min(matchlength, len(out) - match_pos)
            out += out[match_pos : match_pos + copylen]
            matchlength -= copylen
    return bytes(out)
+readme = "README.md" +requires-python = ">=3.10.0" +license.text = "Apache License 2.0" +authors = [ + {name = "JSCU-NL", email = "121175071+JSCU-CNI@users.noreply.github.com"}, + {name = "Dissect Team", email = "dissect@fox-it.com"} +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "License :: OSI Approved", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Topic :: Internet :: Log Analysis", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Security", + "Topic :: Utilities", +] +dependencies = [ + "dissect.cstruct>=4,<5.0", + "dissect.util>=3,<4", +] +dynamic = ["version"] + +[project.urls] +homepage = "https://dissect.tools" +documentation = "https://docs.dissect.tools/en/latest/projects/dissect.erofs" +repository = "https://github.com/fox-it/dissect.erofs" + +[project.optional-dependencies] +dev = [ + "dissect.cstruct>=4.0.dev,<5.0.dev", + "dissect.util>=3.0.dev,<4.0.dev", +] + +[dependency-groups] +test = [ + "pytest", +] +lint = [ + "ruff==0.13.1", + "vermin", + "typing_extensions", +] +build = [ + "build", +] +debug = [ + "ipdb", +] +dev = [ + {include-group = "test"}, + {include-group = "lint"}, + {include-group = "debug"}, +] + +[tool.ruff] +line-length = 120 +required-version = ">=0.11.0" + +[tool.ruff.format] +docstring-code-format = true + +[tool.ruff.lint] +select = [ + "F", + "E", + "W", + "I", + "UP", + "YTT", + "ANN", + "B", + "C4", + "DTZ", + "T10", + "FA", + "ISC", + "G", + "INP", + "PIE", + "PYI", + "PT", + "Q", + "RSE", + "RET", + "SLOT", + "SIM", + "TID", + "TCH", + "PTH", + "PLC", + "TRY", + "FLY", + "PERF", + "FURB", + "RUF", +] +ignore = ["E203", "B904", "UP024", "ANN002", "ANN003", "ANN204", "ANN401", "SIM105", "TRY003"] + +[tool.ruff.lint.per-file-ignores] +"tests/_docs/**" = ["INP001"] +"**/*.pyi" = ["PYI042", "PYI047"] + +[tool.ruff.lint.isort] 
+known-first-party = ["dissect.erofs"] +known-third-party = ["dissect"] + +[tool.setuptools.packages.find] +include = ["dissect.*"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/_data/legacy_compression_new.erofs b/tests/_data/legacy_compression_new.erofs new file mode 100644 index 0000000..757f202 --- /dev/null +++ b/tests/_data/legacy_compression_new.erofs @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dff26ef44100d63415537a8c8dcdd7ff1f1b3db974ba634e8b2a9b717f8ebe7c +size 503808 diff --git a/tests/_data/vendor.erofs b/tests/_data/vendor.erofs new file mode 100644 index 0000000..96e9a5f --- /dev/null +++ b/tests/_data/vendor.erofs @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00b7da4eb7703f3188bdf059611cf6f66e1e75158929ab6adaba8b53b4ea49cc +size 47927296 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..d1ca3f0 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,19 @@ +import gzip +from collections.abc import Iterator +from pathlib import Path + +import pytest + + +def absolute_path(filename: str) -> Path: + return Path(__file__).parent.resolve() / filename + + +@pytest.fixture +def image() -> Iterator[gzip.GzipFile]: + return absolute_path("_data/vendor.erofs").open("rb") + + +@pytest.fixture +def legacy_image() -> Iterator[gzip.GzipFile]: + return absolute_path("_data/legacy_compression_new.erofs").open("rb") diff --git a/tests/test_erofs.py b/tests/test_erofs.py new file mode 100644 index 0000000..1cf2666 --- /dev/null +++ b/tests/test_erofs.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from hashlib import md5 +from typing import BinaryIO + +from dissect.erofs import EROFS + + +def test_erofs(image: BinaryIO) -> None: + assert EROFS.detect_erofs(image) is True + erofs = EROFS(image) + assert sorted(erofs.get("/").listdir().keys()) == [ + ".", + "..", + "apex", + "bin", + "build.prop", + "etc", + 
"lib", + "lib64", + "odm", + "odm_dlkm", + "overlay", + "usr", + "vendor_dlkm", + ] + + # EROFS_INODE_LAYOUT_INLINE + notice = erofs.get("/etc/NOTICE.xml.gz") + assert notice.inode.i_advise.inode_data_layout == 2 + assert notice.size == 138987 + assert md5(notice.open().read()).hexdigest() == "e4ac5d3d1fc34ea5cd9a57dda2c8a9b4" + assert notice.nlink == 1 + assert notice.gid == 0 + assert notice.uid == 0 + assert notice.is_file() + + # Shared xattr + assert erofs.get("/bin").xattr == {"security.selinux": b"u:object_r:vendor_file:s0"} + + # Symlink + zcat = erofs.get("/bin/zcat") + assert zcat.is_symlink() + assert zcat.link == "toybox_vendor" + + # Compressed, and custom xattr + toybox = erofs.get("/bin/toybox_vendor") + assert toybox.inode.i_advise.inode_data_layout == 3 + assert md5(toybox.open().read()).hexdigest() == "a3628f1cd6765c5b0f3f07555bcf6283" + assert toybox.xattr == {"security.selinux": b"u:object_r:vendor_toolbox_exec:s0"} + + # Largest file in the image + neural_networks = erofs.get("/apex/com.android.hardware.neuralnetworks.apex") + assert neural_networks.inode.i_advise.inode_data_layout == 3 + assert neural_networks.size == 13959168 + assert md5(neural_networks.open().read()).hexdigest() == "32821b8a8a677c49acd9d1a911bf4322" + + +def test_legacy_compression(legacy_image: BinaryIO) -> None: + assert EROFS.detect_erofs(legacy_image) is True + erofs = EROFS(legacy_image) + entry = erofs.get("/Apollo_11.html") + assert len(entry.inode) == 64 + assert entry.inode.i_advise.inode_data_layout == 1 + stream = entry.open() + stream.seek(787895) + assert stream.read(44) == b"It's different but it's very pretty out here" + stream.seek(0) + assert md5(stream.read()).hexdigest() == "be0107f993e925f1c3efb1048653d766" + + # Plain layout + entry = erofs.get("/certinstaller.vdex") + assert entry.inode.i_advise.inode_data_layout == 0 + stream = entry.open() + assert md5(stream.read()).hexdigest() == "9b893617181edb745b4c9537b694078b" diff --git a/tox.ini b/tox.ini new 
file mode 100644 index 0000000..5af4811 --- /dev/null +++ b/tox.ini @@ -0,0 +1,63 @@ +[tox] +envlist = lint, py3, pypy3 +# This version of tox will autoprovision itself and the requirements defined in +# requires if they are not available on the host system. This requires the +# locally installed tox to have a minimum version 3.3.0. This means the names +# of the configuration options are still according to the tox 3.x syntax. +minversion = 4.4.3 +# This version of virtualenv will install setuptools version 65.5.0 and pip +# 22.3. These versions fully support python projects defined only through a +# pyproject.toml file (PEP-517/PEP-518/PEP-621) +requires = virtualenv>=20.16.6 + +[testenv] +extras = dev +deps = + pytest-cov + coverage +dependency_groups = test +commands = + pytest --basetemp="{envtmpdir}" {posargs:--color=yes --cov=dissect --cov-report=term-missing -v tests} + coverage report + coverage xml + +[testenv:build] +package = skip +dependency_groups = build +commands = + pyproject-build + +[testenv:fix] +package = skip +dependency_groups = lint +commands = + ruff check --fix dissect tests + ruff format dissect tests + +[testenv:lint] +package = skip +dependency_groups = lint +commands = + ruff check dissect tests + ruff format --check dissect tests + vermin -t=3.10- --no-tips --lint dissect tests + +[testenv:docs-build] +allowlist_externals = make +deps = + sphinx + sphinx-autoapi + sphinx_argparse_cli + sphinx-copybutton + sphinx-design + furo +commands = + make -C tests/_docs clean + make -C tests/_docs html + +[testenv:docs-linkcheck] +allowlist_externals = make +deps = {[testenv:docs-build]deps} +commands = + make -C tests/_docs clean + make -C tests/_docs linkcheck