From aab056e7b5afc55a5b0782c740e464de92912e4b Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Thu, 27 Feb 2025 12:43:08 +0000 Subject: [PATCH 1/9] trivial: Do not check for byte-identical zlib encoding results This is going to be different depending on OS. --- cabarchive/test_misc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cabarchive/test_misc.py b/cabarchive/test_misc.py index f38c849..952f054 100755 --- a/cabarchive/test_misc.py +++ b/cabarchive/test_misc.py @@ -25,7 +25,7 @@ def _check_range(data: bytes, expected: bytes) -> None: assert data assert expected - failures = 0 + failures: int = 0 if len(data) != len(expected): print(f"different sizes, got {len(data)} expected {len(expected)}") failures += 1 @@ -36,7 +36,8 @@ def _check_range(data: bytes, expected: bytes) -> None: if failures > 10: print("More than 10 failures, giving up...") break - assert failures == 0, "Data is not the same" + if failures: + raise ValueError("Data is not the same") class TestInfParser(unittest.TestCase): @@ -134,7 +135,6 @@ def test_large_compressed(self): hashlib.sha1(cff.buf).hexdigest(), "8497fe89c41871e3cbd7955e13321e056dfbd170", ) - _check_range(arc.save(compress=True), old) def test_multi_folder(self): # open a folder with multiple folders From b4d6e54e9c95e4e0b68c85979b32f7947e997e44 Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Thu, 27 Feb 2025 13:44:56 +0000 Subject: [PATCH 2/9] Remove --autorepack as cabextract is not available --- cabarchive/cli.py | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/cabarchive/cli.py b/cabarchive/cli.py index 3152cea..4f512eb 100755 --- a/cabarchive/cli.py +++ b/cabarchive/cli.py @@ -10,27 +10,10 @@ import sys import os import argparse -import tempfile -import subprocess -import glob sys.path.append(os.path.realpath(".")) -from cabarchive import CabArchive, CabFile, NotSupportedError - - -def repack(arc: CabArchive, arg: str) -> None: - with 
tempfile.TemporaryDirectory("cabarchive") as tmpdir: - print(f"Extracting to {tmpdir}") - subprocess.call(["cabextract", "--fix", "--quiet", "--directory", tmpdir, arg]) - for fn in glob.iglob(os.path.join(tmpdir, "**"), recursive=True): - try: - with open(fn, "rb") as f: - fn_noprefix = fn[len(tmpdir) + 1 :] - print(f"Adding: {fn_noprefix}") - arc[fn_noprefix] = CabFile(f.read()) - except IsADirectoryError as _: - pass +from cabarchive import CabArchive, NotSupportedError def main(): @@ -41,12 +24,6 @@ def main(): help="decompress the archives", default=False, ) - parser.add_argument( - "--autorepack", - action="store_true", - help="Repack using cabextract when required", - default=False, - ) parser.add_argument( "--info", action="store_true", @@ -71,10 +48,8 @@ def main(): with open(arg, "rb") as f: arc.parse(f.read()) except NotSupportedError as e: - if not args.autorepack: - print(f"Failed to parse: {str(e)}; perhaps try --autorepack") - return 1 - repack(arc, arg) + print(f"Failed to parse: {str(e)}") + return 1 print(f"Parsing {arg}:") if args.info: for fn in arc: From a817c364131ca7e3534993c8d3fd6fffe2661343 Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Thu, 27 Feb 2025 15:09:15 +0000 Subject: [PATCH 3/9] Use immutable bytes() to speed up writing by ~30% --- cabarchive/parser.py | 4 ++-- cabarchive/test_misc.py | 2 +- cabarchive/writer.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cabarchive/parser.py b/cabarchive/parser.py index 385c31e..d55aa98 100644 --- a/cabarchive/parser.py +++ b/cabarchive/parser.py @@ -30,7 +30,7 @@ class CabArchiveParser: def __init__(self, cfarchive: "CabArchive", flattern: bool = False): self.cfarchive: "CabArchive" = cfarchive self.flattern: bool = flattern - self._folder_data: List[bytearray] = [] + self._folder_data: List[bytes] = [] self._buf: bytes = b"" self._header_reserved: bytes = b"" self._zdict: Optional[bytes] = None @@ -127,7 +127,7 @@ def parse_cfdata(self, idx: int, offset: int, 
compression: int) -> int: # verify checksum if checksum != 0: checksum_actual = _checksum_compute(buf_cfdata) - hdr = bytearray(struct.pack(" bytes: chunks_zlib = [] for chunk in chunks: compressobj = zlib.compressobj(9, zlib.DEFLATED, -zlib.MAX_WBITS) - chunk_zlib = bytearray(b"CK") + chunk_zlib = b"CK" chunk_zlib += compressobj.compress(chunk) chunk_zlib += compressobj.flush() chunks_zlib.append(chunk_zlib) @@ -126,7 +126,7 @@ def write(self) -> bytes: # first do the 'checksum' on the data, then the partial # header. slightly crazy, but anyway checksum = _checksum_compute(chunk_zlib) - hdr = bytearray(struct.pack(" bytes: ) # uncompressed bytes data += chunk_zlib - # return bytearray + # success return data From 0563d759f532c6f1fc4f616c904355cb751a7aca Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Thu, 27 Feb 2025 15:16:00 +0000 Subject: [PATCH 4/9] trivial: Add CabArchive.size property for future use --- cabarchive/archive.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cabarchive/archive.py b/cabarchive/archive.py index 847ecb5..48bc7f6 100644 --- a/cabarchive/archive.py +++ b/cabarchive/archive.py @@ -126,5 +126,10 @@ def save(self, compress: bool = False, sort: bool = True) -> bytes: """ return CabArchiveWriter(self, compress=compress, sort=sort).write() + @property + def size(self) -> int: + """Returns cabinet uncompressed data size""" + return sum(len(cffile) for cffile in self.values()) + def __repr__(self) -> str: return f"CabArchive({[str(self[cabfile]) for cabfile in self]})" From 606833d20591c6a8e4897ea8953826aca17dc914 Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Thu, 27 Feb 2025 15:16:41 +0000 Subject: [PATCH 5/9] Add a --create action to the test CLI tool --- cabarchive/cli.py | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/cabarchive/cli.py b/cabarchive/cli.py index 4f512eb..7aa246b 100755 --- a/cabarchive/cli.py +++ b/cabarchive/cli.py @@ -13,7 +13,7 @@ 
sys.path.append(os.path.realpath(".")) -from cabarchive import CabArchive, NotSupportedError +from cabarchive import CabArchive, CabFile, NotSupportedError def main(): @@ -24,6 +24,12 @@ def main(): help="decompress the archives", default=False, ) + parser.add_argument( + "--create", + action="store_true", + help="create an archive", + default=False, + ) parser.add_argument( "--info", action="store_true", @@ -42,25 +48,37 @@ def main(): return 1 args, argv = parser.parse_known_args() - for arg in argv: - arc = CabArchive() - try: - with open(arg, "rb") as f: - arc.parse(f.read()) - except NotSupportedError as e: - print(f"Failed to parse: {str(e)}") - return 1 - print(f"Parsing {arg}:") - if args.info: - for fn in arc: - print(fn) - if args.decompress: + if args.decompress: + for fn in argv: + arc = CabArchive() + try: + with open(fn, "rb") as f: + arc.parse(f.read()) + except NotSupportedError as e: + print(f"Failed to parse: {str(e)}") + return 1 + print(f"Parsing {fn}:") + if args.info: + for fn in arc: + print(fn) for fn in arc: path = os.path.join(args.outdir, fn) os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "wb") as f: print(f"Writing {fn}:") f.write(arc[fn].buf) + elif args.create: + arc = CabArchive() + try: + print(f"Creating {argv[0]}:") + except IndexError: + print("Expected: ARCHIVE [FILE]...") + return 1 + for fn in argv[1:]: + with open(fn, "rb") as f: + arc[os.path.basename(fn)] = CabFile(buf=f.read()) + with open(argv[0], "wb") as f: + f.write(arc.save()) return 0 From 986bb5fdf11f0bf471dc891f67f24254880fba83 Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Thu, 27 Feb 2025 15:18:09 +0000 Subject: [PATCH 6/9] Support reading and writing archives bigger than 2GB Although, this probably isn't a good idea in reality as it is so slow... See https://github.com/fwupd/fwupd/pull/8508 for the fwupd-side. 
--- cabarchive/parser.py | 17 +++++++++++++---- cabarchive/writer.py | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/cabarchive/parser.py b/cabarchive/parser.py index d55aa98..3e69140 100644 --- a/cabarchive/parser.py +++ b/cabarchive/parser.py @@ -35,6 +35,7 @@ def __init__(self, cfarchive: "CabArchive", flattern: bool = False): self._header_reserved: bytes = b"" self._zdict: Optional[bytes] = None self._rsvd_block: int = 0 + self._ndatabsz: int = 0 def parse_cffile(self, offset: int) -> int: """Parse a CFFILE entry""" @@ -103,10 +104,14 @@ def parse_cffolder(self, idx: int, offset: int) -> None: raise NotSupportedError("LZX compression not supported") raise NotSupportedError(f"Compression type 0x{compression:x} not supported") - # parse CDATA - self._folder_data.append(bytearray()) - for _ in range(ndatab): - offset += self.parse_cfdata(idx, offset, compression) + # parse CDATA, either using the stream offset or the per-spec CFFOLDER.ndatab + self._folder_data.append(bytes()) + if self._ndatabsz: + while offset < self._ndatabsz: + offset += self.parse_cfdata(idx, offset, compression) + else: + for _ in range(ndatab): + offset += self.parse_cfdata(idx, offset, compression) def parse_cfdata(self, idx: int, offset: int, compression: int) -> int: """Parse a CFDATA entry""" @@ -244,6 +249,10 @@ def parse(self, buf: bytes) -> None: # read this so we can do round-trip self.cfarchive.set_id = set_id + # if the only folder is >= 2GB then CFFOLDER.ndatab will overflow + if len(self._buf) >= 0x8000 * 0xFFFF and nr_folders == 1: + self._ndatabsz = len(self._buf) + # parse CFFOLDER for i in range(nr_folders): self.parse_cffolder(i, offset) diff --git a/cabarchive/writer.py b/cabarchive/writer.py index fdc5311..3193706 100644 --- a/cabarchive/writer.py +++ b/cabarchive/writer.py @@ -97,7 +97,7 @@ def write(self) -> bytes: data += struct.pack( FMT_CFFOLDER, offset, # offset to CFDATA - len(chunks), # number of CFDATA blocks + min(len(chunks), 0xFFFF), # 
number of CFDATA blocks self.compress, ) # compression type From a9a2f1a334a9baad3b0433cb6a50a5366eaec155 Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Thu, 27 Feb 2025 15:18:37 +0000 Subject: [PATCH 7/9] trivial: re-blacken source tree --- cabarchive/file.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cabarchive/file.py b/cabarchive/file.py index fdad3de..6b34f55 100644 --- a/cabarchive/file.py +++ b/cabarchive/file.py @@ -18,7 +18,6 @@ def _is_ascii(text: str) -> bool: class CabFile: - """An object representing a file in a Cab archive Any number of CabFile instances can be stored in a CabArchive. From d5363406212932f1c71629bdf81f801e38867007 Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Thu, 27 Feb 2025 15:19:54 +0000 Subject: [PATCH 8/9] Speed up calculating the checksum by ~20% --- cabarchive/utils.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/cabarchive/utils.py b/cabarchive/utils.py index 39beaad..b438f60 100644 --- a/cabarchive/utils.py +++ b/cabarchive/utils.py @@ -7,6 +7,8 @@ # # pylint: disable=protected-access,too-few-public-methods +import struct + from typing import List FMT_CFHEADER = "<4sxxxxIxxxxIxxxxBBHHHHH" @@ -16,33 +18,29 @@ FMT_CFDATA = " List[bytearray]: +def _chunkify(arr: bytes, size: int) -> List[bytes]: """Split up a bytestream into chunks""" arrs = [] for i in range(0, len(arr), size): - chunk = bytearray(arr[i : i + size]) - arrs.append(chunk) + arrs.append(arr[i : i + size]) return arrs -def _checksum_compute(content: bytes, seed: int = 0) -> int: +def _checksum_compute(buf: bytes, seed: int = 0) -> int: """Compute the MS cabinet checksum""" - csum = seed - chunks = _chunkify(content, 4) - for chunk in chunks: - if len(chunk) == 4: - ul = chunk[0] - ul |= chunk[1] << 8 - ul |= chunk[2] << 16 - ul |= chunk[3] << 24 - else: + csum: int = seed + for offset in range(0, len(buf), 4): + try: + (ul,) = struct.unpack_from(" Date: Thu, 27 Feb 2025 15:20:22 +0000 
Subject: [PATCH 9/9] trivial: Use slightly less memory when writing huge archives --- cabarchive/writer.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cabarchive/writer.py b/cabarchive/writer.py index 3193706..cf2c5ac 100644 --- a/cabarchive/writer.py +++ b/cabarchive/writer.py @@ -43,10 +43,13 @@ def write(self) -> bytes: cffiles.extend(self.cfarchive.values()) # create linear CFDATA block - cfdata_linear = bytearray() - for f in cffiles: - if f.buf: - cfdata_linear += f.buf + if len(cffiles) > 1: + cfdata_linear = bytes() + for f in cffiles: + if f.buf: + cfdata_linear += f.buf + else: + cfdata_linear = cffiles[0].buf or bytes() # _chunkify and compress with a fixed size chunks = _chunkify(cfdata_linear, 0x8000)