From a2507d5bbdb451f25eea9c39e2f2c36f9a479e50 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 15 Oct 2025 13:21:28 -0400
Subject: [PATCH 001/109] v2 python prototype.

---
 Makefile                        |   8 +
 docs/source/specification.rst   |   5 +-
 tamp/cli/main.py                |   4 +
 tamp/compressor.py              | 387 ++++++++++++++++++++++++++++----
 tamp/decompressor.py            | 114 +++++++---
 tools/print_compressed_sizes.py |  43 ++++
 6 files changed, 486 insertions(+), 75 deletions(-)
 create mode 100644 tools/print_compressed_sizes.py

diff --git a/Makefile b/Makefile
index 9e21e42e..e78e8a4a 100644
--- a/Makefile
+++ b/Makefile
@@ -27,6 +27,8 @@ help:
 	@echo "  make tamp-c-library     Build static C library"
 	@echo "  make website-build      Build website for deployment"
 
+.PHONY: clean test collect-data venv download
+
 
 ###########################
 # MicroPython Native Module
@@ -182,6 +184,12 @@ build/enwik8-100kb: download-enwik8
 build/enwik8-100kb.tamp: build/enwik8-100kb
 	@poetry run tamp compress build/enwik8-100kb -o build/enwik8-100kb.tamp
 
+download-micropython:
+	mkdir -p build
+	cd build && curl -O https://micropython.org/resources/firmware/RPI_PICO-20250415-v1.25.0.uf2
+
+download: download-enwik8 download-silesia download-micropython
+
 
 ##################
 # Python / Testing
diff --git a/docs/source/specification.rst b/docs/source/specification.rst
index f6128939..89a880a5 100644
--- a/docs/source/specification.rst
+++ b/docs/source/specification.rst
@@ -26,7 +26,8 @@ The bit-location 0 is equivalent to typical MSb position 7 of the first byte.
 | [2]     | custom_dictionary | A custom dictionary initialization method was used                  |
 |         |                   | and must be provided at decompression.                              |
 +---------+-------------------+---------------------------------------------------------------------+
-| [1]     | reserved          | Reserved for future use. Must be 0.                                 |
+| [1]     | v2                | Enables Tamp v2 features. Generally improves compression, but the   |
+|         |                   | decompressor needs to support it (introduced in tamp v1.11.0).      |
 +---------+-------------------+---------------------------------------------------------------------+
 | [0]     | more_header       | If ``True``, then the next byte in the stream is more header data.  |
 |         |                   | Currently always ``False``, but allows for future expandability.    |
@@ -60,7 +61,7 @@ Modifications are made to make the implementation simpler/faster.
       and points at the offset from the beginning of the dictionary buffer to the pattern.
       The shortest pattern-length is either going to be 2 or 3 bytes, depending on ``window``
       and ``literal`` parameters. The shortest pattern-length encoding must be shorter than
-      an equivalent stream of literals. The longest pattern-length will the minimum
+      an equivalent stream of literals. The longest pattern-length is the minimum
       pattern-length plus 13.
 
 Classically, the ``offset`` is from the current position in the buffer. Doing so results
diff --git a/tamp/cli/main.py b/tamp/cli/main.py
index 27275c25..e2054b6f 100644
--- a/tamp/cli/main.py
+++ b/tamp/cli/main.py
@@ -119,6 +119,7 @@ def compress(
         ),
     ] = 8,
     lazy_matching: bool = False,
+    v1: bool = False,
     implementation: ImplementationType = None,
 ):
     """Compress an input file or stream.
@@ -135,6 +136,8 @@ def compress(
         Number of bits used to represent a literal.
     lazy_matching: bool
         Use roughly 50% more cpu to get 0~2% better compression.
+    v1: bool
+        Use version 1 compression format.
     implementation: Optional[Literal["c", "python"]]
         Explicitly specify which implementation to use (c or python). Defaults to auto-detection.
     """
@@ -145,6 +148,7 @@ def compress(
         window=window,
         literal=literal,
         lazy_matching=lazy_matching,
+        v2=not v1,
     )
     write(output, output_bytes)
 
diff --git a/tamp/compressor.py b/tamp/compressor.py
index 91c5516d..f1d4dc05 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -1,3 +1,11 @@
+"""Pure Python Tamp Compressor Reference Implementation.
+
+The goal of this module is clarity and the ability to easily test new ideas.
+Do not optimize this file for speed unless the result remains just as clear.
+
+Speed-oriented architectural optimizations may be prototyped here before being implemented in other languages.
+"""
+
 from collections import deque
 from io import BytesIO
 
@@ -13,26 +21,52 @@
 
 from . import ExcessBitsError, bit_size, compute_min_pattern_size, initialize_dictionary
 
-# encodes [min_pattern_bytes, min_pattern_bytes + 13] pattern lengths
-_huffman_codes = b"\x00\x03\x08\x0b\x14$&+KT\x94\x95\xaa'"
+# Huffman codes for symbols [0, 14]. In v2, 12 is RLE and 13 is extended-match; entry 14 equals _FLUSH_CODE.
+_huffman_codes = b"\x00\x03\x08\x0b\x14$&+KT\x94\x95\xaa'\xab"
 # These bit lengths pre-add the 1 bit for the 0-value is_literal flag.
-_huffman_bits = b"\x02\x03\x05\x05\x06\x07\x07\x07\x08\x08\x09\x09\x09\x07"
+_huffman_bits = b"\x02\x03\x05\x05\x06\x07\x07\x07\x08\x08\x09\x09\x09\x07\x09"
 _FLUSH_CODE = 0xAB  # 8 bits
+_RLE_SYMBOL = 12
+_RLE_MAX_WINDOW = 8  # Maximum number of RLE bytes to write to the window.
+_EXTENDED_MATCH_SYMBOL = 13
+_LEADING_EXTENDED_MATCH_HUFFMAN_BITS = 3
+_LEADING_RLE_HUFFMAN_BITS = 4
+
+
+def _determine_rle_breakeven_point(min_pattern_size, window_bits):
+    # Returns the largest size whose pattern-match token costs fewer bits than the RLE token (0 if none).
+    # First, compute how many bits each candidate size would cost when encoded as RLE.
+    rle_length_bits = {}
+    for i in range(min_pattern_size, min_pattern_size + 11 + 1):
+        rle_length_bits[i] = 8 + _LEADING_RLE_HUFFMAN_BITS + _huffman_bits[(i - 1) >> _LEADING_RLE_HUFFMAN_BITS]
+
+    pattern_length_bits = {}
+    for i in range(min_pattern_size, min_pattern_size + 11 + 1):
+        pattern_length_bits[i] = _huffman_bits[i - min_pattern_size] + window_bits
+
+    breakeven_point = 0
+    for pattern_size in sorted(pattern_length_bits):
+        if pattern_length_bits[pattern_size] < rle_length_bits[pattern_size]:
+            breakeven_point = pattern_size
+
+    return breakeven_point
 
 
 class _BitWriter:
     """Writes bits to a stream."""
 
-    def __init__(self, f, close_f_on_close=False):
+    def __init__(self, f, *, close_f_on_close: bool = False):
         self.close_f_on_close = close_f_on_close
         self.f = f
-        self.buffer = 0  # Basically a uint24
+        self.buffer = 0  # Basically a uint32
         self.bit_pos = 0
 
-    def write_huffman(self, pattern_size):
+    def write_huffman_and_literal_flag(self, pattern_size):
+        # pattern_size in range [0, 14]
         return self.write(_huffman_codes[pattern_size], _huffman_bits[pattern_size])
 
     def write(self, bits, num_bits, flush=True):
+        bits = int(bits)
         bits &= (1 << num_bits) - 1
         self.bit_pos += num_bits
         self.buffer |= bits << (32 - self.bit_pos)
@@ -75,7 +109,7 @@ def __init__(self, buffer):
         self.size = len(buffer)
         self.pos = 0  # Always pointing to the byte-to-be-overwritten
 
-    def write_byte(self, byte):  # ~10% of time
+    def write_byte(self, byte):
         self.buffer[self.pos] = byte
         self.pos = (self.pos + 1) % self.size
 
@@ -90,6 +124,25 @@ def index(self, pattern, start):
             raise ValueError("substring not found")
         return result
 
+    def write_from_self(self, position, size):
+        data = [self.buffer[(position + i) % self.size] for i in range(size)]
+        for x in data:
+            self.write_byte(x)
+
+    def get(self, index, size):
+        out = bytearray(size)
+        for i in range(size):
+            pos = (index + i) % self.size
+            out[i] = self.buffer[pos]
+        return bytes(out)
+
+    @property
+    def last_written_byte(self) -> int:
+        pos = self.pos - 1
+        if pos < 0:
+            pos = self.size - 1
+        return self.buffer[pos]  # TODO: unit-test this thoroughly on initial start!
+
 
 class Compressor:
     """Compresses data to a file or stream."""
@@ -102,6 +155,7 @@ def __init__(
         literal: int = 8,
         dictionary: Optional[bytearray] = None,
         lazy_matching: bool = False,
+        v2: bool = True,
     ):
         """
         Parameters
@@ -129,11 +183,26 @@ def __init__(
         lazy_matching: bool
             Use roughly 50% more cpu to get 0~2% better compression.
         """
-        if lazy_matching:
-            raise NotImplementedError("lazy matching not implemented in pure python implementation.")
+        self.window_bits = window
+        self.literal_bits = literal
+        self.min_pattern_size = compute_min_pattern_size(window, literal)
+        self.v2: bool = v2
+
+        self._rle_count = 0
+        self._rle_last_written = False  # The previous write was an RLE token
+
+        # "+1" Because a RLE of 1 is not valid.
+        self._rle_max_size = (13 << _LEADING_RLE_HUFFMAN_BITS) + (1 << _LEADING_RLE_HUFFMAN_BITS) + 1
+        self._rle_breakeven = _determine_rle_breakeven_point(self.min_pattern_size, self.window_bits)
+
+        self._extended_match_count = 0
+        self._extended_match_position = 0
+
+        self.lazy_matching = lazy_matching
+        self._cached_match_index = -1
+        self._cached_match_size = 0
 
         if not hasattr(f, "write"):  # It's probably a path-like object.
-            # TODO: then close it on close
             f = open(str(f), "wb")
             close_f_on_close = True
         else:
@@ -143,11 +212,15 @@ def __init__(
         if dictionary and bit_size(len(dictionary) - 1) != window:
             raise ValueError("Dictionary-window size mismatch.")
 
-        self.window_bits = window
-        self.literal_bits = literal
-
-        self.min_pattern_size = compute_min_pattern_size(window, literal)
-        self.max_pattern_size = self.min_pattern_size + 13
+        if self.v2:
+            self.max_pattern_size = (
+                self.min_pattern_size
+                + 11
+                + (13 << _LEADING_EXTENDED_MATCH_HUFFMAN_BITS)
+                + (1 << _LEADING_EXTENDED_MATCH_HUFFMAN_BITS)
+            )
+        else:
+            self.max_pattern_size = self.min_pattern_size + 13
 
         self.literal_flag = 1 << self.literal_bits
 
@@ -155,26 +228,171 @@ def __init__(
             buffer=dictionary if dictionary else initialize_dictionary(1 << window),
         )
 
-        self._input_buffer = deque(maxlen=self.max_pattern_size)
+        self._input_buffer = deque(maxlen=16)  # matching the C implementation
 
         # Callbacks for debugging/metric collection; can be externally set.
-        self.token_cb = None
+        self.match_cb = None
+        self.extended_match_cb = None
         self.literal_cb = None
         self.flush_cb = None
+        self.rle_cb = None
+
+        # For debugging: index of the next unconsumed byte within the data passed to the current write() call.
+        self.input_index = 0
 
         # Write header
         self._bit_writer.write(window - 8, 3, flush=False)
         self._bit_writer.write(literal - 5, 2, flush=False)
         self._bit_writer.write(bool(dictionary), 1, flush=False)
-        self._bit_writer.write(0, 1, flush=False)  # Reserved
+        self._bit_writer.write(self.v2, 1, flush=False)
         self._bit_writer.write(0, 1, flush=False)  # No other header bytes
 
+    def _validate_no_match_overlap(self, write_pos, match_index, match_size):
+        """Return True if writing one byte at ``write_pos`` does NOT land inside the future match span."""
+        return write_pos < match_index or write_pos >= match_index + match_size
+
     def _compress_input_buffer_single(self) -> int:
-        target = bytes(self._input_buffer)
         bytes_written = 0
+
+        if not self._input_buffer:
+            return bytes_written
+
+        if self._extended_match_count:
+            while self._input_buffer:
+                if (self._extended_match_position + self._extended_match_count) >= self._window_buffer.size:
+                    # wrap-around search: it's fine to check for the wrap now because it's super cheap here.
+                    pos = (self._extended_match_position + self._extended_match_count) % self._window_buffer.size
+                    if self._window_buffer.buffer[pos] == self._input_buffer[0]:
+                        self._input_buffer.popleft()
+                        self._extended_match_count += 1
+                        if self._extended_match_count == self.max_pattern_size:
+                            bytes_written += self._write_extended_match()
+                            return bytes_written
+                        continue
+                    # We've found the end of the match
+                    bytes_written += self._write_extended_match()
+                    return bytes_written
+                else:
+                    # Search the remainder of the window buffer.
+                    target = self._window_buffer.get(self._extended_match_position, self._extended_match_count)
+                    target += bytes([self._input_buffer[0]])
+                    search_i, match = self._search(target, start=self._extended_match_position)
+                    match_size = len(match)
+                    if match_size > self._extended_match_count:
+                        self._input_buffer.popleft()
+                        self._extended_match_count = match_size
+                        self._extended_match_position = search_i
+                        if self._extended_match_count == self.max_pattern_size:
+                            bytes_written += self._write_extended_match()
+                            return bytes_written
+                        continue
+                    else:
+                        # We've found the end of the match
+                        bytes_written += self._write_extended_match()
+                        return bytes_written
+            else:
+                # We ran out of input_buffer, return so caller can re-populate the input_buffer
+                return bytes_written
+
+        target = bytes(self._input_buffer)
         search_i = 0
         match_size = 1
-        for match_size in range(self.min_pattern_size, len(target) + 1):
+
+        if self.v2:
+            # RLE same-character-counting logic
+            while (
+                target and target[0] == self._window_buffer.last_written_byte and self._rle_count < self._rle_max_size
+            ):
+                self._rle_count += 1
+                self._input_buffer.popleft()
+                target = bytes(self._input_buffer)
+            if not target and self._rle_count != self._rle_max_size:
+                # Need more input to see if the RLE continues
+                return bytes_written
+            if self._rle_count == 1:
+                # This is not RLE; attempt to pattern-match or just write literals.
+                self._input_buffer.appendleft(self._window_buffer.last_written_byte)
+                target = bytes(self._input_buffer)
+                self._rle_count = 0
+            elif self._rle_count:
+                if self._rle_count > self._rle_breakeven:
+                    # It's certainly better to do a RLE write than searching for a pattern.
+                    bytes_written += self._write_rle()
+                    return bytes_written
+                else:
+                    # We'll see if pattern-matching offers a better encoding.
+                    target = bytes([self._window_buffer.last_written_byte]) * self._rle_count
+
+        # Check if we have a cached match from lazy matching
+        if self.lazy_matching and self._cached_match_index >= 0:
+            search_i = self._cached_match_index
+            match_size = self._cached_match_size
+            match = self._window_buffer.get(search_i, match_size)
+            self._cached_match_index = -1  # Clear cache after using
+        else:
+            # Perform normal pattern-matching
+            search_i, match = self._search(target, start=0)
+            match_size = len(match)
+
+        if self._rle_count:
+            # Check to see if the found pattern-match is more efficient than the RLE encoding.
+            assert self._rle_count >= 2  # noqa: S101
+            if match_size >= self._rle_count:
+                # Pattern is better than RLE
+                bytes_written += self._write_match(search_i, match)
+                self._rle_count = 0
+                return bytes_written
+            else:
+                # RLE is better than pattern
+                return self._write_rle()
+
+        # Lazy matching logic
+        if (
+            self.lazy_matching
+            and match_size >= self.min_pattern_size
+            and match_size <= 8
+            and len(self._input_buffer) > match_size + 2
+        ):
+            # Check if next position has a better match
+            next_target = bytes(list(self._input_buffer)[1:])  # Skip first byte
+            next_search_i, next_match = self._search(next_target, start=0)
+            next_match_size = len(next_match)
+
+            # If next position has a better match, and the match doesn't overlap with the literal we are writing
+            if next_match_size > match_size and self._validate_no_match_overlap(
+                self._window_buffer.pos, next_search_i, next_match_size
+            ):
+                # Write literal at current position and cache the next match
+                literal = self._input_buffer.popleft()
+                bytes_written += self._write_literal(literal)
+                self._cached_match_index = next_search_i
+                self._cached_match_size = next_match_size
+                return bytes_written
+
+        if match_size >= self.min_pattern_size:
+            if self.v2 and match_size > (self.min_pattern_size + 11):
+                # Protects +12 to be RLE symbol, and +13 to be extended match symbol
+                self._extended_match_position = search_i
+                self._extended_match_count = match_size
+            else:
+                bytes_written += self._write_match(search_i, match)
+
+            self._rle_last_written = False
+            for _ in range(match_size):
+                self._input_buffer.popleft()
+        else:
+            literal = self._input_buffer.popleft()
+            bytes_written += self._write_literal(literal)
+
+        return bytes_written
+
+    def _search(self, target: bytes, start=0):
+        match_size = 0
+        search_i = start
+        for match_size in range(
+            self.min_pattern_size,
+            min(len(target), self.max_pattern_size) + 1,
+        ):
             match = target[:match_size]
             try:
                 search_i = self._window_buffer.index(match, search_i)
@@ -183,30 +401,95 @@ def _compress_input_buffer_single(self) -> int:
                 match_size -= 1
                 break
         match = target[:match_size]
+        return search_i, match
 
-        if match_size >= self.min_pattern_size:
-            if self.token_cb:
-                self.token_cb(
-                    search_i,
-                    match_size,
-                    match,
-                )
-            bytes_written += self._bit_writer.write_huffman(match_size - self.min_pattern_size)
-            bytes_written += self._bit_writer.write(search_i, self.window_bits)
-            self._window_buffer.write_bytes(match)
+    def _write_extended_huffman(self, value, leading_bits):
+        bytes_written = 0
+        # the upper bits can have values [0, 13]
+        mask = (1 << leading_bits) - 1
+        if value > ((13 << leading_bits) + mask) or value < 0:
+            raise ValueError
+        code_index = value >> leading_bits
+        # Don't use write_huffman_and_literal_flag since we don't want to write a flag.
+        bytes_written += self._bit_writer.write(_huffman_codes[code_index], _huffman_bits[code_index] - 1)
+        bytes_written += self._bit_writer.write(value & mask, leading_bits)
+        return bytes_written
 
-            for _ in range(match_size):
-                self._input_buffer.popleft()
+    def _write_extended_match(self):
+        bytes_written = 0
+        if self.extended_match_cb:
+            string = self._window_buffer.get(self._extended_match_position, self._extended_match_count)
+            self.extended_match_cb(
+                self._window_buffer.pos, self._extended_match_position, self._extended_match_count, string
+            )
+        bytes_written += self._bit_writer.write_huffman_and_literal_flag(_EXTENDED_MATCH_SYMBOL)
+        bytes_written += self._bit_writer.write(self._extended_match_position, self.window_bits)
+        bytes_written += self._write_extended_huffman(
+            self._extended_match_count - self.min_pattern_size - 11 - 1,
+            _LEADING_EXTENDED_MATCH_HUFFMAN_BITS,
+        )
+
+        self._window_buffer.write_from_self(self._extended_match_position, self._extended_match_count)
+
+        # Reset state
+        self._extended_match_count = 0
+        self._extended_match_position = 0  # Technically not necessary.
+
+        return bytes_written
+
+    def _write_literal(self, literal) -> int:
+        bytes_written = 0
+        if self.literal_cb:
+            self.literal_cb(literal)
+        if literal >> self.literal_bits:
+            raise ExcessBitsError
+
+        bytes_written += self._bit_writer.write(literal | self.literal_flag, self.literal_bits + 1)
+        self._window_buffer.write_byte(literal)
+        self._rle_last_written = False
+        return bytes_written
+
+    def _write_match(self, search_i, match) -> int:
+        match_size = len(match)
+
+        if self.match_cb:
+            self.match_cb(
+                self._window_buffer.pos,
+                search_i,
+                match_size,
+                match,
+            )
+
+        bytes_written = 0
+        bytes_written += self._bit_writer.write_huffman_and_literal_flag(match_size - self.min_pattern_size)
+        bytes_written += self._bit_writer.write(search_i, self.window_bits)
+        self._window_buffer.write_bytes(match)
+        self._rle_last_written = False
+        return bytes_written
+
+    def _write_rle(self) -> int:
+        bytes_written = 0
+        last_written_byte = self._window_buffer.last_written_byte
+
+        if self._rle_count == 0:
+            raise ValueError("No RLE to write.")
+        elif self._rle_count == 1:
+            # Just write a literal
+            bytes_written += self._write_literal(last_written_byte)
         else:
-            char = self._input_buffer.popleft()
-            if self.literal_cb:
-                self.literal_cb(char)
-            if char >> self.literal_bits:
-                raise ExcessBitsError
+            if self.rle_cb:
+                self.rle_cb(self._rle_count, last_written_byte)
+            bytes_written += self._bit_writer.write_huffman_and_literal_flag(_RLE_SYMBOL)
+            bytes_written += self._write_extended_huffman(self._rle_count - 2, _LEADING_RLE_HUFFMAN_BITS)
 
-            bytes_written += self._bit_writer.write(char | self.literal_flag, self.literal_bits + 1)
-            self._window_buffer.write_byte(char)
+            if not self._rle_last_written:
+                # Only write up to 8 bytes, and only if we didn't already do this.
+                # This prevents filling up the window buffer with unhelpful data.
+                self._window_buffer.write_bytes(bytes([last_written_byte]) * min(self._rle_count, _RLE_MAX_WINDOW))
 
+            self._rle_last_written = True
+
+        self._rle_count = 0
         return bytes_written
 
     def write(self, data: Union[bytes, bytearray]) -> int:
@@ -225,8 +508,12 @@ def write(self, data: Union[bytes, bytearray]) -> int:
         """
         bytes_written = 0
 
-        for char in data:
-            self._input_buffer.append(char)
+        self.input_index = 0
+        while self.input_index < len(data):
+            if len(self._input_buffer) != self._input_buffer.maxlen:
+                self._input_buffer.append(data[self.input_index])
+                self.input_index += 1
+
             if len(self._input_buffer) == self._input_buffer.maxlen:
                 bytes_written += self._compress_input_buffer_single()
 
@@ -255,7 +542,18 @@ def flush(self, write_token: bool = True) -> int:
             self.flush_cb()
         while self._input_buffer:
             bytes_written += self._compress_input_buffer_single()
-        bytes_written += self._bit_writer.flush(write_token=write_token)
+        if self.v2 and self._rle_count:
+            bytes_written += self._write_rle()
+
+        # Clear any cached lazy matching state
+        if self.lazy_matching:
+            self._cached_match_index = -1
+            self._cached_match_size = 0
+
+        bytes_written_flush = self._bit_writer.flush(write_token=write_token)
+        bytes_written += bytes_written_flush
+        if bytes_written_flush:
+            self._rle_last_written = False
         return bytes_written
 
     def close(self) -> int:
@@ -300,6 +598,7 @@ def compress(
     literal: int = 8,
     dictionary: Optional[bytearray] = None,
     lazy_matching: bool = False,
+    v2: bool = True,
 ) -> bytes:
     """Single-call to compress data.
 
@@ -326,6 +625,8 @@ def compress(
         first be initialized with :func:`~tamp.initialize_dictionary`
     lazy_matching: bool
         Use roughly 50% more cpu to get 0~2% better compression.
+    v2: bool
+        Use v2 compression format. Defaults to True.
 
     Returns
     -------
@@ -340,6 +641,7 @@ def compress(
                 literal=literal,
                 dictionary=dictionary,
                 lazy_matching=lazy_matching,
+                v2=v2,
             )
             c.write(data)
         else:
@@ -349,6 +651,7 @@ def compress(
                 literal=literal,
                 dictionary=dictionary,
                 lazy_matching=lazy_matching,
+                v2=v2,
             )
             c.write(data)
         c.flush(write_token=False)
diff --git a/tamp/decompressor.py b/tamp/decompressor.py
index fb2fec7c..947dd023 100644
--- a/tamp/decompressor.py
+++ b/tamp/decompressor.py
@@ -10,6 +10,13 @@
 _CHUNK_SIZE = 1 << 20
 _FLUSH = object()
 
+# These variables must match compressor.py
+_RLE_SYMBOL = 12
+_EXTENDED_MATCH_SYMBOL = 13
+_RLE_MAX_WINDOW = 8  # Maximum number of RLE bytes to write to the window.
+_LEADING_EXTENDED_MATCH_HUFFMAN_BITS = 3
+_LEADING_RLE_HUFFMAN_BITS = 4
+
 # Each key here are the huffman codes or'd with 0x80
 # This is so that each lookup is easy/quick.
 _huffman_lookup = {
@@ -57,15 +64,15 @@ def read(self, num_bits):
             if not byte:
                 raise EOFError
             byte_value = int.from_bytes(byte, "little")
-            self.buffer |= byte_value << (24 - self.bit_pos)
+            self.buffer |= byte_value << (56 - self.bit_pos)
             self.bit_pos += 8
 
             if self.backup_buffer is not None and self.backup_bit_pos is not None:
-                self.backup_buffer |= byte_value << (24 - self.backup_bit_pos)
+                self.backup_buffer |= byte_value << (56 - self.backup_bit_pos)
                 self.backup_bit_pos += 8
 
-        result = self.buffer >> (32 - num_bits)
-        mask = (1 << (32 - num_bits)) - 1
+        result = self.buffer >> (64 - num_bits)
+        mask = (1 << (64 - num_bits)) - 1
         self.buffer = (self.buffer & mask) << num_bits
         self.bit_pos -= num_bits
 
@@ -120,6 +127,20 @@ def write_bytes(self, data):
         for byte in data:
             self.write_byte(byte)
 
+    def get(self, index, size):
+        out = bytearray(size)
+        for i in range(size):
+            pos = (index + i) % self.size
+            out[i] = self.buffer[pos]
+        return bytes(out)
+
+    @property
+    def last_written_byte(self) -> int:
+        pos = self.pos - 1
+        if pos < 0:
+            pos = self.size - 1
+        return self.buffer[pos]  # TODO: unit-test this thoroughly on initial start!
+
 
 class Decompressor:
     """Decompresses a file or stream of tamp-compressed data.
@@ -158,11 +179,9 @@ def __init__(self, f, *, dictionary: Optional[bytearray] = None):
         self.window_bits = self._bit_reader.read(3) + 8
         self.literal_bits = self._bit_reader.read(2) + 5
         uses_custom_dictionary = self._bit_reader.read(1)
-        reserved = self._bit_reader.read(1)
+        self.v2 = self._bit_reader.read(1)
         more_header_bytes = self._bit_reader.read(1)
-
-        if reserved:
-            raise NotImplementedError
+        self._rle_last_written = False
 
         if more_header_bytes:
             raise NotImplementedError
@@ -176,6 +195,7 @@ def __init__(self, f, *, dictionary: Optional[bytearray] = None):
 
         self.min_pattern_size = compute_min_pattern_size(self.window_bits, self.literal_bits)
 
+        # Used to store decoded bytes that do not currently fit in the output buffer.
         self.overflow = bytearray()
 
     def readinto(self, buf: bytearray) -> int:
@@ -191,49 +211,81 @@ def readinto(self, buf: bytearray) -> int:
         int
             Number of bytes decompressed into buffer.
         """
+        bytes_written = 0
+
         if len(self.overflow) > len(buf):
             buf[:] = self.overflow[: len(buf)]
-            written = len(buf)
+            bytes_written += len(buf)
             self.overflow = self.overflow[len(buf) :]
-            return written
+            return bytes_written
         elif self.overflow:
             buf[: len(self.overflow)] = self.overflow
-            written = len(self.overflow)
+            bytes_written += len(self.overflow)
             self.overflow = bytearray()
-        else:
-            written = 0
 
-        while written < len(buf):
+        def write_to_output(string):
+            nonlocal bytes_written
+            match_size = len(string)
+            to_buf = min(len(buf) - bytes_written, match_size)
+            buf[bytes_written : bytes_written + to_buf] = string[:to_buf]
+            bytes_written += to_buf
+            if to_buf < match_size:
+                self.overflow[:] = string[to_buf:]
+                return False  # stop decoding
+            return True
+
+        while bytes_written < len(buf):
             try:
                 with self._bit_reader:
                     is_literal = self._bit_reader.read(1)
 
                     if is_literal:
-                        c = self._bit_reader.read(self.literal_bits)
-                        self._window_buffer.write_byte(c)
-                        buf[written] = c
-                        written += 1
+                        string = bytes([self._bit_reader.read(self.literal_bits)])
+                        self._window_buffer.write_bytes(string)
+                        self._rle_last_written = False
                     else:
                         match_size = self._bit_reader.read_huffman()
                         if match_size is _FLUSH:
                             self._bit_reader.clear()
                             continue
-                        match_size += self.min_pattern_size
-                        index = self._bit_reader.read(self.window_bits)
-
-                        string = self._window_buffer.buffer[index : index + match_size]
-                        self._window_buffer.write_bytes(string)
-
-                        to_buf = min(len(buf) - written, match_size)
-                        buf[written : written + to_buf] = string[:to_buf]
-                        written += to_buf
-                        if to_buf < match_size:
-                            self.overflow[:] = string[to_buf:]
-                            break
+                        if self.v2 and match_size > 11:
+                            if match_size == _RLE_SYMBOL:
+                                rle_count = self._bit_reader.read_huffman()
+                                rle_count <<= _LEADING_RLE_HUFFMAN_BITS
+                                rle_count += self._bit_reader.read(_LEADING_RLE_HUFFMAN_BITS)
+                                rle_count += 1 + 1
+                                symbol = self._window_buffer.last_written_byte
+                                string = bytes([symbol]) * rle_count
+                                if not self._rle_last_written:
+                                    self._window_buffer.write_bytes(string[: min(rle_count, _RLE_MAX_WINDOW)])
+                                self._rle_last_written = True
+                            elif match_size == _EXTENDED_MATCH_SYMBOL:
+                                index = self._bit_reader.read(self.window_bits)
+                                match_size = self._bit_reader.read_huffman()
+                                match_size <<= _LEADING_EXTENDED_MATCH_HUFFMAN_BITS
+                                match_size += self._bit_reader.read(_LEADING_EXTENDED_MATCH_HUFFMAN_BITS)
+                                match_size += self.min_pattern_size + 11 + 1
+
+                                string = self._window_buffer.get(index, match_size)
+
+                                self._window_buffer.write_bytes(string)
+                                self._rle_last_written = False
+                            else:
+                                raise ValueError("unreachable")
+                        else:
+                            match_size += self.min_pattern_size
+                            index = self._bit_reader.read(self.window_bits)
+
+                            string = self._window_buffer.get(index, match_size)
+                            self._window_buffer.write_bytes(string)
+                            self._rle_last_written = False
+
+                    if not write_to_output(string):
+                        break
             except EOFError:
                 break
 
-        return written
+        return bytes_written
 
     def read(self, size: int = -1) -> bytearray:
         """Decompresses data to bytes.
diff --git a/tools/print_compressed_sizes.py b/tools/print_compressed_sizes.py
new file mode 100644
index 00000000..ea2c71e2
--- /dev/null
+++ b/tools/print_compressed_sizes.py
@@ -0,0 +1,43 @@
+"""
+Print compressed sizes for test files used in optimize-extended-huffman.py.
+
+This script compresses the same files that optimize-extended-huffman.py uses
+and prints the compressed size for each file with thousands separators.
+"""
+
+from pathlib import Path
+
+import tamp.compressor
+
+
+def main():
+    # Define test files (same as optimize-extended-huffman.py)
+    build_dir = Path(__file__).parent.parent / "build"
+    test_files = [build_dir / "enwik8", build_dir / "RPI_PICO-20250415-v1.25.0.uf2", *(build_dir / "silesia").iterdir()]
+    test_files.sort()
+
+    ratios = []
+    for file_path in test_files:
+        # Read and compress the file
+        data = file_path.read_bytes()
+        if len(data) == 0:
+            print(f"{file_path.name}: Empty file")
+            continue
+
+        compressed_data = tamp.compressor.compress(data)
+
+        original_size = len(data)
+        compressed_size = len(compressed_data)
+
+        ratio = original_size / compressed_size
+        ratios.append(ratio)
+
+        # Print with thousands separators
+        print(f"{file_path.name}: {compressed_size:,} (**{ratio:.3f}**)")
+
+    avg = sum(ratios) / len(ratios)
+    print(f"Average Ratio: {avg}")
+
+
+if __name__ == "__main__":
+    main()

From 33bc24aa8a12047b8e51d55d287911a646eba507 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Fri, 23 Jan 2026 20:02:04 -0500
Subject: [PATCH 002/109] update pi pico firmware download to datasets.

---
 Makefile                        | 4 ++--
 tools/print_compressed_sizes.py | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index e78e8a4a..7b668924 100644
--- a/Makefile
+++ b/Makefile
@@ -185,8 +185,8 @@ build/enwik8-100kb.tamp: build/enwik8-100kb
 	@poetry run tamp compress build/enwik8-100kb -o build/enwik8-100kb.tamp
 
 download-micropython:
-	mkdir -p build
-	cd build && curl -O https://micropython.org/resources/firmware/RPI_PICO-20250415-v1.25.0.uf2
+	mkdir -p datasets
+	cd datasets && curl -O https://micropython.org/resources/firmware/RPI_PICO-20250415-v1.25.0.uf2
 
 download: download-enwik8 download-silesia download-micropython
 
diff --git a/tools/print_compressed_sizes.py b/tools/print_compressed_sizes.py
index ea2c71e2..7cb809eb 100644
--- a/tools/print_compressed_sizes.py
+++ b/tools/print_compressed_sizes.py
@@ -12,8 +12,12 @@
 
 def main():
     # Define test files (same as optimize-extended-huffman.py)
-    build_dir = Path(__file__).parent.parent / "build"
-    test_files = [build_dir / "enwik8", build_dir / "RPI_PICO-20250415-v1.25.0.uf2", *(build_dir / "silesia").iterdir()]
+    datasets_dir = Path(__file__).parent.parent / "datasets"
+    test_files = [
+        datasets_dir / "enwik8",
+        datasets_dir / "RPI_PICO-20250415-v1.25.0.uf2",
+        *(datasets_dir / "silesia").iterdir(),
+    ]
     test_files.sort()
 
     ratios = []

From 2c461841e20330fe7857707771c2928d20f85f56 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Fri, 23 Jan 2026 21:13:27 -0500
Subject: [PATCH 003/109] remove wrap-around logic; adds additional
 complications for minimal gains.

---
 tamp/compressor.py | 49 +++++++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 29 deletions(-)

diff --git a/tamp/compressor.py b/tamp/compressor.py
index f1d4dc05..722bfbca 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -260,39 +260,30 @@ def _compress_input_buffer_single(self) -> int:
         if self._extended_match_count:
             while self._input_buffer:
                 if (self._extended_match_position + self._extended_match_count) >= self._window_buffer.size:
-                    # wrap-around search: it's fine to check for the wrap now because it's super cheap here.
-                    pos = (self._extended_match_position + self._extended_match_count) % self._window_buffer.size
-                    if self._window_buffer.buffer[pos] == self._input_buffer[0]:
-                        self._input_buffer.popleft()
-                        self._extended_match_count += 1
-                        if self._extended_match_count == self.max_pattern_size:
-                            bytes_written += self._write_extended_match()
-                            return bytes_written
-                        continue
-                    # We've found the end of the match
+                    # Reached window boundary - emit match (no wrap-around, only 0.02% compression loss)
                     bytes_written += self._write_extended_match()
                     return bytes_written
-                else:
-                    # Search the remainder of the window buffer.
-                    target = self._window_buffer.get(self._extended_match_position, self._extended_match_count)
-                    target += bytes([self._input_buffer[0]])
-                    search_i, match = self._search(target, start=self._extended_match_position)
-                    match_size = len(match)
-                    if match_size > self._extended_match_count:
-                        self._input_buffer.popleft()
-                        self._extended_match_count = match_size
-                        self._extended_match_position = search_i
-                        if self._extended_match_count == self.max_pattern_size:
-                            bytes_written += self._write_extended_match()
-                            return bytes_written
-                        continue
-                    else:
-                        # We've found the end of the match
+
+                # Search the remainder of the window buffer for a longer match.
+                target = self._window_buffer.get(self._extended_match_position, self._extended_match_count)
+                target += bytes([self._input_buffer[0]])
+                search_i, match = self._search(target, start=self._extended_match_position)
+                match_size = len(match)
+                if match_size > self._extended_match_count:
+                    self._input_buffer.popleft()
+                    self._extended_match_count = match_size
+                    self._extended_match_position = search_i
+                    if self._extended_match_count == self.max_pattern_size:
                         bytes_written += self._write_extended_match()
                         return bytes_written
-            else:
-                # We ran out of input_buffer, return so caller can re-populate the input_buffer
-                return bytes_written
+                    continue
+                else:
+                    # We've found the end of the match
+                    bytes_written += self._write_extended_match()
+                    return bytes_written
+
+            # We ran out of input_buffer, return so caller can re-populate the input_buffer
+            return bytes_written
 
         target = bytes(self._input_buffer)
         search_i = 0

From e503cddd286eca6451ee5c989ee9fde147988222 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 08:27:12 -0500
Subject: [PATCH 004/109] Prepare cython bindings for v2 flag.

---
 tamp/_c_compressor.pyx | 4 ++++
 tamp/ctamp.pxd         | 1 +
 2 files changed, 5 insertions(+)

diff --git a/tamp/_c_compressor.pyx b/tamp/_c_compressor.pyx
index a3e403de..ae690257 100644
--- a/tamp/_c_compressor.pyx
+++ b/tamp/_c_compressor.pyx
@@ -35,9 +35,13 @@ cdef class Compressor:
         int literal=8,
         dictionary=None,
         bool lazy_matching=False,
+        bool v2=False,
     ):
         cdef ctamp.TampConf conf
 
+        if v2:
+            raise NotImplementedError("v2 compression not yet supported in C compressor. Use --v1 flag or --implementation python.")
+
         if dictionary and bit_size(len(dictionary) - 1) != window:
             raise ValueError("Dictionary-window size mismatch.")
 
diff --git a/tamp/ctamp.pxd b/tamp/ctamp.pxd
index 087ff62c..cd563ee1 100644
--- a/tamp/ctamp.pxd
+++ b/tamp/ctamp.pxd
@@ -6,6 +6,7 @@ cdef extern from "tamp/common.h":
         int window
         int literal
         bool use_custom_dictionary
+        bool v2  # v2 format (RLE, extended match). Read from header bit [1].
         # The lazy_matching field is conditionally compiled based on TAMP_LAZY_MATCHING
         # We declare it here, but accessing it when the macro is disabled will cause compile errors
         # This is handled in the Cython code by always setting it when the struct is initialized

From 8dba220ef02ce34a6a0b4143ce085bbb5fd046ec Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 08:28:38 -0500
Subject: [PATCH 005/109] add missing extended-match-count flush.

---
 tamp/compressor.py       |  2 ++
 tests/test_cli.py        |  4 ++--
 tests/test_compressor.py | 18 +++++++++---------
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/tamp/compressor.py b/tamp/compressor.py
index 722bfbca..7b4b2a79 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -535,6 +535,8 @@ def flush(self, write_token: bool = True) -> int:
             bytes_written += self._compress_input_buffer_single()
         if self.v2 and self._rle_count:
             bytes_written += self._write_rle()
+        if self.v2 and self._extended_match_count:
+            bytes_written += self._write_extended_match()
 
         # Clear any cached lazy matching state
         if self.lazy_matching:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index c23917dd..428431e5 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -51,7 +51,7 @@ def test_compress_file_to_stdout(self):
             test_file.write_bytes(b"foo foo foo")
 
             with patch("sys.stdout.buffer.write") as mock_stdout:
-                app(["compress", str(test_file)], **_app_kwargs)
+                app(["compress", "--v1", str(test_file)], **_app_kwargs)
                 mock_stdout.assert_called_once_with(compressed_foo_foo_foo)
 
     def test_compress_stdin_to_stdout(self):
@@ -59,7 +59,7 @@ def test_compress_stdin_to_stdout(self):
             patch("sys.stdout.buffer.write") as mock_stdout,
             patch("sys.stdin.buffer.read", return_value="foo foo foo"),
         ):
-            app("compress", **_app_kwargs)
+            app(["compress", "--v1"], **_app_kwargs)
             mock_stdout.assert_called_once_with(compressed_foo_foo_foo)
 
     def test_decompress_file_to_stdout(self):
diff --git a/tests/test_compressor.py b/tests/test_compressor.py
index 188447c0..00b0ceac 100644
--- a/tests/test_compressor.py
+++ b/tests/test_compressor.py
@@ -94,7 +94,7 @@ def test_compressor_default(self):
 
                 bytes_written = 0
                 with io.BytesIO() as f:
-                    compressor = Compressor(f)
+                    compressor = Compressor(f, v2=False)
                     bytes_written += compressor.write(test_string)
                     bytes_written += compressor.flush(write_token=False)
 
@@ -106,7 +106,7 @@ def test_compressor_default(self):
 
                 # Test Context Manager
                 bytes_written = 0
-                with io.BytesIO() as f, Compressor(f) as compressor:
+                with io.BytesIO() as f, Compressor(f, v2=False) as compressor:
                     bytes_written += compressor.write(test_string)
                     bytes_written += compressor.flush(write_token=False)
 
@@ -137,7 +137,7 @@ def test_compressor_input_buffer(self):
                 )
 
                 with io.BytesIO() as f:
-                    compressor = Compressor(f)
+                    compressor = Compressor(f, v2=False)
                     compressor.write(b"f")
                     compressor.write(b"oo")
                     compressor.write(b" fo")
@@ -171,7 +171,7 @@ def test_compressor_7bit(self):
                     # fmt: on
                 )
                 with io.BytesIO() as f:
-                    compressor = Compressor(f, literal=7)
+                    compressor = Compressor(f, literal=7, v2=False)
                     compressor.write(test_string)
                     compressor.flush(write_token=False)
 
@@ -200,7 +200,7 @@ def test_compressor_predefined_dictionary(self):
                 )
 
                 with io.BytesIO() as f:
-                    compressor = Compressor(f, window=8, literal=7, dictionary=dictionary)
+                    compressor = Compressor(f, window=8, literal=7, dictionary=dictionary, v2=False)
                     compressor.write(test_string)
                     compressor.flush(write_token=False)
 
@@ -223,7 +223,7 @@ def test_oob_2_byte_pattern(self):
                 test_string = memoryview(test_string_extended)[:3]  # b"Q\x00Q"
 
                 with io.BytesIO() as f:
-                    compressor = Compressor(f)
+                    compressor = Compressor(f, v2=False)
                     compressor.write(test_string)
                     compressor.flush(write_token=False)
 
@@ -245,7 +245,7 @@ def test_oob_2_byte_pattern(self):
     def test_excess_bits(self):
         for Compressor in Compressors:
             with self.subTest(Compressor=Compressor), io.BytesIO() as f:
-                compressor = Compressor(f, literal=7)
+                compressor = Compressor(f, literal=7, v2=False)
 
                 with self.assertRaises((ExcessBitsError, NativeExcessBitsError)):
                     compressor.write(b"\xff")
@@ -271,7 +271,7 @@ def test_single_shot_compress_text(self):
                     ]
                     # fmt: on
                 )
-                self.assertEqual(compress("foo foo foo"), expected)
+                self.assertEqual(compress("foo foo foo", v2=False), expected)
 
     def test_single_shot_compress_binary(self):
         for compress in compresses:
@@ -293,7 +293,7 @@ def test_single_shot_compress_binary(self):
                     ]
                     # fmt: on
                 )
-                self.assertEqual(compress(b"foo foo foo"), expected)
+                self.assertEqual(compress(b"foo foo foo", v2=False), expected)
 
     def test_invalid_conf(self):
         for Compressor in Compressors:

From bd7762dcc863b2fa2440594310a1f8b448f1c12f Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 10:42:05 -0500
Subject: [PATCH 006/109] common.h: add TampConf.v2 attribute and associated
 macros

---
 tamp/_c_src/tamp/common.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tamp/_c_src/tamp/common.h b/tamp/_c_src/tamp/common.h
index 0b2b8e47..089b7769 100644
--- a/tamp/_c_src/tamp/common.h
+++ b/tamp/_c_src/tamp/common.h
@@ -68,6 +68,26 @@ extern "C" {
 #define TAMP_STREAM_WORK_BUFFER_SIZE 32
 #endif
 
+/* V2 format support (RLE, extended match).
+ * Enabled by default. Disable to save code size on minimal builds.
+ * Separate flags allow decompressor-only or compressor-only v2 support.
+ */
+#ifndef TAMP_V2_DECOMPRESS
+#define TAMP_V2_DECOMPRESS 1
+#endif
+#ifndef TAMP_V2_COMPRESS
+#define TAMP_V2_COMPRESS 1
+#endif
+
+/* V2 encoding constants */
+#if TAMP_V2_DECOMPRESS || TAMP_V2_COMPRESS
+#define TAMP_RLE_SYMBOL 12
+#define TAMP_EXTENDED_MATCH_SYMBOL 13
+#define TAMP_LEADING_EXTENDED_MATCH_BITS 3
+#define TAMP_LEADING_RLE_BITS 4
+#define TAMP_RLE_MAX_WINDOW 8
+#endif
+
 enum {
     /* Normal/Recoverable status >= 0 */
     TAMP_OK = 0,
@@ -93,6 +113,7 @@ typedef struct TampConf {
     uint16_t window : 4;                 // number of window bits
     uint16_t literal : 4;                // number of literal bits
     uint16_t use_custom_dictionary : 1;  // Use a custom initialized dictionary.
+    uint16_t v2 : 1;                     // v2 format (RLE, extended match). Read from header bit [1].
 #if TAMP_LAZY_MATCHING
     uint16_t lazy_matching : 1;  // use Lazy Matching (spend 50-75% more CPU for around 0.5-2.0% better compression.)
                                  // only effects compression operations.

From 0b208fffae32b53ba3f005ad8207117c7004dd53 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 14:04:54 -0500
Subject: [PATCH 007/109] wip c decompressor

---
 tamp/_c_src/tamp/decompressor.c | 309 ++++++++++++++++++++++++++++++--
 tamp/_c_src/tamp/decompressor.h |  12 ++
 2 files changed, 310 insertions(+), 11 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index a7c0baaa..aa1999e8 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -7,6 +7,23 @@
 
 #define FLUSH 15
 
+#if TAMP_V2_DECOMPRESS
+/* Pending symbol states for v2 decode suspend/resume.
+ * - 0: No pending operation
+ * - 12: RLE - pending_ext_huffman holds partial huffman result or 0xFF for fresh/output-resume
+ * - 13: Extended match - fresh (need window_offset)
+ * - 14: Extended match - have window_offset in pending_window_offset (need huffman+trailing)
+ * - 15: Extended match - have window_offset and huffman (need trailing only)
+ * When skip_bytes > 0, we're resuming after output-full with full decode saved.
+ */
+#define PENDING_NONE 0
+#define PENDING_RLE 12
+#define PENDING_EXT_FRESH 13
+#define PENDING_EXT_HAVE_OFFSET 14
+#define PENDING_EXT_HAVE_HUFFMAN 15
+#define PARTIAL_STATE_NONE 0xFF
+#endif
+
 /**
  * This array was generated with tools/huffman_jump_table.py
  *
@@ -51,6 +68,197 @@ static inline int8_t huffman_decode(uint32_t *bit_buffer, uint8_t *bit_buffer_po
     return code & 0xF;
 }
 
+#if TAMP_V2_DECOMPRESS
+/**
+ * @brief Decode huffman symbol + trailing bits from bit buffer.
+ *
+ * Shared helper for RLE and extended match decoding.
+ * Uses pending_ext_huffman to track partial decode state.
+ *
+ * @param d Decompressor state
+ * @param trailing_bits Number of trailing bits to read (3 or 4)
+ * @param result Output: (huffman << trailing_bits) + trailing
+ * @return TAMP_OK on success, TAMP_INPUT_EXHAUSTED if more bits needed
+ */
+static tamp_res decode_huffman_trailing(TampDecompressor *d, uint8_t trailing_bits, uint16_t *result) {
+    uint32_t bit_buffer = d->bit_buffer;
+    uint8_t bit_buffer_pos = d->bit_buffer_pos;
+    int8_t huffman_value;
+
+    if (d->pending_ext_huffman != PARTIAL_STATE_NONE) {
+        huffman_value = d->pending_ext_huffman;
+    } else {
+        if (TAMP_UNLIKELY(bit_buffer_pos < 1)) return TAMP_INPUT_EXHAUSTED;
+        huffman_value = huffman_decode(&bit_buffer, &bit_buffer_pos);
+    }
+
+    if (TAMP_UNLIKELY(bit_buffer_pos < trailing_bits)) {
+        d->bit_buffer = bit_buffer;
+        d->bit_buffer_pos = bit_buffer_pos;
+        d->pending_ext_huffman = huffman_value;
+        return TAMP_INPUT_EXHAUSTED;
+    }
+
+    uint8_t trailing = bit_buffer >> (32 - trailing_bits);
+    bit_buffer <<= trailing_bits;
+    bit_buffer_pos -= trailing_bits;
+
+    *result = (huffman_value << trailing_bits) + trailing;
+
+    d->bit_buffer = bit_buffer;
+    d->bit_buffer_pos = bit_buffer_pos;
+    d->pending_ext_huffman = PARTIAL_STATE_NONE;
+
+    return TAMP_OK;
+}
+
+/**
+ * @brief Decode RLE token and write repeated bytes to output.
+ */
+static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
+                           size_t *output_written_size, uint16_t window_mask) {
+    uint16_t rle_count;
+    uint16_t skip = d->skip_bytes;
+
+    if (skip > 0) {
+        rle_count = d->pending_window_offset;
+    } else {
+        uint16_t raw;
+        tamp_res res = decode_huffman_trailing(d, TAMP_LEADING_RLE_BITS, &raw);
+        if (res != TAMP_OK) return res;
+        rle_count = raw + 2;
+    }
+
+    /* Get the byte to repeat (last written byte) */
+    uint16_t prev_pos = (d->window_pos == 0) ? window_mask : (d->window_pos - 1);
+    uint8_t symbol = d->window[prev_pos];
+
+    /* Calculate how many to write this call */
+    uint16_t remaining_count = rle_count - skip;
+    size_t output_space = output_end - *output;
+    uint16_t to_write;
+
+    if (TAMP_UNLIKELY(remaining_count > output_space)) {
+        /* Partial write */
+        to_write = output_space;
+        d->skip_bytes = skip + output_space;
+        d->pending_symbol = PENDING_RLE;
+        d->pending_window_offset = rle_count;
+    } else {
+        /* Complete write */
+        to_write = remaining_count;
+        d->skip_bytes = 0;
+        d->pending_symbol = PENDING_NONE;
+    }
+
+    /* Write repeated bytes to output */
+    for (uint16_t i = 0; i < to_write; i++) {
+        *(*output)++ = symbol;
+    }
+    *output_written_size += to_write;
+
+    /* Update window only on first chunk (skip==0) and not after another RLE */
+    if (skip == 0 && !d->rle_last_written) {
+        uint16_t window_write = (rle_count < TAMP_RLE_MAX_WINDOW) ? rle_count : TAMP_RLE_MAX_WINDOW;
+        for (uint16_t i = 0; i < window_write; i++) {
+            d->window[d->window_pos] = symbol;
+            d->window_pos = (d->window_pos + 1) & window_mask;
+        }
+    }
+    d->rle_last_written = 1;
+
+    return (d->pending_symbol == PENDING_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
+}
+
+/**
+ * @brief Decode extended match token and copy from window to output.
+ */
+static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
+                                      size_t *output_written_size, uint8_t conf_window, uint8_t min_pattern_size,
+                                      uint16_t window_mask) {
+    uint16_t window_offset;
+    uint16_t match_size;
+    uint16_t skip = d->skip_bytes;
+    uint8_t pending = d->pending_symbol;
+
+    if (skip > 0) {
+        /* Resume from output-full: window_offset and match_size already saved */
+        window_offset = d->pending_window_offset;
+        match_size = d->pending_ext_huffman;
+    } else {
+        /* Step 1: Get window_offset (saved or decode fresh) */
+        if (pending >= PENDING_EXT_HAVE_OFFSET) {
+            window_offset = d->pending_window_offset;
+        } else {
+            if (TAMP_UNLIKELY(d->bit_buffer_pos < conf_window)) return TAMP_INPUT_EXHAUSTED;
+            window_offset = d->bit_buffer >> (32 - conf_window);
+            d->bit_buffer <<= conf_window;
+            d->bit_buffer_pos -= conf_window;
+            /* Save window_offset in case huffman+trailing needs more input */
+            d->pending_window_offset = window_offset;
+        }
+
+        /* Step 2: Decode huffman + trailing bits */
+        uint16_t raw;
+        tamp_res res = decode_huffman_trailing(d, TAMP_LEADING_EXTENDED_MATCH_BITS, &raw);
+        if (res != TAMP_OK) {
+            /* Update pending_symbol based on where we stopped */
+            d->pending_symbol =
+                (d->pending_ext_huffman != PARTIAL_STATE_NONE) ? PENDING_EXT_HAVE_HUFFMAN : PENDING_EXT_HAVE_OFFSET;
+            return res;
+        }
+
+        match_size = raw + min_pattern_size + 12;
+    }
+
+    /* Security check: validate window bounds */
+    const uint32_t window_size = (1u << conf_window);
+    if (TAMP_UNLIKELY((uint32_t)window_offset >= window_size ||
+                      (uint32_t)window_offset + (uint32_t)match_size > window_size)) {
+        return TAMP_OOB;
+    }
+
+    /* Calculate how many to write this call */
+    uint16_t remaining_count = match_size - skip;
+    size_t output_space = output_end - *output;
+    uint16_t to_write;
+
+    if (TAMP_UNLIKELY(remaining_count > output_space)) {
+        /* Partial write */
+        to_write = output_space;
+        d->skip_bytes = skip + output_space;
+        d->pending_symbol = PENDING_EXT_FRESH;
+        d->pending_window_offset = window_offset;
+        d->pending_ext_huffman = match_size;
+    } else {
+        /* Complete write */
+        to_write = remaining_count;
+        d->skip_bytes = 0;
+        d->pending_symbol = PENDING_NONE;
+    }
+
+    /* Copy from window to output */
+    uint16_t src_offset = window_offset + skip;
+    for (uint16_t i = 0; i < to_write; i++) {
+        *(*output)++ = d->window[src_offset + i];
+    }
+    *output_written_size += to_write;
+
+    /* Update window only on complete decode */
+    if (d->pending_symbol == PENDING_NONE) {
+        uint16_t wp = d->window_pos;
+        for (uint16_t i = 0; i < match_size; i++) {
+            d->window[wp] = d->window[(window_offset + i) & window_mask];
+            wp = (wp + 1) & window_mask;
+        }
+        d->window_pos = wp;
+        d->rle_last_written = 0;
+    }
+
+    return (d->pending_symbol == PENDING_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
+}
+#endif /* TAMP_V2_DECOMPRESS */
+
 /**
  * @brief Copy pattern from window to window, updating window_pos.
  *
@@ -84,13 +292,13 @@ tamp_res tamp_decompressor_read_header(TampConf *conf, const unsigned char *inpu
                                        size_t *input_consumed_size) {
     if (input_consumed_size) (*input_consumed_size) = 0;
     if (input_size == 0) return TAMP_INPUT_EXHAUSTED;
-    if (input[0] & 0x2) return TAMP_INVALID_CONF;  // Reserved
     if (input[0] & 0x1) return TAMP_INVALID_CONF;  // Currently only a single header byte is supported.
     if (input_consumed_size) (*input_consumed_size)++;
 
     conf->window = ((input[0] >> 5) & 0x7) + 8;
     conf->literal = ((input[0] >> 3) & 0x3) + 5;
     conf->use_custom_dictionary = ((input[0] >> 2) & 0x1);
+    conf->v2 = ((input[0] >> 1) & 0x1);
 
     return TAMP_OK;
 }
@@ -101,7 +309,8 @@ tamp_res tamp_decompressor_read_header(TampConf *conf, const unsigned char *inpu
  *   * window_bits_max
  */
 static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor *decompressor, uint8_t conf_window,
-                                                     uint8_t conf_literal, uint8_t conf_use_custom_dictionary) {
+                                                     uint8_t conf_literal, uint8_t conf_use_custom_dictionary,
+                                                     uint8_t conf_v2) {
     if (conf_window < 8 || conf_window > 15) return TAMP_INVALID_CONF;
     if (conf_literal < 5 || conf_literal > 8) return TAMP_INVALID_CONF;
     if (conf_window > decompressor->window_bits_max) return TAMP_INVALID_CONF;
@@ -111,6 +320,11 @@ static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor *decompres
     decompressor->conf_literal = conf_literal;
     decompressor->min_pattern_size = tamp_compute_min_pattern_size(conf_window, conf_literal);
     decompressor->configured = true;
+#if TAMP_V2_DECOMPRESS
+    decompressor->conf_v2 = conf_v2;
+#else
+    (void)conf_v2;
+#endif
 
     return TAMP_OK;
 }
@@ -128,7 +342,7 @@ tamp_res tamp_decompressor_init(TampDecompressor *decompressor, const TampConf *
     decompressor->window_bits_max = window_bits;
     if (conf) {
         res = tamp_decompressor_populate_from_conf(decompressor, conf->window, conf->literal,
-                                                   conf->use_custom_dictionary);
+                                                   conf->use_custom_dictionary, conf->v2);
     }
 
     return res;
@@ -156,7 +370,8 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
         res = tamp_decompressor_read_header(&conf, input, input_end - input, &header_consumed_size);
         if (res != TAMP_OK) return res;
 
-        res = tamp_decompressor_populate_from_conf(decompressor, conf.window, conf.literal, conf.use_custom_dictionary);
+        res = tamp_decompressor_populate_from_conf(decompressor, conf.window, conf.literal, conf.use_custom_dictionary,
+                                                   conf.v2);
         if (res != TAMP_OK) return res;
 
         input += header_consumed_size;
@@ -169,15 +384,48 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
     const uint8_t min_pattern_size = decompressor->min_pattern_size;
 
     const uint16_t window_mask = (1 << conf_window) - 1;
-    while (input != input_end || decompressor->bit_buffer_pos) {
+#if TAMP_V2_DECOMPRESS
+    const bool v2_enabled = decompressor->conf_v2;
+#endif
+
+/* Macro to refill bit buffer from input. Used before returning TAMP_INPUT_EXHAUSTED
+ * to ensure we consume all available input first. */
+#define REFILL()                                                                               \
+    while (input != input_end && decompressor->bit_buffer_pos <= 24) {                         \
+        decompressor->bit_buffer_pos += 8;                                                     \
+        decompressor->bit_buffer |= (uint32_t) * input << (32 - decompressor->bit_buffer_pos); \
+        input++;                                                                               \
+        (*input_consumed_size)++;                                                              \
+    }
+
+    while (input != input_end || decompressor->bit_buffer_pos
+#if TAMP_V2_DECOMPRESS
+           || decompressor->pending_symbol
+#endif
+    ) {
         // Populate the bit buffer
-        while (input != input_end && decompressor->bit_buffer_pos <= 24) {
-            uint32_t t = *input;
-            decompressor->bit_buffer_pos += 8;
-            decompressor->bit_buffer |= t << (32 - decompressor->bit_buffer_pos);
-            input++;
-            (*input_consumed_size)++;
+        REFILL();
+
+#if TAMP_V2_DECOMPRESS
+        /* Resume pending v2 operation. Retry after refill if helper needs more bits. */
+        if (TAMP_UNLIKELY(decompressor->pending_symbol)) {
+            if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL;
+            tamp_res v2_res;
+            if (decompressor->pending_symbol == PENDING_RLE) {
+                v2_res = decode_rle(decompressor, &output, output_end, output_written_size, window_mask);
+            } else {
+                v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size, conf_window,
+                                               min_pattern_size, window_mask);
+            }
+            if (v2_res == TAMP_INPUT_EXHAUSTED) {
+                REFILL();
+                if (input == input_end) return TAMP_INPUT_EXHAUSTED;
+                continue; /* Retry with refilled buffer */
+            }
+            if (v2_res != TAMP_OK) return v2_res;
+            continue;
         }
+#endif
 
         if (TAMP_UNLIKELY(decompressor->bit_buffer_pos == 0)) return TAMP_INPUT_EXHAUSTED;
 
@@ -200,6 +448,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
 
             output++;
             (*output_written_size)++;
+#if TAMP_V2_DECOMPRESS
+            decompressor->rle_last_written = 0;
+#endif
         } else {
             // is token; attempt a decode
             /* copy the bit buffers so that we can abort at any time */
@@ -225,6 +476,39 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
                     bit_buffer_pos & ~7;  // Round bit_buffer_pos down to nearest multiple of 8.
                 continue;
             }
+
+#if TAMP_V2_DECOMPRESS
+            /* Check for v2 symbols */
+            if (TAMP_UNLIKELY(v2_enabled && match_size >= TAMP_RLE_SYMBOL)) {
+                /* Commit bit buffer and set pending symbol before calling helper.
+                 * Initialize partial state fields to indicate fresh decode. */
+                decompressor->bit_buffer = bit_buffer;
+                decompressor->bit_buffer_pos = bit_buffer_pos;
+                decompressor->pending_window_offset = 0;
+                decompressor->pending_ext_huffman = PARTIAL_STATE_NONE;
+
+                tamp_res v2_res;
+                if (match_size == TAMP_RLE_SYMBOL) {
+                    decompressor->pending_symbol = PENDING_RLE;
+                    v2_res = decode_rle(decompressor, &output, output_end, output_written_size, window_mask);
+                } else if (match_size == TAMP_EXTENDED_MATCH_SYMBOL) {
+                    decompressor->pending_symbol = PENDING_EXT_FRESH;
+                    v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size, conf_window,
+                                                   min_pattern_size, window_mask);
+                } else {
+                    return TAMP_ERROR; /* Invalid v2 symbol */
+                }
+                /* On success, helper clears pending_symbol; on error, it stays set for resume */
+                if (v2_res == TAMP_INPUT_EXHAUSTED) {
+                    REFILL();
+                    if (input == input_end) return TAMP_INPUT_EXHAUSTED;
+                    continue; /* Retry with refilled buffer */
+                }
+                if (v2_res != TAMP_OK) return v2_res;
+                continue;
+            }
+#endif
+
             if (TAMP_UNLIKELY(bit_buffer_pos < conf_window)) {
                 // There are not enough bits to decode window offset
                 return TAMP_INPUT_EXHAUSTED;
@@ -270,6 +554,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
                 uint16_t wp = decompressor->window_pos;
                 window_copy(decompressor->window, &wp, window_offset, match_size, window_mask);
                 decompressor->window_pos = wp;
+#if TAMP_V2_DECOMPRESS
+                decompressor->rle_last_written = 0;
+#endif
             }
         }
         if (TAMP_UNLIKELY(callback && (res = callback(user_data, *output_written_size, input_size))))
diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h
index 1608a6d5..a7c8a58e 100644
--- a/tamp/_c_src/tamp/decompressor.h
+++ b/tamp/_c_src/tamp/decompressor.h
@@ -17,17 +17,29 @@ typedef struct {
     uint32_t bit_buffer;     // Bit buffer for reading compressed data (32 bits)
     uint16_t window_pos;     // Current position in window (15 bits)
     uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits)
+#if TAMP_V2_DECOMPRESS
+    uint8_t pending_symbol;          // State machine: 0=none, 12=RLE, 13=ext match need window,
+                                     // 14=ext match need length, 15=ext match need raw bits
+    uint16_t pending_window_offset;  // Saved window_offset for extended match resume
+    uint8_t pending_ext_huffman;     // Saved ext_huffman for extended match resume (state 15)
+#endif
 
     /* WARM: read once at start of decompress, cached in locals */
     uint8_t conf_window : 4;       // Window bits from config
     uint8_t conf_literal : 4;      // Literal bits from config
     uint8_t min_pattern_size : 2;  // Minimum pattern size, 2 or 3
+#if TAMP_V2_DECOMPRESS
+    uint8_t conf_v2 : 1;  // v2 format enabled (from header)
+#endif
 
     /* COLD: rarely accessed (init or edge cases).
      * Bitfields save space; add new cold fields here. */
     uint8_t skip_bytes : 4;       // For output-buffer-limited resumption
     uint8_t window_bits_max : 4;  // Max window bits buffer can hold
     uint8_t configured : 1;       // Whether config has been set
+#if TAMP_V2_DECOMPRESS
+    uint8_t rle_last_written : 1;  // Previous write was RLE (skip window re-write)
+#endif
 } TampDecompressor;
 
 /**

From 5dbcefbe433a9fe7aebd6f26a4fcf57b4911782a Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 14:33:28 -0500
Subject: [PATCH 008/109] swap window/extended-match length in encoding

---
 Makefile                        | 130 ++++++++++++++++++--------------
 tamp/_c_src/tamp/decompressor.c | 109 +++++++++++++-------------
 tamp/_c_src/tamp/decompressor.h |   9 +--
 tamp/compressor.py              |   3 +-
 tamp/decompressor.py            |   3 +-
 5 files changed, 135 insertions(+), 119 deletions(-)

diff --git a/Makefile b/Makefile
index 7b668924..0949b15b 100644
--- a/Makefile
+++ b/Makefile
@@ -182,7 +182,8 @@ build/enwik8-100kb: download-enwik8
 	@head -c 100000 datasets/enwik8 > build/enwik8-100kb
 
 build/enwik8-100kb.tamp: build/enwik8-100kb
-	@poetry run tamp compress build/enwik8-100kb -o build/enwik8-100kb.tamp
+	@# Use Python implementation until C compressor supports v2
+	@poetry run tamp compress --implementation=python build/enwik8-100kb -o build/enwik8-100kb.tamp
 
 download-micropython:
 	mkdir -p datasets
@@ -490,7 +491,7 @@ tamp-c-library: build/tamp.a
 # Binary Sizes
 ###############
 # Generate binary size information for README table (armv6m with -O3).
-.PHONY: binary-size c-size
+.PHONY: binary-size c-size c-size-v1 c-size-v2
 
 ARM_CC := arm-none-eabi-gcc
 ARM_AR := arm-none-eabi-ar
@@ -501,61 +502,76 @@ C_SRC_COMMON = tamp/_c_src/tamp/common.c
 C_SRC_COMP = tamp/_c_src/tamp/compressor.c
 C_SRC_DECOMP = tamp/_c_src/tamp/decompressor.c
 
-# Build compressor-only library (without stream API)
-build/arm/tamp_comp.a: $(C_SRC_COMMON) $(C_SRC_COMP)
-	@mkdir -p build/arm
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common_c.o
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
-	$(ARM_AR) rcs $@ build/arm/common_c.o build/arm/compressor.o
-
-# Build decompressor-only library (without stream API)
-build/arm/tamp_decomp.a: $(C_SRC_COMMON) $(C_SRC_DECOMP)
-	@mkdir -p build/arm
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common_d.o
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
-	$(ARM_AR) rcs $@ build/arm/common_d.o build/arm/decompressor.o
-
-# Build full library (without stream API)
-build/arm/tamp_full.a: $(C_SRC_COMMON) $(C_SRC_COMP) $(C_SRC_DECOMP)
-	@mkdir -p build/arm
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common_f.o
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor_f.o
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor_f.o
-	$(ARM_AR) rcs $@ build/arm/common_f.o build/arm/compressor_f.o build/arm/decompressor_f.o
-
-# Build compressor-only library (with stream API, the default)
-build/arm/tamp_comp_stream.a: $(C_SRC_COMMON) $(C_SRC_COMP)
-	@mkdir -p build/arm
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common_cs.o
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor_s.o
-	$(ARM_AR) rcs $@ build/arm/common_cs.o build/arm/compressor_s.o
-
-# Build decompressor-only library (with stream API, the default)
-build/arm/tamp_decomp_stream.a: $(C_SRC_COMMON) $(C_SRC_DECOMP)
-	@mkdir -p build/arm
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common_ds.o
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor_s.o
-	$(ARM_AR) rcs $@ build/arm/common_ds.o build/arm/decompressor_s.o
-
-# Build full library (with stream API, the default)
-build/arm/tamp_full_stream.a: $(C_SRC_COMMON) $(C_SRC_COMP) $(C_SRC_DECOMP)
-	@mkdir -p build/arm
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common_fs.o
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor_fs.o
-	$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor_fs.o
-	$(ARM_AR) rcs $@ build/arm/common_fs.o build/arm/compressor_fs.o build/arm/decompressor_fs.o
-
-c-size:
-	@rm -rf build/arm
-	@$(MAKE) --no-print-directory build/arm/tamp_comp_stream.a build/arm/tamp_decomp_stream.a build/arm/tamp_full_stream.a build/arm/tamp_comp.a build/arm/tamp_decomp.a build/arm/tamp_full.a
-	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/tamp_comp.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/tamp_decomp.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_full=$$($(ARM_SIZE) -B --totals build/arm/tamp_full.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \
-	printf 'Tamp (C, -DTAMP_STREAM=0)  %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
-	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/tamp_comp_stream.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/tamp_decomp_stream.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_full=$$($(ARM_SIZE) -B --totals build/arm/tamp_full_stream.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \
-	printf 'Tamp (C)                   %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
+# V2 flags to disable v2 support
+V1_FLAGS = -DTAMP_V2_COMPRESS=0 -DTAMP_V2_DECOMPRESS=0
+
+c-size-v1:
+	@rm -rf build/arm && mkdir -p build/arm
+	@# v1 without stream API
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_AR) rcs build/arm/v1_comp.a build/arm/common.o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/v1_decomp.a build/arm/common.o build/arm/decompressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/v1_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
+	@# v1 with stream API
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_AR) rcs build/arm/v1_comp_s.a build/arm/common.o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/v1_decomp_s.a build/arm/common.o build/arm/decompressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/v1_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
+	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/v1_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/v1_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_full=$$($(ARM_SIZE) -B --totals build/arm/v1_full.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	printf 'Tamp v1 (C, no stream)     %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
+	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/v1_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/v1_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_full=$$($(ARM_SIZE) -B --totals build/arm/v1_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	printf 'Tamp v1 (C)                %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
+
+c-size-v2:
+	@rm -rf build/arm && mkdir -p build/arm
+	@# v2 without stream API
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_AR) rcs build/arm/v2_comp.a build/arm/common.o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/v2_decomp.a build/arm/common.o build/arm/decompressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/v2_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
+	@# v2 with stream API
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_AR) rcs build/arm/v2_comp_s.a build/arm/common.o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/v2_decomp_s.a build/arm/common.o build/arm/decompressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/v2_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
+	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/v2_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/v2_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_full=$$($(ARM_SIZE) -B --totals build/arm/v2_full.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	printf 'Tamp v2 (C, no stream)     %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
+	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/v2_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/v2_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_full=$$($(ARM_SIZE) -B --totals build/arm/v2_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	printf 'Tamp v2 (C)                %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
+
+c-size: c-size-v1 c-size-v2
 
 binary-size:
 	@echo "Binary sizes for armv6m (bytes):"
diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index aa1999e8..690fa554 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -10,18 +10,15 @@
 #if TAMP_V2_DECOMPRESS
 /* Pending symbol states for v2 decode suspend/resume.
  * - 0: No pending operation
- * - 12: RLE - pending_ext_huffman holds partial huffman result or 0xFF for fresh/output-resume
- * - 13: Extended match - fresh (need window_offset)
- * - 14: Extended match - have window_offset in pending_window_offset (need huffman+trailing)
- * - 15: Extended match - have window_offset and huffman (need trailing only)
+ * - 12: RLE (fresh or resume based on skip_bytes)
+ * - 13: Extended match - have match_size, need window_offset
+ * - 14: Extended match - fresh decode (need both size and offset)
  * When skip_bytes > 0, we're resuming after output-full with full decode saved.
  */
 #define PENDING_NONE 0
 #define PENDING_RLE 12
-#define PENDING_EXT_FRESH 13
-#define PENDING_EXT_HAVE_OFFSET 14
-#define PENDING_EXT_HAVE_HUFFMAN 15
-#define PARTIAL_STATE_NONE 0xFF
+#define PENDING_EXT_NEED_OFFSET 13
+#define PENDING_EXT_FRESH 14
 #endif
 
 /**
@@ -72,8 +69,8 @@ static inline int8_t huffman_decode(uint32_t *bit_buffer, uint8_t *bit_buffer_po
 /**
  * @brief Decode huffman symbol + trailing bits from bit buffer.
  *
- * Shared helper for RLE and extended match decoding.
- * Uses pending_ext_huffman to track partial decode state.
+ * Simple helper that decodes from local copies. On failure, decompressor
+ * state is not modified. Caller is responsible for state management.
  *
  * @param d Decompressor state
  * @param trailing_bits Number of trailing bits to read (3 or 4)
@@ -83,21 +80,13 @@ static inline int8_t huffman_decode(uint32_t *bit_buffer, uint8_t *bit_buffer_po
 static tamp_res decode_huffman_trailing(TampDecompressor *d, uint8_t trailing_bits, uint16_t *result) {
     uint32_t bit_buffer = d->bit_buffer;
     uint8_t bit_buffer_pos = d->bit_buffer_pos;
-    int8_t huffman_value;
 
-    if (d->pending_ext_huffman != PARTIAL_STATE_NONE) {
-        huffman_value = d->pending_ext_huffman;
-    } else {
-        if (TAMP_UNLIKELY(bit_buffer_pos < 1)) return TAMP_INPUT_EXHAUSTED;
-        huffman_value = huffman_decode(&bit_buffer, &bit_buffer_pos);
-    }
+    /* Need at least 1 bit for huffman, plus trailing bits */
+    if (TAMP_UNLIKELY(bit_buffer_pos < 1 + trailing_bits)) return TAMP_INPUT_EXHAUSTED;
 
-    if (TAMP_UNLIKELY(bit_buffer_pos < trailing_bits)) {
-        d->bit_buffer = bit_buffer;
-        d->bit_buffer_pos = bit_buffer_pos;
-        d->pending_ext_huffman = huffman_value;
-        return TAMP_INPUT_EXHAUSTED;
-    }
+    int8_t huffman_value = huffman_decode(&bit_buffer, &bit_buffer_pos);
+
+    if (TAMP_UNLIKELY(bit_buffer_pos < trailing_bits)) return TAMP_INPUT_EXHAUSTED;
 
     uint8_t trailing = bit_buffer >> (32 - trailing_bits);
     bit_buffer <<= trailing_bits;
@@ -105,15 +94,18 @@ static tamp_res decode_huffman_trailing(TampDecompressor *d, uint8_t trailing_bi
 
     *result = (huffman_value << trailing_bits) + trailing;
 
+    /* Commit only on success */
     d->bit_buffer = bit_buffer;
     d->bit_buffer_pos = bit_buffer_pos;
-    d->pending_ext_huffman = PARTIAL_STATE_NONE;
 
     return TAMP_OK;
 }
 
 /**
  * @brief Decode RLE token and write repeated bytes to output.
+ *
+ * RLE format: huffman(count_high) + trailing_bits(count_low)
+ * rle_count = (count_high << 4) + count_low + 2
  */
 static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
                            size_t *output_written_size, uint16_t window_mask) {
@@ -121,8 +113,10 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
     uint16_t skip = d->skip_bytes;
 
     if (skip > 0) {
+        /* Resume from output-full: rle_count saved in pending_window_offset */
         rle_count = d->pending_window_offset;
     } else {
+        /* Fresh decode */
         uint16_t raw;
         tamp_res res = decode_huffman_trailing(d, TAMP_LEADING_RLE_BITS, &raw);
         if (res != TAMP_OK) return res;
@@ -139,7 +133,7 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
     uint16_t to_write;
 
     if (TAMP_UNLIKELY(remaining_count > output_space)) {
-        /* Partial write */
+        /* Partial write - save state for resume */
         to_write = output_space;
         d->skip_bytes = skip + output_space;
         d->pending_symbol = PENDING_RLE;
@@ -172,6 +166,14 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
 
 /**
  * @brief Decode extended match token and copy from window to output.
+ *
+ * NEW FORMAT: huffman(size_high) + trailing_bits(size_low) + window_offset
+ * match_size = (size_high << 3) + size_low + min_pattern_size + 12
+ *
+ * State machine:
+ * - Fresh: decode huffman+trailing, then window_offset
+ * - PENDING_EXT_NEED_OFFSET: have match_size, need window_offset
+ * - Output-full resume (skip > 0): have both match_size and window_offset
  */
 static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
                                       size_t *output_written_size, uint8_t conf_window, uint8_t min_pattern_size,
@@ -179,36 +181,36 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
     uint16_t window_offset;
     uint16_t match_size;
     uint16_t skip = d->skip_bytes;
-    uint8_t pending = d->pending_symbol;
 
     if (skip > 0) {
-        /* Resume from output-full: window_offset and match_size already saved */
+        /* Resume from output-full: both values saved */
         window_offset = d->pending_window_offset;
-        match_size = d->pending_ext_huffman;
+        match_size = d->pending_match_size;
+    } else if (d->pending_symbol == PENDING_EXT_NEED_OFFSET) {
+        /* Resume: have match_size, need window_offset */
+        match_size = d->pending_match_size;
+
+        if (TAMP_UNLIKELY(d->bit_buffer_pos < conf_window)) return TAMP_INPUT_EXHAUSTED;
+        window_offset = d->bit_buffer >> (32 - conf_window);
+        d->bit_buffer <<= conf_window;
+        d->bit_buffer_pos -= conf_window;
     } else {
-        /* Step 1: Get window_offset (saved or decode fresh) */
-        if (pending >= PENDING_EXT_HAVE_OFFSET) {
-            window_offset = d->pending_window_offset;
-        } else {
-            if (TAMP_UNLIKELY(d->bit_buffer_pos < conf_window)) return TAMP_INPUT_EXHAUSTED;
-            window_offset = d->bit_buffer >> (32 - conf_window);
-            d->bit_buffer <<= conf_window;
-            d->bit_buffer_pos -= conf_window;
-            /* Save window_offset in case huffman+trailing needs more input */
-            d->pending_window_offset = window_offset;
-        }
-
-        /* Step 2: Decode huffman + trailing bits */
+        /* Fresh decode: huffman+trailing first, then window_offset */
         uint16_t raw;
         tamp_res res = decode_huffman_trailing(d, TAMP_LEADING_EXTENDED_MATCH_BITS, &raw);
-        if (res != TAMP_OK) {
-            /* Update pending_symbol based on where we stopped */
-            d->pending_symbol =
-                (d->pending_ext_huffman != PARTIAL_STATE_NONE) ? PENDING_EXT_HAVE_HUFFMAN : PENDING_EXT_HAVE_OFFSET;
-            return res;
-        }
-
+        if (res != TAMP_OK) return res;
         match_size = raw + min_pattern_size + 12;
+
+        /* Now decode window_offset */
+        if (TAMP_UNLIKELY(d->bit_buffer_pos < conf_window)) {
+            /* Save match_size and return */
+            d->pending_symbol = PENDING_EXT_NEED_OFFSET;
+            d->pending_match_size = match_size;
+            return TAMP_INPUT_EXHAUSTED;
+        }
+        window_offset = d->bit_buffer >> (32 - conf_window);
+        d->bit_buffer <<= conf_window;
+        d->bit_buffer_pos -= conf_window;
     }
 
     /* Security check: validate window bounds */
@@ -224,12 +226,12 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
     uint16_t to_write;
 
     if (TAMP_UNLIKELY(remaining_count > output_space)) {
-        /* Partial write */
+        /* Partial write - save state for resume */
         to_write = output_space;
         d->skip_bytes = skip + output_space;
-        d->pending_symbol = PENDING_EXT_FRESH;
+        d->pending_symbol = PENDING_EXT_NEED_OFFSET; /* Reuse for output-full */
         d->pending_window_offset = window_offset;
-        d->pending_ext_huffman = match_size;
+        d->pending_match_size = match_size;
     } else {
         /* Complete write */
         to_write = remaining_count;
@@ -480,12 +482,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
 #if TAMP_V2_DECOMPRESS
             /* Check for v2 symbols */
             if (TAMP_UNLIKELY(v2_enabled && match_size >= TAMP_RLE_SYMBOL)) {
-                /* Commit bit buffer and set pending symbol before calling helper.
-                 * Initialize partial state fields to indicate fresh decode. */
+                /* Commit bit buffer before calling helper. */
                 decompressor->bit_buffer = bit_buffer;
                 decompressor->bit_buffer_pos = bit_buffer_pos;
-                decompressor->pending_window_offset = 0;
-                decompressor->pending_ext_huffman = PARTIAL_STATE_NONE;
 
                 tamp_res v2_res;
                 if (match_size == TAMP_RLE_SYMBOL) {
diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h
index a7c8a58e..df78c3d9 100644
--- a/tamp/_c_src/tamp/decompressor.h
+++ b/tamp/_c_src/tamp/decompressor.h
@@ -18,10 +18,9 @@ typedef struct {
     uint16_t window_pos;     // Current position in window (15 bits)
     uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits)
 #if TAMP_V2_DECOMPRESS
-    uint8_t pending_symbol;          // State machine: 0=none, 12=RLE, 13=ext match need window,
-                                     // 14=ext match need length, 15=ext match need raw bits
-    uint16_t pending_window_offset;  // Saved window_offset for extended match resume
-    uint8_t pending_ext_huffman;     // Saved ext_huffman for extended match resume (state 15)
+    uint8_t pending_symbol;          // State machine: 0=none, 12=RLE, 13=ext need offset, 14=ext fresh
+    uint16_t pending_window_offset;  // Saved window_offset for extended match output-full resume
+    uint16_t pending_match_size;     // Saved match_size for extended match resume
 #endif
 
     /* WARM: read once at start of decompress, cached in locals */
@@ -34,7 +33,7 @@ typedef struct {
 
     /* COLD: rarely accessed (init or edge cases).
      * Bitfields save space; add new cold fields here. */
-    uint8_t skip_bytes : 4;       // For output-buffer-limited resumption
+    uint8_t skip_bytes;           // For output-buffer-limited resumption (v2 needs >4 bits)
     uint8_t window_bits_max : 4;  // Max window bits buffer can hold
     uint8_t configured : 1;       // Whether config has been set
 #if TAMP_V2_DECOMPRESS
diff --git a/tamp/compressor.py b/tamp/compressor.py
index 7b4b2a79..906477cc 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -413,12 +413,13 @@ def _write_extended_match(self):
             self.extended_match_cb(
                 self._window_buffer.pos, self._extended_match_position, self._extended_match_count, string
             )
+        # Format: symbol, size (huffman+trailing), position
         bytes_written += self._bit_writer.write_huffman_and_literal_flag(_EXTENDED_MATCH_SYMBOL)
-        bytes_written += self._bit_writer.write(self._extended_match_position, self.window_bits)
         bytes_written += self._write_extended_huffman(
             self._extended_match_count - self.min_pattern_size - 11 - 1,
             _LEADING_EXTENDED_MATCH_HUFFMAN_BITS,
         )
+        bytes_written += self._bit_writer.write(self._extended_match_position, self.window_bits)
 
         self._window_buffer.write_from_self(self._extended_match_position, self._extended_match_count)
 
diff --git a/tamp/decompressor.py b/tamp/decompressor.py
index 947dd023..e095489e 100644
--- a/tamp/decompressor.py
+++ b/tamp/decompressor.py
@@ -260,11 +260,12 @@ def write_to_output(string):
                                     self._window_buffer.write_bytes(string[: min(rle_count, _RLE_MAX_WINDOW)])
                                 self._rle_last_written = True
                             elif match_size == _EXTENDED_MATCH_SYMBOL:
-                                index = self._bit_reader.read(self.window_bits)
+                                # Format: size (huffman+trailing), then position
                                 match_size = self._bit_reader.read_huffman()
                                 match_size <<= _LEADING_EXTENDED_MATCH_HUFFMAN_BITS
                                 match_size += self._bit_reader.read(_LEADING_EXTENDED_MATCH_HUFFMAN_BITS)
                                 match_size += self.min_pattern_size + 11 + 1
+                                index = self._bit_reader.read(self.window_bits)
 
                                 string = self._window_buffer.get(index, match_size)
 

From b377d18fcfa4b5d891b5121d34d8759b8f68c03f Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 14:47:40 -0500
Subject: [PATCH 009/109] more cleanup

---
 tamp/_c_src/tamp/decompressor.c | 39 +++++++++++++++++----------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 690fa554..39201c22 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -350,6 +350,21 @@ tamp_res tamp_decompressor_init(TampDecompressor *decompressor, const TampConf *
     return res;
 }
 
+/**
+ * @brief Refill bit buffer from input stream.
+ *
+ * Consumes bytes from input until bit_buffer has at least 25 bits or input is exhausted.
+ */
+static inline void refill_bit_buffer(TampDecompressor *d, const unsigned char **input, const unsigned char *input_end,
+                                     size_t *input_consumed_size) {
+    while (*input != input_end && d->bit_buffer_pos <= 24) {
+        d->bit_buffer_pos += 8;
+        d->bit_buffer |= (uint32_t) * (*input) << (32 - d->bit_buffer_pos);
+        (*input)++;
+        (*input_consumed_size)++;
+    }
+}
+
 tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigned char *output, size_t output_size,
                                          size_t *output_written_size, const unsigned char *input, size_t input_size,
                                          size_t *input_consumed_size, tamp_callback_t callback, void *user_data) {
@@ -390,23 +405,13 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
     const bool v2_enabled = decompressor->conf_v2;
 #endif
 
-/* Macro to refill bit buffer from input. Used before returning TAMP_INPUT_EXHAUSTED
- * to ensure we consume all available input first. */
-#define REFILL()                                                                               \
-    while (input != input_end && decompressor->bit_buffer_pos <= 24) {                         \
-        decompressor->bit_buffer_pos += 8;                                                     \
-        decompressor->bit_buffer |= (uint32_t) * input << (32 - decompressor->bit_buffer_pos); \
-        input++;                                                                               \
-        (*input_consumed_size)++;                                                              \
-    }
-
     while (input != input_end || decompressor->bit_buffer_pos
 #if TAMP_V2_DECOMPRESS
            || decompressor->pending_symbol
 #endif
     ) {
         // Populate the bit buffer
-        REFILL();
+        refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
 
 #if TAMP_V2_DECOMPRESS
         /* Resume pending v2 operation. Retry after refill if helper needs more bits. */
@@ -420,7 +425,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
                                                min_pattern_size, window_mask);
             }
             if (v2_res == TAMP_INPUT_EXHAUSTED) {
-                REFILL();
+                refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
                 if (input == input_end) return TAMP_INPUT_EXHAUSTED;
                 continue; /* Retry with refilled buffer */
             }
@@ -497,13 +502,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
                 } else {
                     return TAMP_ERROR; /* Invalid v2 symbol */
                 }
-                /* On success, helper clears pending_symbol; on error, it stays set for resume */
-                if (v2_res == TAMP_INPUT_EXHAUSTED) {
-                    REFILL();
-                    if (input == input_end) return TAMP_INPUT_EXHAUSTED;
-                    continue; /* Retry with refilled buffer */
-                }
-                if (v2_res != TAMP_OK) return v2_res;
+                /* On success, helper clears pending_symbol; on error, it stays set for resume.
+                 * TAMP_INPUT_EXHAUSTED is handled by resume path on next iteration. */
+                if (v2_res == TAMP_OUTPUT_FULL || v2_res < TAMP_OK) return v2_res;
                 continue;
             }
 #endif

From 667e3cfc0fd1f7353218a8f42312701195a0cf24 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 18:35:04 -0500
Subject: [PATCH 010/109] remove rle_last_written check; provides very small
 benefits, but unnecessarily bloats c-decompressor

---
 tamp/_c_src/tamp/decompressor.c | 32 +++++++++++---------------------
 tamp/_c_src/tamp/decompressor.h |  5 +----
 tamp/compressor.py              | 16 ++++------------
 tamp/decompressor.py            | 10 +++-------
 4 files changed, 19 insertions(+), 44 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 39201c22..f370cd8e 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -8,17 +8,13 @@
 #define FLUSH 15
 
 #if TAMP_V2_DECOMPRESS
-/* Pending symbol states for v2 decode suspend/resume.
- * - 0: No pending operation
- * - 12: RLE (fresh or resume based on skip_bytes)
- * - 13: Extended match - have match_size, need window_offset
- * - 14: Extended match - fresh decode (need both size and offset)
+/* Pending symbol states for v2 decode suspend/resume (2 bits).
  * When skip_bytes > 0, we're resuming after output-full with full decode saved.
  */
 #define PENDING_NONE 0
-#define PENDING_RLE 12
-#define PENDING_EXT_NEED_OFFSET 13
-#define PENDING_EXT_FRESH 14
+#define PENDING_RLE 1
+#define PENDING_EXT_NEED_OFFSET 2
+#define PENDING_EXT_FRESH 3
 #endif
 
 /**
@@ -151,15 +147,16 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
     }
     *output_written_size += to_write;
 
-    /* Update window only on first chunk (skip==0) and not after another RLE */
-    if (skip == 0 && !d->rle_last_written) {
-        uint16_t window_write = (rle_count < TAMP_RLE_MAX_WINDOW) ? rle_count : TAMP_RLE_MAX_WINDOW;
+    /* Update window only on first chunk (skip==0).
+     * Write up to TAMP_RLE_MAX_WINDOW or until end of buffer (no wrap). */
+    if (skip == 0) {
+        uint16_t remaining = (window_mask + 1) - d->window_pos;
+        uint16_t window_write = MIN(MIN(rle_count, TAMP_RLE_MAX_WINDOW), remaining);
         for (uint16_t i = 0; i < window_write; i++) {
-            d->window[d->window_pos] = symbol;
-            d->window_pos = (d->window_pos + 1) & window_mask;
+            d->window[d->window_pos++] = symbol;
         }
+        d->window_pos &= window_mask;
     }
-    d->rle_last_written = 1;
 
     return (d->pending_symbol == PENDING_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
 }
@@ -254,7 +251,6 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
             wp = (wp + 1) & window_mask;
         }
         d->window_pos = wp;
-        d->rle_last_written = 0;
     }
 
     return (d->pending_symbol == PENDING_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
@@ -455,9 +451,6 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
 
             output++;
             (*output_written_size)++;
-#if TAMP_V2_DECOMPRESS
-            decompressor->rle_last_written = 0;
-#endif
         } else {
             // is token; attempt a decode
             /* copy the bit buffers so that we can abort at any time */
@@ -554,9 +547,6 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
                 uint16_t wp = decompressor->window_pos;
                 window_copy(decompressor->window, &wp, window_offset, match_size, window_mask);
                 decompressor->window_pos = wp;
-#if TAMP_V2_DECOMPRESS
-                decompressor->rle_last_written = 0;
-#endif
             }
         }
         if (TAMP_UNLIKELY(callback && (res = callback(user_data, *output_written_size, input_size))))
diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h
index df78c3d9..8a3ff207 100644
--- a/tamp/_c_src/tamp/decompressor.h
+++ b/tamp/_c_src/tamp/decompressor.h
@@ -18,7 +18,7 @@ typedef struct {
     uint16_t window_pos;     // Current position in window (15 bits)
     uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits)
 #if TAMP_V2_DECOMPRESS
-    uint8_t pending_symbol;          // State machine: 0=none, 12=RLE, 13=ext need offset, 14=ext fresh
+    uint8_t pending_symbol : 2;      // State machine: 0=none, 1=RLE, 2=ext need offset, 3=ext fresh
     uint16_t pending_window_offset;  // Saved window_offset for extended match output-full resume
     uint16_t pending_match_size;     // Saved match_size for extended match resume
 #endif
@@ -36,9 +36,6 @@ typedef struct {
     uint8_t skip_bytes;           // For output-buffer-limited resumption (v2 needs >4 bits)
     uint8_t window_bits_max : 4;  // Max window bits buffer can hold
     uint8_t configured : 1;       // Whether config has been set
-#if TAMP_V2_DECOMPRESS
-    uint8_t rle_last_written : 1;  // Previous write was RLE (skip window re-write)
-#endif
 } TampDecompressor;
 
 /**
diff --git a/tamp/compressor.py b/tamp/compressor.py
index 906477cc..b2544abb 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -189,7 +189,6 @@ def __init__(
         self.v2: bool = v2
 
         self._rle_count = 0
-        self._rle_last_written = False  # The previous write was an RLE token
 
         # "+1" Because a RLE of 1 is not valid.
         self._rle_max_size = (13 << _LEADING_RLE_HUFFMAN_BITS) + (1 << _LEADING_RLE_HUFFMAN_BITS) + 1
@@ -368,7 +367,6 @@ def _compress_input_buffer_single(self) -> int:
             else:
                 bytes_written += self._write_match(search_i, match)
 
-            self._rle_last_written = False
             for _ in range(match_size):
                 self._input_buffer.popleft()
         else:
@@ -438,7 +436,6 @@ def _write_literal(self, literal) -> int:
 
         bytes_written += self._bit_writer.write(literal | self.literal_flag, self.literal_bits + 1)
         self._window_buffer.write_byte(literal)
-        self._rle_last_written = False
         return bytes_written
 
     def _write_match(self, search_i, match) -> int:
@@ -456,7 +453,6 @@ def _write_match(self, search_i, match) -> int:
         bytes_written += self._bit_writer.write_huffman_and_literal_flag(match_size - self.min_pattern_size)
         bytes_written += self._bit_writer.write(search_i, self.window_bits)
         self._window_buffer.write_bytes(match)
-        self._rle_last_written = False
         return bytes_written
 
     def _write_rle(self) -> int:
@@ -474,12 +470,10 @@ def _write_rle(self) -> int:
             bytes_written += self._bit_writer.write_huffman_and_literal_flag(_RLE_SYMBOL)
             bytes_written += self._write_extended_huffman(self._rle_count - 2, _LEADING_RLE_HUFFMAN_BITS)
 
-            if not self._rle_last_written:
-                # Only write up to 8 bytes, and only if we didn't already do this.
-                # This prevents filling up the window buffer with unhelpful data.
-                self._window_buffer.write_bytes(bytes([last_written_byte]) * min(self._rle_count, _RLE_MAX_WINDOW))
-
-            self._rle_last_written = True
+            # Write up to 8 bytes (or until end of buffer) to the window.
+            remaining = self._window_buffer.size - self._window_buffer.pos
+            window_write = min(self._rle_count, _RLE_MAX_WINDOW, remaining)
+            self._window_buffer.write_bytes(bytes([last_written_byte]) * window_write)
 
         self._rle_count = 0
         return bytes_written
@@ -546,8 +540,6 @@ def flush(self, write_token: bool = True) -> int:
 
         bytes_written_flush = self._bit_writer.flush(write_token=write_token)
         bytes_written += bytes_written_flush
-        if bytes_written_flush:
-            self._rle_last_written = False
         return bytes_written
 
     def close(self) -> int:
diff --git a/tamp/decompressor.py b/tamp/decompressor.py
index e095489e..38ff50a4 100644
--- a/tamp/decompressor.py
+++ b/tamp/decompressor.py
@@ -181,7 +181,6 @@ def __init__(self, f, *, dictionary: Optional[bytearray] = None):
         uses_custom_dictionary = self._bit_reader.read(1)
         self.v2 = self._bit_reader.read(1)
         more_header_bytes = self._bit_reader.read(1)
-        self._rle_last_written = False
 
         if more_header_bytes:
             raise NotImplementedError
@@ -242,7 +241,6 @@ def write_to_output(string):
                     if is_literal:
                         string = bytes([self._bit_reader.read(self.literal_bits)])
                         self._window_buffer.write_bytes(string)
-                        self._rle_last_written = False
                     else:
                         match_size = self._bit_reader.read_huffman()
                         if match_size is _FLUSH:
@@ -256,9 +254,9 @@ def write_to_output(string):
                                 rle_count += 1 + 1
                                 symbol = self._window_buffer.last_written_byte
                                 string = bytes([symbol]) * rle_count
-                                if not self._rle_last_written:
-                                    self._window_buffer.write_bytes(string[: min(rle_count, _RLE_MAX_WINDOW)])
-                                self._rle_last_written = True
+                                remaining = self._window_buffer.size - self._window_buffer.pos
+                                window_write = min(rle_count, _RLE_MAX_WINDOW, remaining)
+                                self._window_buffer.write_bytes(string[:window_write])
                             elif match_size == _EXTENDED_MATCH_SYMBOL:
                                 # Format: size (huffman+trailing), then position
                                 match_size = self._bit_reader.read_huffman()
@@ -270,7 +268,6 @@ def write_to_output(string):
                                 string = self._window_buffer.get(index, match_size)
 
                                 self._window_buffer.write_bytes(string)
-                                self._rle_last_written = False
                             else:
                                 raise ValueError("unreachable")
                         else:
@@ -279,7 +276,6 @@ def write_to_output(string):
 
                             string = self._window_buffer.get(index, match_size)
                             self._window_buffer.write_bytes(string)
-                            self._rle_last_written = False
 
                     if not write_to_output(string):
                         break

From aea413db85de19337bfd3944f2ed5db965959f44 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 20:01:39 -0500
Subject: [PATCH 011/109] rename pending_symbol to token_state.

---
 tamp/_c_src/tamp/decompressor.c | 46 ++++++++++++++++-----------------
 tamp/_c_src/tamp/decompressor.h |  2 +-
 2 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index f370cd8e..17f1d02c 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -8,13 +8,11 @@
 #define FLUSH 15
 
 #if TAMP_V2_DECOMPRESS
-/* Pending symbol states for v2 decode suspend/resume (2 bits).
- * When skip_bytes > 0, we're resuming after output-full with full decode saved.
- */
-#define PENDING_NONE 0
-#define PENDING_RLE 1
-#define PENDING_EXT_NEED_OFFSET 2
-#define PENDING_EXT_FRESH 3
+/* Token state for v2 decode suspend/resume (2 bits). */
+#define TOKEN_NONE 0
+#define TOKEN_RLE 1
+#define TOKEN_EXT_MATCH 2
+#define TOKEN_EXT_MATCH_FRESH 3
 #endif
 
 /**
@@ -43,7 +41,7 @@ static const uint8_t HUFFMAN_TABLE[128] = {
  *
  * @returns Decoded match_size
  */
-static inline int8_t huffman_decode(uint32_t *bit_buffer, uint8_t *bit_buffer_pos) {
+static int8_t huffman_decode(uint32_t *bit_buffer, uint8_t *bit_buffer_pos) {
     uint8_t code;
     uint8_t bit_len;
 
@@ -132,13 +130,13 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
         /* Partial write - save state for resume */
         to_write = output_space;
         d->skip_bytes = skip + output_space;
-        d->pending_symbol = PENDING_RLE;
+        d->token_state = TOKEN_RLE;
         d->pending_window_offset = rle_count;
     } else {
         /* Complete write */
         to_write = remaining_count;
         d->skip_bytes = 0;
-        d->pending_symbol = PENDING_NONE;
+        d->token_state = TOKEN_NONE;
     }
 
     /* Write repeated bytes to output */
@@ -158,7 +156,7 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
         d->window_pos &= window_mask;
     }
 
-    return (d->pending_symbol == PENDING_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
+    return (d->token_state == TOKEN_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
 }
 
 /**
@@ -169,7 +167,7 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
  *
  * State machine:
  * - Fresh: decode huffman+trailing, then window_offset
- * - PENDING_EXT_NEED_OFFSET: have match_size, need window_offset
+ * - TOKEN_EXT_MATCH: have match_size, need window_offset
  * - Output-full resume (skip > 0): have both match_size and window_offset
  */
 static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
@@ -183,7 +181,7 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
         /* Resume from output-full: both values saved */
         window_offset = d->pending_window_offset;
         match_size = d->pending_match_size;
-    } else if (d->pending_symbol == PENDING_EXT_NEED_OFFSET) {
+    } else if (d->token_state == TOKEN_EXT_MATCH) {
         /* Resume: have match_size, need window_offset */
         match_size = d->pending_match_size;
 
@@ -201,7 +199,7 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
         /* Now decode window_offset */
         if (TAMP_UNLIKELY(d->bit_buffer_pos < conf_window)) {
             /* Save match_size and return */
-            d->pending_symbol = PENDING_EXT_NEED_OFFSET;
+            d->token_state = TOKEN_EXT_MATCH;
             d->pending_match_size = match_size;
             return TAMP_INPUT_EXHAUSTED;
         }
@@ -226,14 +224,14 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
         /* Partial write - save state for resume */
         to_write = output_space;
         d->skip_bytes = skip + output_space;
-        d->pending_symbol = PENDING_EXT_NEED_OFFSET; /* Reuse for output-full */
+        d->token_state = TOKEN_EXT_MATCH; /* Reuse for output-full */
         d->pending_window_offset = window_offset;
         d->pending_match_size = match_size;
     } else {
         /* Complete write */
         to_write = remaining_count;
         d->skip_bytes = 0;
-        d->pending_symbol = PENDING_NONE;
+        d->token_state = TOKEN_NONE;
     }
 
     /* Copy from window to output */
@@ -244,7 +242,7 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
     *output_written_size += to_write;
 
     /* Update window only on complete decode */
-    if (d->pending_symbol == PENDING_NONE) {
+    if (d->token_state == TOKEN_NONE) {
         uint16_t wp = d->window_pos;
         for (uint16_t i = 0; i < match_size; i++) {
             d->window[wp] = d->window[(window_offset + i) & window_mask];
@@ -253,7 +251,7 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
         d->window_pos = wp;
     }
 
-    return (d->pending_symbol == PENDING_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
+    return (d->token_state == TOKEN_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
 }
 #endif /* TAMP_V2_DECOMPRESS */
 
@@ -403,7 +401,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
 
     while (input != input_end || decompressor->bit_buffer_pos
 #if TAMP_V2_DECOMPRESS
-           || decompressor->pending_symbol
+           || decompressor->token_state
 #endif
     ) {
         // Populate the bit buffer
@@ -411,10 +409,10 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
 
 #if TAMP_V2_DECOMPRESS
         /* Resume pending v2 operation. Retry after refill if helper needs more bits. */
-        if (TAMP_UNLIKELY(decompressor->pending_symbol)) {
+        if (TAMP_UNLIKELY(decompressor->token_state)) {
             if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL;
             tamp_res v2_res;
-            if (decompressor->pending_symbol == PENDING_RLE) {
+            if (decompressor->token_state == TOKEN_RLE) {
                 v2_res = decode_rle(decompressor, &output, output_end, output_written_size, window_mask);
             } else {
                 v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size, conf_window,
@@ -486,16 +484,16 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
 
                 tamp_res v2_res;
                 if (match_size == TAMP_RLE_SYMBOL) {
-                    decompressor->pending_symbol = PENDING_RLE;
+                    decompressor->token_state = TOKEN_RLE;
                     v2_res = decode_rle(decompressor, &output, output_end, output_written_size, window_mask);
                 } else if (match_size == TAMP_EXTENDED_MATCH_SYMBOL) {
-                    decompressor->pending_symbol = PENDING_EXT_FRESH;
+                    decompressor->token_state = TOKEN_EXT_MATCH_FRESH;
                     v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size, conf_window,
                                                    min_pattern_size, window_mask);
                 } else {
                     return TAMP_ERROR; /* Invalid v2 symbol */
                 }
-                /* On success, helper clears pending_symbol; on error, it stays set for resume.
+                /* On success, helper clears token_state; on error, it stays set for resume.
                  * TAMP_INPUT_EXHAUSTED is handled by resume path on next iteration. */
                 if (v2_res == TAMP_OUTPUT_FULL || v2_res < TAMP_OK) return v2_res;
                 continue;
diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h
index 8a3ff207..8aae31be 100644
--- a/tamp/_c_src/tamp/decompressor.h
+++ b/tamp/_c_src/tamp/decompressor.h
@@ -18,7 +18,7 @@ typedef struct {
     uint16_t window_pos;     // Current position in window (15 bits)
     uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits)
 #if TAMP_V2_DECOMPRESS
-    uint8_t pending_symbol : 2;      // State machine: 0=none, 1=RLE, 2=ext need offset, 3=ext fresh
+    uint8_t token_state : 2;         // 0=none, 1=RLE, 2=ext match, 3=ext match fresh
     uint16_t pending_window_offset;  // Saved window_offset for extended match output-full resume
     uint16_t pending_match_size;     // Saved match_size for extended match resume
 #endif

From 184627098b5bc1723017fa6a2596963d6670171c Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 20:11:44 -0500
Subject: [PATCH 012/109] Make decompression 1% slower to save 200 bytes in
 firmware

---
 tamp/_c_src/tamp/decompressor.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 17f1d02c..0d98c221 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -102,7 +102,8 @@ static tamp_res decode_huffman_trailing(TampDecompressor *d, uint8_t trailing_bi
  * rle_count = (count_high << 4) + count_low + 2
  */
 static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
-                           size_t *output_written_size, uint16_t window_mask) {
+                           size_t *output_written_size) {
+    const uint16_t window_mask = (1u << d->conf_window) - 1;
     uint16_t rle_count;
     uint16_t skip = d->skip_bytes;
 
@@ -171,8 +172,9 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
  * - Output-full resume (skip > 0): have both match_size and window_offset
  */
 static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
-                                      size_t *output_written_size, uint8_t conf_window, uint8_t min_pattern_size,
-                                      uint16_t window_mask) {
+                                      size_t *output_written_size) {
+    const uint8_t conf_window = d->conf_window;
+    const uint16_t window_mask = (1u << conf_window) - 1;
     uint16_t window_offset;
     uint16_t match_size;
     uint16_t skip = d->skip_bytes;
@@ -194,7 +196,7 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
         uint16_t raw;
         tamp_res res = decode_huffman_trailing(d, TAMP_LEADING_EXTENDED_MATCH_BITS, &raw);
         if (res != TAMP_OK) return res;
-        match_size = raw + min_pattern_size + 12;
+        match_size = raw + d->min_pattern_size + 12;
 
         /* Now decode window_offset */
         if (TAMP_UNLIKELY(d->bit_buffer_pos < conf_window)) {
@@ -413,10 +415,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
             if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL;
             tamp_res v2_res;
             if (decompressor->token_state == TOKEN_RLE) {
-                v2_res = decode_rle(decompressor, &output, output_end, output_written_size, window_mask);
+                v2_res = decode_rle(decompressor, &output, output_end, output_written_size);
             } else {
-                v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size, conf_window,
-                                               min_pattern_size, window_mask);
+                v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size);
             }
             if (v2_res == TAMP_INPUT_EXHAUSTED) {
                 refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
@@ -485,11 +486,10 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
                 tamp_res v2_res;
                 if (match_size == TAMP_RLE_SYMBOL) {
                     decompressor->token_state = TOKEN_RLE;
-                    v2_res = decode_rle(decompressor, &output, output_end, output_written_size, window_mask);
+                    v2_res = decode_rle(decompressor, &output, output_end, output_written_size);
                 } else if (match_size == TAMP_EXTENDED_MATCH_SYMBOL) {
                     decompressor->token_state = TOKEN_EXT_MATCH_FRESH;
-                    v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size, conf_window,
-                                                   min_pattern_size, window_mask);
+                    v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size);
                 } else {
                     return TAMP_ERROR; /* Invalid v2 symbol */
                 }

From ce3bd9442cb894cac1e9301cab8374244dd3da09 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 20:14:52 -0500
Subject: [PATCH 013/109] narrow variable scope

---
 tamp/_c_src/tamp/decompressor.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 0d98c221..9e5bcca0 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -103,7 +103,6 @@ static tamp_res decode_huffman_trailing(TampDecompressor *d, uint8_t trailing_bi
  */
 static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
                            size_t *output_written_size) {
-    const uint16_t window_mask = (1u << d->conf_window) - 1;
     uint16_t rle_count;
     uint16_t skip = d->skip_bytes;
 
@@ -119,7 +118,7 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
     }
 
     /* Get the byte to repeat (last written byte) */
-    uint16_t prev_pos = (d->window_pos == 0) ? window_mask : (d->window_pos - 1);
+    uint16_t prev_pos = (d->window_pos - 1) & ((1u << d->conf_window) - 1);
     uint8_t symbol = d->window[prev_pos];
 
     /* Calculate how many to write this call */
@@ -149,12 +148,13 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
     /* Update window only on first chunk (skip==0).
      * Write up to TAMP_RLE_MAX_WINDOW or until end of buffer (no wrap). */
     if (skip == 0) {
-        uint16_t remaining = (window_mask + 1) - d->window_pos;
+        const uint16_t window_size = 1u << d->conf_window;
+        uint16_t remaining = window_size - d->window_pos;
         uint16_t window_write = MIN(MIN(rle_count, TAMP_RLE_MAX_WINDOW), remaining);
         for (uint16_t i = 0; i < window_write; i++) {
             d->window[d->window_pos++] = symbol;
         }
-        d->window_pos &= window_mask;
+        d->window_pos &= (window_size - 1);
     }
 
     return (d->token_state == TOKEN_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
@@ -174,7 +174,6 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
 static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
                                       size_t *output_written_size) {
     const uint8_t conf_window = d->conf_window;
-    const uint16_t window_mask = (1u << conf_window) - 1;
     uint16_t window_offset;
     uint16_t match_size;
     uint16_t skip = d->skip_bytes;
@@ -245,6 +244,7 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
 
     /* Update window only on complete decode */
     if (d->token_state == TOKEN_NONE) {
+        const uint16_t window_mask = (1u << conf_window) - 1;
         uint16_t wp = d->window_pos;
         for (uint16_t i = 0; i < match_size; i++) {
             d->window[wp] = d->window[(window_offset + i) & window_mask];

From 34ad99cd476a1e05b7b8362f8a2d3a61b9352a40 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 21:45:04 -0500
Subject: [PATCH 014/109] remove extended-match wrapping logic.

---
 tamp/_c_src/tamp/decompressor.c | 13 +++++++------
 tamp/compressor.py              | 10 +++++++---
 tamp/decompressor.py            |  5 ++++-
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 9e5bcca0..39d14e1a 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -242,15 +242,16 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
     }
     *output_written_size += to_write;
 
-    /* Update window only on complete decode */
+    /* Update window only on complete decode.
+     * Write up to end of buffer (no wrap), mask wp only at the end. */
     if (d->token_state == TOKEN_NONE) {
-        const uint16_t window_mask = (1u << conf_window) - 1;
+        uint16_t remaining = window_size - d->window_pos;
+        uint16_t window_write = (match_size < remaining) ? match_size : remaining;
         uint16_t wp = d->window_pos;
-        for (uint16_t i = 0; i < match_size; i++) {
-            d->window[wp] = d->window[(window_offset + i) & window_mask];
-            wp = (wp + 1) & window_mask;
+        for (uint16_t i = 0; i < window_write; i++) {
+            d->window[wp++] = d->window[window_offset + i];
         }
-        d->window_pos = wp;
+        d->window_pos = wp & (window_size - 1);
     }
 
     return (d->token_state == TOKEN_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
diff --git a/tamp/compressor.py b/tamp/compressor.py
index b2544abb..5194515d 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -125,9 +125,13 @@ def index(self, pattern, start):
         return result
 
     def write_from_self(self, position, size):
-        data = [self.buffer[(position + i) % self.size] for i in range(size)]
-        for x in data:
-            self.write_byte(x)
+        # Write up to end of buffer (no wrap)
+        remaining = self.size - self.pos
+        window_write = min(size, remaining)
+        for i in range(window_write):
+            self.buffer[self.pos] = self.buffer[position + i]
+            self.pos += 1
+        self.pos %= self.size
 
     def get(self, index, size):
         out = bytearray(size)
diff --git a/tamp/decompressor.py b/tamp/decompressor.py
index 38ff50a4..a61ad5d5 100644
--- a/tamp/decompressor.py
+++ b/tamp/decompressor.py
@@ -267,7 +267,10 @@ def write_to_output(string):
 
                                 string = self._window_buffer.get(index, match_size)
 
-                                self._window_buffer.write_bytes(string)
+                                # Write up to end of buffer (no wrap)
+                                remaining = self._window_buffer.size - self._window_buffer.pos
+                                window_write = min(match_size, remaining)
+                                self._window_buffer.write_bytes(string[:window_write])
                             else:
                                 raise ValueError("unreachable")
                         else:

From 4cdb50fb2dca56b0ae9d66b61735e5a2b3882057 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 22:03:45 -0500
Subject: [PATCH 015/109] move TAMP_OUTPUT_FULL logic to top of loop

---
 tamp/_c_src/tamp/decompressor.c | 5 ++---
 tamp/_c_src/tamp/decompressor.h | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 39d14e1a..af87e2d8 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -407,13 +407,14 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
            || decompressor->token_state
 #endif
     ) {
+        if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL;
+
         // Populate the bit buffer
         refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
 
 #if TAMP_V2_DECOMPRESS
         /* Resume pending v2 operation. Retry after refill if helper needs more bits. */
         if (TAMP_UNLIKELY(decompressor->token_state)) {
-            if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL;
             tamp_res v2_res;
             if (decompressor->token_state == TOKEN_RLE) {
                 v2_res = decode_rle(decompressor, &output, output_end, output_written_size);
@@ -432,8 +433,6 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
 
         if (TAMP_UNLIKELY(decompressor->bit_buffer_pos == 0)) return TAMP_INPUT_EXHAUSTED;
 
-        if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL;
-
         // Hint that patterns are more likely than literals
         if (TAMP_UNLIKELY(decompressor->bit_buffer >> 31)) {
             // is literal
diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h
index 8aae31be..d12c98a9 100644
--- a/tamp/_c_src/tamp/decompressor.h
+++ b/tamp/_c_src/tamp/decompressor.h
@@ -11,12 +11,11 @@ extern "C" {
  * Fields are ordered by access frequency for cache efficiency.
  */
 typedef struct {
-    /* HOT: accessed every iteration of the decompression loop.
-     * Full-width types avoid bitfield access overhead. */
+    /* HOT: accessed every iteration of the decompression loop. */
     unsigned char *window;   // Pointer to window buffer
     uint32_t bit_buffer;     // Bit buffer for reading compressed data (32 bits)
     uint16_t window_pos;     // Current position in window (15 bits)
-    uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits)
+    uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits needed)
 #if TAMP_V2_DECOMPRESS
     uint8_t token_state : 2;         // 0=none, 1=RLE, 2=ext match, 3=ext match fresh
     uint16_t pending_window_offset;  // Saved window_offset for extended match output-full resume

From 70fd7398d90ef73cb5cb9c668e2fc6ffb0ae4286 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 22:11:58 -0500
Subject: [PATCH 016/109] further reduce binary size by 56 bytes via a goto.

---
 tamp/_c_src/tamp/decompressor.c | 60 +++++++++++++++------------------
 1 file changed, 27 insertions(+), 33 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index af87e2d8..d3565128 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -41,7 +41,7 @@ static const uint8_t HUFFMAN_TABLE[128] = {
  *
  * @returns Decoded match_size
  */
-static int8_t huffman_decode(uint32_t *bit_buffer, uint8_t *bit_buffer_pos) {
+static int8_t huffman_decode(uint32_t* bit_buffer, uint8_t* bit_buffer_pos) {
     uint8_t code;
     uint8_t bit_len;
 
@@ -71,7 +71,7 @@ static int8_t huffman_decode(uint32_t *bit_buffer, uint8_t *bit_buffer_pos) {
  * @param result Output: (huffman << trailing_bits) + trailing
  * @return TAMP_OK on success, TAMP_INPUT_EXHAUSTED if more bits needed
  */
-static tamp_res decode_huffman_trailing(TampDecompressor *d, uint8_t trailing_bits, uint16_t *result) {
+static tamp_res decode_huffman_trailing(TampDecompressor* d, uint8_t trailing_bits, uint16_t* result) {
     uint32_t bit_buffer = d->bit_buffer;
     uint8_t bit_buffer_pos = d->bit_buffer_pos;
 
@@ -101,8 +101,8 @@ static tamp_res decode_huffman_trailing(TampDecompressor *d, uint8_t trailing_bi
  * RLE format: huffman(count_high) + trailing_bits(count_low)
  * rle_count = (count_high << 4) + count_low + 2
  */
-static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
-                           size_t *output_written_size) {
+static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const unsigned char* output_end,
+                           size_t* output_written_size) {
     uint16_t rle_count;
     uint16_t skip = d->skip_bytes;
 
@@ -171,8 +171,8 @@ static tamp_res decode_rle(TampDecompressor *d, unsigned char **output, const un
  * - TOKEN_EXT_MATCH: have match_size, need window_offset
  * - Output-full resume (skip > 0): have both match_size and window_offset
  */
-static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **output, const unsigned char *output_end,
-                                      size_t *output_written_size) {
+static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** output, const unsigned char* output_end,
+                                      size_t* output_written_size) {
     const uint8_t conf_window = d->conf_window;
     uint16_t window_offset;
     uint16_t match_size;
@@ -265,7 +265,7 @@ static tamp_res decode_extended_match(TampDecompressor *d, unsigned char **outpu
  * using a temporary buffer when necessary. Overlap occurs when the
  * destination would "catch up" to the source during copying.
  */
-static inline void window_copy(unsigned char *window, uint16_t *window_pos, uint16_t window_offset, uint8_t match_size,
+static inline void window_copy(unsigned char* window, uint16_t* window_pos, uint16_t window_offset, uint8_t match_size,
                                uint16_t window_mask) {
     const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask;
     const bool overlap = (src_to_dst < match_size) && (src_to_dst > 0);
@@ -287,8 +287,8 @@ static inline void window_copy(unsigned char *window, uint16_t *window_pos, uint
     }
 }
 
-tamp_res tamp_decompressor_read_header(TampConf *conf, const unsigned char *input, size_t input_size,
-                                       size_t *input_consumed_size) {
+tamp_res tamp_decompressor_read_header(TampConf* conf, const unsigned char* input, size_t input_size,
+                                       size_t* input_consumed_size) {
     if (input_consumed_size) (*input_consumed_size) = 0;
     if (input_size == 0) return TAMP_INPUT_EXHAUSTED;
     if (input[0] & 0x1) return TAMP_INVALID_CONF;  // Currently only a single header byte is supported.
@@ -307,7 +307,7 @@ tamp_res tamp_decompressor_read_header(TampConf *conf, const unsigned char *inpu
  *   * window
  *   * window_bits_max
  */
-static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor *decompressor, uint8_t conf_window,
+static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor* decompressor, uint8_t conf_window,
                                                      uint8_t conf_literal, uint8_t conf_use_custom_dictionary,
                                                      uint8_t conf_v2) {
     if (conf_window < 8 || conf_window > 15) return TAMP_INVALID_CONF;
@@ -328,7 +328,7 @@ static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor *decompres
     return TAMP_OK;
 }
 
-tamp_res tamp_decompressor_init(TampDecompressor *decompressor, const TampConf *conf, unsigned char *window,
+tamp_res tamp_decompressor_init(TampDecompressor* decompressor, const TampConf* conf, unsigned char* window,
                                 uint8_t window_bits) {
     tamp_res res = TAMP_OK;
 
@@ -336,7 +336,7 @@ tamp_res tamp_decompressor_init(TampDecompressor *decompressor, const TampConf *
     if (window_bits < 8 || window_bits > 15) return TAMP_INVALID_CONF;
 
     for (uint8_t i = 0; i < sizeof(TampDecompressor); i++)  // Zero-out the struct
-        ((unsigned char *)decompressor)[i] = 0;
+        ((unsigned char*)decompressor)[i] = 0;
     decompressor->window = window;
     decompressor->window_bits_max = window_bits;
     if (conf) {
@@ -352,8 +352,8 @@ tamp_res tamp_decompressor_init(TampDecompressor *decompressor, const TampConf *
  *
  * Consumes bytes from input until bit_buffer has at least 25 bits or input is exhausted.
  */
-static inline void refill_bit_buffer(TampDecompressor *d, const unsigned char **input, const unsigned char *input_end,
-                                     size_t *input_consumed_size) {
+static inline void refill_bit_buffer(TampDecompressor* d, const unsigned char** input, const unsigned char* input_end,
+                                     size_t* input_consumed_size) {
     while (*input != input_end && d->bit_buffer_pos <= 24) {
         d->bit_buffer_pos += 8;
         d->bit_buffer |= (uint32_t) * (*input) << (32 - d->bit_buffer_pos);
@@ -362,14 +362,14 @@ static inline void refill_bit_buffer(TampDecompressor *d, const unsigned char **
     }
 }
 
-tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigned char *output, size_t output_size,
-                                         size_t *output_written_size, const unsigned char *input, size_t input_size,
-                                         size_t *input_consumed_size, tamp_callback_t callback, void *user_data) {
+tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigned char* output, size_t output_size,
+                                         size_t* output_written_size, const unsigned char* input, size_t input_size,
+                                         size_t* input_consumed_size, tamp_callback_t callback, void* user_data) {
     size_t input_consumed_size_proxy;
     size_t output_written_size_proxy;
     tamp_res res;
-    const unsigned char *input_end = input + input_size;
-    const unsigned char *output_end = output + output_size;
+    const unsigned char* input_end = input + input_size;
+    const unsigned char* output_end = output + output_size;
 
     if (!output_written_size) output_written_size = &output_written_size_proxy;
     if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy;
@@ -413,8 +413,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
         refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
 
 #if TAMP_V2_DECOMPRESS
-        /* Resume pending v2 operation. Retry after refill if helper needs more bits. */
+        /* Handle v2 tokens - either resuming or fresh from match_size detection below. */
         if (TAMP_UNLIKELY(decompressor->token_state)) {
+        v2_dispatch:;
             tamp_res v2_res;
             if (decompressor->token_state == TOKEN_RLE) {
                 v2_res = decode_rle(decompressor, &output, output_end, output_written_size);
@@ -424,7 +425,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
             if (v2_res == TAMP_INPUT_EXHAUSTED) {
                 refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
                 if (input == input_end) return TAMP_INPUT_EXHAUSTED;
-                continue; /* Retry with refilled buffer */
+                continue;
             }
             if (v2_res != TAMP_OK) return v2_res;
             continue;
@@ -479,24 +480,17 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
 #if TAMP_V2_DECOMPRESS
             /* Check for v2 symbols */
             if (TAMP_UNLIKELY(v2_enabled && match_size >= TAMP_RLE_SYMBOL)) {
-                /* Commit bit buffer before calling helper. */
                 decompressor->bit_buffer = bit_buffer;
                 decompressor->bit_buffer_pos = bit_buffer_pos;
 
-                tamp_res v2_res;
                 if (match_size == TAMP_RLE_SYMBOL) {
                     decompressor->token_state = TOKEN_RLE;
-                    v2_res = decode_rle(decompressor, &output, output_end, output_written_size);
                 } else if (match_size == TAMP_EXTENDED_MATCH_SYMBOL) {
                     decompressor->token_state = TOKEN_EXT_MATCH_FRESH;
-                    v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size);
                 } else {
-                    return TAMP_ERROR; /* Invalid v2 symbol */
+                    return TAMP_ERROR;
                 }
-                /* On success, helper clears token_state; on error, it stays set for resume.
-                 * TAMP_INPUT_EXHAUSTED is handled by resume path on next iteration. */
-                if (v2_res == TAMP_OUTPUT_FULL || v2_res < TAMP_OK) return v2_res;
-                continue;
+                goto v2_dispatch;
             }
 #endif
 
@@ -555,9 +549,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne
 
 #if TAMP_STREAM
 
-tamp_res tamp_decompress_stream(TampDecompressor *decompressor, tamp_read_t read_cb, void *read_handle,
-                                tamp_write_t write_cb, void *write_handle, size_t *input_consumed_size,
-                                size_t *output_written_size, tamp_callback_t callback, void *user_data) {
+tamp_res tamp_decompress_stream(TampDecompressor* decompressor, tamp_read_t read_cb, void* read_handle,
+                                tamp_write_t write_cb, void* write_handle, size_t* input_consumed_size,
+                                size_t* output_written_size, tamp_callback_t callback, void* user_data) {
     size_t input_consumed_size_proxy, output_written_size_proxy;
     if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy;
     if (!output_written_size) output_written_size = &output_written_size_proxy;

From 6b59265ee7edbab297c4663c8c0bc067fbfe06c0 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 22:17:03 -0500
Subject: [PATCH 017/109] use some math instead of if/else

---
 tamp/_c_src/tamp/decompressor.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index d3565128..c2688b4e 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -8,11 +8,15 @@
 #define FLUSH 15
 
 #if TAMP_V2_DECOMPRESS
-/* Token state for v2 decode suspend/resume (2 bits). */
+/* Token state for v2 decode suspend/resume (2 bits).
+ * TOKEN_RLE and TOKEN_EXT_MATCH_FRESH are arranged so that:
+ *     token_state = match_size - (TAMP_RLE_SYMBOL - 1)
+ * maps TAMP_RLE_SYMBOL (12) -> 1 and TAMP_EXTENDED_MATCH_SYMBOL (13) -> 2.
+ */
 #define TOKEN_NONE 0
 #define TOKEN_RLE 1
-#define TOKEN_EXT_MATCH 2
-#define TOKEN_EXT_MATCH_FRESH 3
+#define TOKEN_EXT_MATCH_FRESH 2
+#define TOKEN_EXT_MATCH 3 /* Resume: have match_size, need window_offset */
 #endif
 
 /**
@@ -478,18 +482,12 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
             }
 
 #if TAMP_V2_DECOMPRESS
-            /* Check for v2 symbols */
+            /* Check for v2 symbols (RLE=12, extended match=13).
+             * Convert match_size to token_state via subtraction (see TOKEN_* defines). */
             if (TAMP_UNLIKELY(v2_enabled && match_size >= TAMP_RLE_SYMBOL)) {
                 decompressor->bit_buffer = bit_buffer;
                 decompressor->bit_buffer_pos = bit_buffer_pos;
-
-                if (match_size == TAMP_RLE_SYMBOL) {
-                    decompressor->token_state = TOKEN_RLE;
-                } else if (match_size == TAMP_EXTENDED_MATCH_SYMBOL) {
-                    decompressor->token_state = TOKEN_EXT_MATCH_FRESH;
-                } else {
-                    return TAMP_ERROR;
-                }
+                decompressor->token_state = match_size - (TAMP_RLE_SYMBOL - 1);
                 goto v2_dispatch;
             }
 #endif

From 59e40909ab136e958a5068c6ae1badb12b1e0935 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 22:18:00 -0500
Subject: [PATCH 018/109] no need for v2_res

---
 tamp/_c_src/tamp/decompressor.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index c2688b4e..0772943a 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -419,19 +419,18 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
 #if TAMP_V2_DECOMPRESS
         /* Handle v2 tokens - either resuming or fresh from match_size detection below. */
         if (TAMP_UNLIKELY(decompressor->token_state)) {
-        v2_dispatch:;
-            tamp_res v2_res;
+        v2_dispatch:
             if (decompressor->token_state == TOKEN_RLE) {
-                v2_res = decode_rle(decompressor, &output, output_end, output_written_size);
+                res = decode_rle(decompressor, &output, output_end, output_written_size);
             } else {
-                v2_res = decode_extended_match(decompressor, &output, output_end, output_written_size);
+                res = decode_extended_match(decompressor, &output, output_end, output_written_size);
             }
-            if (v2_res == TAMP_INPUT_EXHAUSTED) {
+            if (res == TAMP_INPUT_EXHAUSTED) {
                 refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
                 if (input == input_end) return TAMP_INPUT_EXHAUSTED;
                 continue;
             }
-            if (v2_res != TAMP_OK) return v2_res;
+            if (res != TAMP_OK) return res;
             continue;
         }
 #endif

From f59fd08c1110d4c10becfec1d312dd95594aa43b Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 22:25:11 -0500
Subject: [PATCH 019/109] simplify while-loop check with a union. reduces
 binary by 56 bytes

---
 tamp/_c_src/tamp/decompressor.c |  6 +-----
 tamp/_c_src/tamp/decompressor.h | 24 +++++++++++++++++++-----
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 0772943a..d156739e 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -406,11 +406,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
     const bool v2_enabled = decompressor->conf_v2;
 #endif
 
-    while (input != input_end || decompressor->bit_buffer_pos
-#if TAMP_V2_DECOMPRESS
-           || decompressor->token_state
-#endif
-    ) {
+    while (input != input_end || decompressor->pos_and_state) {
         if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL;
 
         // Populate the bit buffer
diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h
index d12c98a9..8e333ba8 100644
--- a/tamp/_c_src/tamp/decompressor.h
+++ b/tamp/_c_src/tamp/decompressor.h
@@ -12,12 +12,26 @@ extern "C" {
  */
 typedef struct {
     /* HOT: accessed every iteration of the decompression loop. */
-    unsigned char *window;   // Pointer to window buffer
-    uint32_t bit_buffer;     // Bit buffer for reading compressed data (32 bits)
-    uint16_t window_pos;     // Current position in window (15 bits)
-    uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits needed)
+    unsigned char *window;  // Pointer to window buffer
+    uint32_t bit_buffer;    // Bit buffer for reading compressed data (32 bits)
+    uint16_t window_pos;    // Current position in window (15 bits)
+
+    /* Union allows single zero-check in main loop instead of two separate checks. */
+#if TAMP_V2_DECOMPRESS
+    union {
+        struct {
+            uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits needed)
+            uint8_t token_state;     // 0=none, 1=RLE, 2=ext match fresh, 3=ext match (2 bits used)
+        };
+        uint16_t pos_and_state;  // Combined for fast 16-bit zero-check
+    };
+#else
+    union {
+        uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits needed)
+        uint8_t pos_and_state;   // Alias for consistent access in main loop
+    };
+#endif
 #if TAMP_V2_DECOMPRESS
-    uint8_t token_state : 2;         // 0=none, 1=RLE, 2=ext match, 3=ext match fresh
     uint16_t pending_window_offset;  // Saved window_offset for extended match output-full resume
     uint16_t pending_match_size;     // Saved match_size for extended match resume
 #endif

From a925f62b82b10df977cf9575d8a1538391f218c4 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 22:42:37 -0500
Subject: [PATCH 020/109] unified huffman decode.

---
 tamp/_c_src/tamp/decompressor.c | 113 +++++++++++++++-----------------
 1 file changed, 54 insertions(+), 59 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index d156739e..f50c0c33 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -37,68 +37,52 @@ static const uint8_t HUFFMAN_TABLE[128] = {
     17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  17,  17,  17,  17, 17, 17,  17,  17,  17};
 
 /**
- * @brief Decode a huffman match-size symbol from the decompressor's bit_buffer.
+ * @brief Decode huffman symbol + optional trailing bits from bit buffer.
  *
- * Internally updates bit_buffer and bit_buffer_pos.
+ * Modifies bit_buffer and bit_buffer_pos in place. Caller is responsible
+ * for committing to decompressor state if needed.
  *
- * bit_buffer MUST have at least 8 bits prior to calling.
- *
- * @returns Decoded match_size
- */
-static int8_t huffman_decode(uint32_t* bit_buffer, uint8_t* bit_buffer_pos) {
-    uint8_t code;
-    uint8_t bit_len;
-
-    (*bit_buffer_pos)--;
-    code = *bit_buffer >> 31;
-    *bit_buffer <<= 1;
-    if (TAMP_LIKELY(code == 0)) return 0;
-
-    code = *bit_buffer >> (32 - 7);
-    code = HUFFMAN_TABLE[code];
-    bit_len = code >> 4;
-    *bit_buffer <<= bit_len;
-    (*bit_buffer_pos) -= bit_len;
-
-    return code & 0xF;
-}
-
-#if TAMP_V2_DECOMPRESS
-/**
- * @brief Decode huffman symbol + trailing bits from bit buffer.
- *
- * Simple helper that decodes from local copies. On failure, decompressor
- * state is not modified. Caller is responsible for state management.
- *
- * @param d Decompressor state
- * @param trailing_bits Number of trailing bits to read (3 or 4)
+ * @param bit_buffer Pointer to bit buffer (modified in place)
+ * @param bit_buffer_pos Pointer to bit position (modified in place)
+ * @param trailing_bits Number of trailing bits to read (0, 3, or 4)
  * @param result Output: (huffman << trailing_bits) + trailing
  * @return TAMP_OK on success, TAMP_INPUT_EXHAUSTED if more bits needed
  */
-static tamp_res decode_huffman_trailing(TampDecompressor* d, uint8_t trailing_bits, uint16_t* result) {
-    uint32_t bit_buffer = d->bit_buffer;
-    uint8_t bit_buffer_pos = d->bit_buffer_pos;
-
+static tamp_res decode_huffman(uint32_t* bit_buffer, uint8_t* bit_buffer_pos, uint8_t trailing_bits, uint16_t* result) {
     /* Need at least 1 bit for huffman, plus trailing bits */
-    if (TAMP_UNLIKELY(bit_buffer_pos < 1 + trailing_bits)) return TAMP_INPUT_EXHAUSTED;
-
-    int8_t huffman_value = huffman_decode(&bit_buffer, &bit_buffer_pos);
+    if (TAMP_UNLIKELY(*bit_buffer_pos < 1 + trailing_bits)) return TAMP_INPUT_EXHAUSTED;
 
-    if (TAMP_UNLIKELY(bit_buffer_pos < trailing_bits)) return TAMP_INPUT_EXHAUSTED;
-
-    uint8_t trailing = bit_buffer >> (32 - trailing_bits);
-    bit_buffer <<= trailing_bits;
-    bit_buffer_pos -= trailing_bits;
-
-    *result = (huffman_value << trailing_bits) + trailing;
+    /* Decode huffman symbol */
+    int8_t huffman_value;
+    (*bit_buffer_pos)--;
+    if (TAMP_LIKELY((*bit_buffer >> 31) == 0)) {
+        *bit_buffer <<= 1;
+        huffman_value = 0;
+    } else {
+        *bit_buffer <<= 1;
+        uint8_t code = HUFFMAN_TABLE[*bit_buffer >> (32 - 7)];
+        uint8_t bit_len = code >> 4;
+        if (TAMP_UNLIKELY(*bit_buffer_pos < bit_len + trailing_bits)) return TAMP_INPUT_EXHAUSTED;
+        *bit_buffer <<= bit_len;
+        *bit_buffer_pos -= bit_len;
+        huffman_value = code & 0xF;
+    }
 
-    /* Commit only on success */
-    d->bit_buffer = bit_buffer;
-    d->bit_buffer_pos = bit_buffer_pos;
+    /* Read trailing bits (skip if trailing_bits==0 to avoid undefined shift) */
+    if (trailing_bits) {
+        uint8_t trailing = *bit_buffer >> (32 - trailing_bits);
+        *bit_buffer <<= trailing_bits;
+        *bit_buffer_pos -= trailing_bits;
+        *result = (huffman_value << trailing_bits) + trailing;
+    } else {
+        *result = huffman_value;
+    }
 
     return TAMP_OK;
 }
 
+#if TAMP_V2_DECOMPRESS
+
 /**
  * @brief Decode RLE token and write repeated bytes to output.
  *
@@ -115,9 +99,13 @@ static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const un
         rle_count = d->pending_window_offset;
     } else {
         /* Fresh decode */
+        uint32_t bit_buffer = d->bit_buffer;
+        uint8_t bit_buffer_pos = d->bit_buffer_pos;
         uint16_t raw;
-        tamp_res res = decode_huffman_trailing(d, TAMP_LEADING_RLE_BITS, &raw);
+        tamp_res res = decode_huffman(&bit_buffer, &bit_buffer_pos, TAMP_LEADING_RLE_BITS, &raw);
         if (res != TAMP_OK) return res;
+        d->bit_buffer = bit_buffer;
+        d->bit_buffer_pos = bit_buffer_pos;
         rle_count = raw + 2;
     }
 
@@ -196,21 +184,27 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
         d->bit_buffer_pos -= conf_window;
     } else {
         /* Fresh decode: huffman+trailing first, then window_offset */
+        uint32_t bit_buffer = d->bit_buffer;
+        uint8_t bit_buffer_pos = d->bit_buffer_pos;
         uint16_t raw;
-        tamp_res res = decode_huffman_trailing(d, TAMP_LEADING_EXTENDED_MATCH_BITS, &raw);
+        tamp_res res = decode_huffman(&bit_buffer, &bit_buffer_pos, TAMP_LEADING_EXTENDED_MATCH_BITS, &raw);
         if (res != TAMP_OK) return res;
         match_size = raw + d->min_pattern_size + 12;
 
         /* Now decode window_offset */
-        if (TAMP_UNLIKELY(d->bit_buffer_pos < conf_window)) {
+        if (TAMP_UNLIKELY(bit_buffer_pos < conf_window)) {
             /* Save match_size and return */
+            d->bit_buffer = bit_buffer;
+            d->bit_buffer_pos = bit_buffer_pos;
             d->token_state = TOKEN_EXT_MATCH;
             d->pending_match_size = match_size;
             return TAMP_INPUT_EXHAUSTED;
         }
-        window_offset = d->bit_buffer >> (32 - conf_window);
-        d->bit_buffer <<= conf_window;
-        d->bit_buffer_pos -= conf_window;
+        window_offset = bit_buffer >> (32 - conf_window);
+        bit_buffer <<= conf_window;
+        bit_buffer_pos -= conf_window;
+        d->bit_buffer = bit_buffer;
+        d->bit_buffer_pos = bit_buffer_pos;
     }
 
     /* Security check: validate window bounds */
@@ -464,10 +458,11 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
             bit_buffer <<= 1;
             bit_buffer_pos--;
 
-            // There must be at least 8 bits, otherwise no possible decoding.
-            if (TAMP_UNLIKELY(bit_buffer_pos < 8)) return TAMP_INPUT_EXHAUSTED;
+            uint16_t match_size_u16;
+            if (decode_huffman(&bit_buffer, &bit_buffer_pos, 0, &match_size_u16) != TAMP_OK)
+                return TAMP_INPUT_EXHAUSTED;
+            match_size = match_size_u16;
 
-            match_size = huffman_decode(&bit_buffer, &bit_buffer_pos);
             if (TAMP_UNLIKELY(match_size == FLUSH)) {
                 // flush bit_buffer to the nearest byte and skip the remainder of decoding
                 decompressor->bit_buffer = bit_buffer << (bit_buffer_pos & 7);

From e69b826db026d667668e29944259cfb5a878b597 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 22:51:16 -0500
Subject: [PATCH 021/109] Add comment about HUFFMAN_TABLE being pretty
 optimized.

---
 tamp/_c_src/tamp/decompressor.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index f50c0c33..6defe074 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -20,14 +20,11 @@
 #endif
 
 /**
- * This array was generated with tools/huffman_jump_table.py
+ * Huffman lookup table indexed by 7 bits (after first "1" bit consumed).
+ * Upper 4 bits = additional bits to consume, lower 4 bits = symbol (15 = FLUSH).
  *
- * The idea is that the resulting code is smaller/faster as a lookup table than a bunch of if/else
- * statements.
- *
- * Of each element:
- *  * The upper 4 bits express the number of bits to decode.
- *  * The lower 4 bits express the decoded value, with FLUSH being represented as 0b1111
+ * Note: A 64-byte table with special-cased symbol 1 was tried but was ~10% slower
+ * and only saved 8 bytes in final firmware due to added branch logic.
  */
 static const uint8_t HUFFMAN_TABLE[128] = {
     50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,  50,  85,  85,  85, 85, 122, 123, 104, 104, 86, 86,
@@ -56,9 +53,11 @@ static tamp_res decode_huffman(uint32_t* bit_buffer, uint8_t* bit_buffer_pos, ui
     int8_t huffman_value;
     (*bit_buffer_pos)--;
     if (TAMP_LIKELY((*bit_buffer >> 31) == 0)) {
+        /* Symbol 0: code "0" */
         *bit_buffer <<= 1;
         huffman_value = 0;
     } else {
+        /* All other symbols: use 128-entry table indexed by next 7 bits */
         *bit_buffer <<= 1;
         uint8_t code = HUFFMAN_TABLE[*bit_buffer >> (32 - 7)];
         uint8_t bit_len = code >> 4;

From 589a9a2cb854933a80f66879043be9819576202a Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 23:01:51 -0500
Subject: [PATCH 022/109] Make some datatypes smaller; reduces binary by 36
 bytes.

---
 tamp/_c_src/tamp/decompressor.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 6defe074..2c43c08e 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -42,10 +42,10 @@ static const uint8_t HUFFMAN_TABLE[128] = {
  * @param bit_buffer Pointer to bit buffer (modified in place)
  * @param bit_buffer_pos Pointer to bit position (modified in place)
  * @param trailing_bits Number of trailing bits to read (0, 3, or 4)
- * @param result Output: (huffman << trailing_bits) + trailing
+ * @param result Output: (huffman << trailing_bits) + trailing (max 223 for trailing_bits=4)
  * @return TAMP_OK on success, TAMP_INPUT_EXHAUSTED if more bits needed
  */
-static tamp_res decode_huffman(uint32_t* bit_buffer, uint8_t* bit_buffer_pos, uint8_t trailing_bits, uint16_t* result) {
+static tamp_res decode_huffman(uint32_t* bit_buffer, uint8_t* bit_buffer_pos, uint8_t trailing_bits, uint8_t* result) {
     /* Need at least 1 bit for huffman, plus trailing bits */
     if (TAMP_UNLIKELY(*bit_buffer_pos < 1 + trailing_bits)) return TAMP_INPUT_EXHAUSTED;
 
@@ -90,8 +90,8 @@ static tamp_res decode_huffman(uint32_t* bit_buffer, uint8_t* bit_buffer_pos, ui
  */
 static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const unsigned char* output_end,
                            size_t* output_written_size) {
-    uint16_t rle_count;
-    uint16_t skip = d->skip_bytes;
+    uint8_t rle_count; /* max 225: (13 << 4) + 15 + 2 */
+    uint8_t skip = d->skip_bytes;
 
     if (skip > 0) {
         /* Resume from output-full: rle_count saved in pending_window_offset */
@@ -100,7 +100,7 @@ static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const un
         /* Fresh decode */
         uint32_t bit_buffer = d->bit_buffer;
         uint8_t bit_buffer_pos = d->bit_buffer_pos;
-        uint16_t raw;
+        uint8_t raw;
         tamp_res res = decode_huffman(&bit_buffer, &bit_buffer_pos, TAMP_LEADING_RLE_BITS, &raw);
         if (res != TAMP_OK) return res;
         d->bit_buffer = bit_buffer;
@@ -113,14 +113,14 @@ static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const un
     uint8_t symbol = d->window[prev_pos];
 
     /* Calculate how many to write this call */
-    uint16_t remaining_count = rle_count - skip;
+    uint8_t remaining_count = rle_count - skip;
     size_t output_space = output_end - *output;
-    uint16_t to_write;
+    uint8_t to_write;
 
     if (TAMP_UNLIKELY(remaining_count > output_space)) {
         /* Partial write - save state for resume */
         to_write = output_space;
-        d->skip_bytes = skip + output_space;
+        d->skip_bytes = skip + to_write;
         d->token_state = TOKEN_RLE;
         d->pending_window_offset = rle_count;
     } else {
@@ -185,7 +185,7 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
         /* Fresh decode: huffman+trailing first, then window_offset */
         uint32_t bit_buffer = d->bit_buffer;
         uint8_t bit_buffer_pos = d->bit_buffer_pos;
-        uint16_t raw;
+        uint8_t raw;
         tamp_res res = decode_huffman(&bit_buffer, &bit_buffer_pos, TAMP_LEADING_EXTENDED_MATCH_BITS, &raw);
         if (res != TAMP_OK) return res;
         match_size = raw + d->min_pattern_size + 12;
@@ -457,10 +457,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
             bit_buffer <<= 1;
             bit_buffer_pos--;
 
-            uint16_t match_size_u16;
-            if (decode_huffman(&bit_buffer, &bit_buffer_pos, 0, &match_size_u16) != TAMP_OK)
-                return TAMP_INPUT_EXHAUSTED;
-            match_size = match_size_u16;
+            uint8_t match_size_u8;
+            if (decode_huffman(&bit_buffer, &bit_buffer_pos, 0, &match_size_u8) != TAMP_OK) return TAMP_INPUT_EXHAUSTED;
+            match_size = match_size_u8;
 
             if (TAMP_UNLIKELY(match_size == FLUSH)) {
                 // flush bit_buffer to the nearest byte and skip the remainder of decoding

From e081f53506856c15d24b567682f1fc1c72d9aacb Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 24 Jan 2026 23:15:34 -0500
Subject: [PATCH 023/109] reduce some dtypes to uint8

---
 tamp/_c_src/tamp/decompressor.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 2c43c08e..b245895e 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -131,7 +131,7 @@ static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const un
     }
 
     /* Write repeated bytes to output */
-    for (uint16_t i = 0; i < to_write; i++) {
+    for (uint8_t i = 0; i < to_write; i++) {
         *(*output)++ = symbol;
     }
     *output_written_size += to_write;
@@ -141,8 +141,8 @@ static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const un
     if (skip == 0) {
         const uint16_t window_size = 1u << d->conf_window;
         uint16_t remaining = window_size - d->window_pos;
-        uint16_t window_write = MIN(MIN(rle_count, TAMP_RLE_MAX_WINDOW), remaining);
-        for (uint16_t i = 0; i < window_write; i++) {
+        uint8_t window_write = MIN(MIN(rle_count, TAMP_RLE_MAX_WINDOW), remaining); /* max 8 */
+        for (uint8_t i = 0; i < window_write; i++) {
             d->window[d->window_pos++] = symbol;
         }
         d->window_pos &= (window_size - 1);
@@ -166,8 +166,8 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
                                       size_t* output_written_size) {
     const uint8_t conf_window = d->conf_window;
     uint16_t window_offset;
-    uint16_t match_size;
-    uint16_t skip = d->skip_bytes;
+    uint8_t match_size; /* max 126: (13<<3)+7 + 3 + 12 */
+    uint8_t skip = d->skip_bytes;
 
     if (skip > 0) {
         /* Resume from output-full: both values saved */
@@ -214,9 +214,9 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
     }
 
     /* Calculate how many to write this call */
-    uint16_t remaining_count = match_size - skip;
+    uint8_t remaining_count = match_size - skip;
     size_t output_space = output_end - *output;
-    uint16_t to_write;
+    uint8_t to_write;
 
     if (TAMP_UNLIKELY(remaining_count > output_space)) {
         /* Partial write - save state for resume */
@@ -234,7 +234,7 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
 
     /* Copy from window to output */
     uint16_t src_offset = window_offset + skip;
-    for (uint16_t i = 0; i < to_write; i++) {
+    for (uint8_t i = 0; i < to_write; i++) {
         *(*output)++ = d->window[src_offset + i];
     }
     *output_written_size += to_write;
@@ -243,9 +243,9 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
      * Write up to end of buffer (no wrap), mask wp only at the end. */
     if (d->token_state == TOKEN_NONE) {
         uint16_t remaining = window_size - d->window_pos;
-        uint16_t window_write = (match_size < remaining) ? match_size : remaining;
+        uint8_t window_write = (match_size < remaining) ? match_size : remaining; /* max 126 */
         uint16_t wp = d->window_pos;
-        for (uint16_t i = 0; i < window_write; i++) {
+        for (uint8_t i = 0; i < window_write; i++) {
             d->window[wp++] = d->window[window_offset + i];
         }
         d->window_pos = wp & (window_size - 1);

From 66bdc09c65ad6983dc381c8ffc1ce76b7ce86f0a Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sun, 25 Jan 2026 11:52:26 -0500
Subject: [PATCH 024/109] prep cython bindings for c-compressor-v2

---
 tamp/_c_compressor.pyx | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tamp/_c_compressor.pyx b/tamp/_c_compressor.pyx
index ae690257..8655d085 100644
--- a/tamp/_c_compressor.pyx
+++ b/tamp/_c_compressor.pyx
@@ -35,13 +35,10 @@ cdef class Compressor:
         int literal=8,
         dictionary=None,
         bool lazy_matching=False,
-        bool v2=False,
+        bool v2=True,
     ):
         cdef ctamp.TampConf conf
 
-        if v2:
-            raise NotImplementedError("v2 compression not yet supported in C compressor. Use --v1 flag or --implementation python.")
-
         if dictionary and bit_size(len(dictionary) - 1) != window:
             raise ValueError("Dictionary-window size mismatch.")
 
@@ -59,6 +56,7 @@ cdef class Compressor:
         # Set lazy_matching - this field is conditionally compiled based on TAMP_LAZY_MATCHING
         # The build system defines this macro, so the field should be available
         conf.lazy_matching = lazy_matching
+        conf.v2 = v2
 
         self._window_buffer = dictionary if dictionary else bytearray(1 << window)
         self._window_buffer_ptr = <unsigned char *>self._window_buffer

From bfe963162f881cc2b1754d3b3837ea90e70b6376 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sun, 25 Jan 2026 21:41:10 -0500
Subject: [PATCH 025/109] don't wrap extended match

---
 tamp/compressor.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tamp/compressor.py b/tamp/compressor.py
index 5194515d..c3e880f8 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -128,8 +128,10 @@ def write_from_self(self, position, size):
         # Write up to end of buffer (no wrap)
         remaining = self.size - self.pos
         window_write = min(size, remaining)
-        for i in range(window_write):
-            self.buffer[self.pos] = self.buffer[position + i]
+        # Read source data first to avoid corruption when source and destination ranges overlap
+        data = self.get(position, window_write)
+        for byte in data:
+            self.buffer[self.pos] = byte
             self.pos += 1
         self.pos %= self.size
 

From 37c360813b3a20f6a97353c2eb00f54a32d5a9b7 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 26 Jan 2026 08:53:59 -0500
Subject: [PATCH 026/109] more robust window_copy

---
 tamp/_c_src/tamp/decompressor.c | 61 +++++++++++++++------------------
 1 file changed, 27 insertions(+), 34 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index b245895e..35d2c8a3 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -7,6 +7,30 @@
 
 #define FLUSH 15
 
+/**
+ * @brief Copy pattern from window to window, updating window_pos.
+ *
+ * Handles potential overlap between source and destination regions by
+ * copying backwards when the destination would "catch up" to the source.
+ */
+TAMP_NOINLINE static void window_copy(unsigned char* window, uint16_t* window_pos, uint16_t window_offset,
+                                      uint8_t match_size, uint16_t window_mask) {
+    const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask;
+
+    if (TAMP_UNLIKELY(src_to_dst < match_size && src_to_dst > 0)) {
+        /* Overlap with dst > src: copy backwards to avoid corruption. */
+        for (uint8_t i = match_size; i-- > 0;) {
+            window[(*window_pos + i) & window_mask] = window[window_offset + i];
+        }
+        *window_pos = (*window_pos + match_size) & window_mask;
+    } else {
+        for (uint8_t i = 0; i < match_size; i++) {
+            window[*window_pos] = window[window_offset + i];
+            *window_pos = (*window_pos + 1) & window_mask;
+        }
+    }
+}
+
 #if TAMP_V2_DECOMPRESS
 /* Token state for v2 decode suspend/resume (2 bits).
  * TOKEN_RLE and TOKEN_EXT_MATCH_FRESH are arranged so that:
@@ -240,50 +264,19 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
     *output_written_size += to_write;
 
     /* Update window only on complete decode.
-     * Write up to end of buffer (no wrap), mask wp only at the end. */
+     * Write up to end of buffer (no wrap). */
     if (d->token_state == TOKEN_NONE) {
         uint16_t remaining = window_size - d->window_pos;
         uint8_t window_write = (match_size < remaining) ? match_size : remaining; /* max 126 */
         uint16_t wp = d->window_pos;
-        for (uint8_t i = 0; i < window_write; i++) {
-            d->window[wp++] = d->window[window_offset + i];
-        }
-        d->window_pos = wp & (window_size - 1);
+        window_copy(d->window, &wp, window_offset, window_write, window_size - 1);
+        d->window_pos = wp;
     }
 
     return (d->token_state == TOKEN_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
 }
 #endif /* TAMP_V2_DECOMPRESS */
 
-/**
- * @brief Copy pattern from window to window, updating window_pos.
- *
- * Handles potential overlap between source and destination regions by
- * using a temporary buffer when necessary. Overlap occurs when the
- * destination would "catch up" to the source during copying.
- */
-static inline void window_copy(unsigned char* window, uint16_t* window_pos, uint16_t window_offset, uint8_t match_size,
-                               uint16_t window_mask) {
-    const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask;
-    const bool overlap = (src_to_dst < match_size) && (src_to_dst > 0);
-
-    if (TAMP_UNLIKELY(overlap)) {
-        uint8_t tmp_buf[16];
-        for (uint8_t i = 0; i < match_size; i++) {
-            tmp_buf[i] = window[window_offset + i];
-        }
-        for (uint8_t i = 0; i < match_size; i++) {
-            window[*window_pos] = tmp_buf[i];
-            *window_pos = (*window_pos + 1) & window_mask;
-        }
-    } else {
-        for (uint8_t i = 0; i < match_size; i++) {
-            window[*window_pos] = window[window_offset + i];
-            *window_pos = (*window_pos + 1) & window_mask;
-        }
-    }
-}
-
 tamp_res tamp_decompressor_read_header(TampConf* conf, const unsigned char* input, size_t input_size,
                                        size_t* input_consumed_size) {
     if (input_consumed_size) (*input_consumed_size) = 0;

From 19baefb30e021f751f09f48186a2f13c81badba5 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sat, 31 Jan 2026 22:10:22 -0500
Subject: [PATCH 027/109] simplify rle criteria

---
 tamp/compressor.py | 148 +++++++++++++++++++++++++--------------------
 1 file changed, 82 insertions(+), 66 deletions(-)

diff --git a/tamp/compressor.py b/tamp/compressor.py
index c3e880f8..b5e7c392 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -33,25 +33,6 @@
 _LEADING_RLE_HUFFMAN_BITS = 4
 
 
-def _determine_rle_breakeven_point(min_pattern_size, window_bits):
-    # Determines if a pattern-match would be shorter than a RLE match.
-    # See how many bits this encoding would be with RLE
-    rle_length_bits = {}
-    for i in range(min_pattern_size, min_pattern_size + 11 + 1):
-        rle_length_bits[i] = 8 + _LEADING_RLE_HUFFMAN_BITS + _huffman_bits[(i - 1) >> _LEADING_RLE_HUFFMAN_BITS]
-
-    pattern_length_bits = {}
-    for i in range(min_pattern_size, min_pattern_size + 11 + 1):
-        pattern_length_bits[i] = _huffman_bits[i - min_pattern_size] + window_bits
-
-    breakeven_point = 0
-    for pattern_size in sorted(pattern_length_bits):
-        if pattern_length_bits[pattern_size] < rle_length_bits[pattern_size]:
-            breakeven_point = pattern_size
-
-    return breakeven_point
-
-
 class _BitWriter:
     """Writes bits to a stream."""
 
@@ -133,7 +114,8 @@ def write_from_self(self, position, size):
         for byte in data:
             self.buffer[self.pos] = byte
             self.pos += 1
-        self.pos %= self.size
+        if self.pos == self.size:
+            self.pos = 0
 
     def get(self, index, size):
         out = bytearray(size)
@@ -198,7 +180,6 @@ def __init__(
 
         # "+1" Because a RLE of 1 is not valid.
         self._rle_max_size = (13 << _LEADING_RLE_HUFFMAN_BITS) + (1 << _LEADING_RLE_HUFFMAN_BITS) + 1
-        self._rle_breakeven = _determine_rle_breakeven_point(self.min_pattern_size, self.window_bits)
 
         self._extended_match_count = 0
         self._extended_match_position = 0
@@ -290,57 +271,92 @@ def _compress_input_buffer_single(self) -> int:
             # We ran out of input_buffer, return so caller can re-populate the input_buffer
             return bytes_written
 
-        target = bytes(self._input_buffer)
-        search_i = 0
-        match_size = 1
+        # RLE handling with persistent state (v2 only)
+        # Accumulate RLE count across compression cycles for better compression of long runs
+        have_match_from_rle = False  # Track if we already did pattern matching in RLE section
 
         if self.v2:
-            # RLE same-character-counting logic
-            while (
-                target and target[0] == self._window_buffer.last_written_byte and self._rle_count < self._rle_max_size
-            ):
-                self._rle_count += 1
-                self._input_buffer.popleft()
-                target = bytes(self._input_buffer)
-            if not target and self._rle_count != self._rle_max_size:
-                # Need more input to see if the RLE continues
+            last_byte = self._window_buffer.last_written_byte
+
+            # Count additional matching bytes in current buffer
+            new_rle_bytes = 0
+            for byte in self._input_buffer:
+                if byte == last_byte and self._rle_count + new_rle_bytes < self._rle_max_size:
+                    new_rle_bytes += 1
+                else:
+                    break
+
+            # If we consumed whole buffer and haven't hit max, keep accumulating
+            if new_rle_bytes == len(self._input_buffer) and self._rle_count + new_rle_bytes < self._rle_max_size:
+                # Consume these bytes and wait for more data
+                for _ in range(new_rle_bytes):
+                    self._input_buffer.popleft()
+                self._rle_count += new_rle_bytes
                 return bytes_written
-            if self._rle_count == 1:
-                # This is not RLE; attempt to pattern-match or just write literals.
-                self._input_buffer.appendleft(self._window_buffer.last_written_byte)
-                target = bytes(self._input_buffer)
-                self._rle_count = 0
-            elif self._rle_count:
-                if self._rle_count > self._rle_breakeven:
-                    # It's certainly better to do a RLE write than searching for a pattern.
-                    bytes_written += self._write_rle()
+
+            # RLE run has ended or hit max - decide what to encode
+            total_rle_count = self._rle_count + new_rle_bytes
+
+            if total_rle_count >= 2:
+                # Build search target: accumulated RLE + new RLE + rest of buffer
+                # This allows pattern matching to find longer sequences
+                target = bytes([last_byte]) * total_rle_count + bytes(list(self._input_buffer)[new_rle_bytes:])
+
+                # Do pattern search
+                if self.lazy_matching and self._cached_match_index >= 0:
+                    search_i = self._cached_match_index
+                    match_size = self._cached_match_size
+                    match = self._window_buffer.get(search_i, match_size)
+                    self._cached_match_index = -1
+                else:
+                    search_i, match = self._search(target, start=0)
+                    match_size = len(match)
+
+                have_match_from_rle = True
+
+                # Simple decision: if pattern match is longer, use it; otherwise use RLE
+                if match_size >= self.min_pattern_size and match_size > total_rle_count:
+                    # Pattern match wins
+                    # Consume bytes from input_buffer (only the bytes actually in the buffer)
+                    # Note: first _rle_count bytes were already consumed in previous calls
+                    bytes_to_consume = match_size - self._rle_count
+                    for _ in range(bytes_to_consume):
+                        self._input_buffer.popleft()
+                    self._rle_count = 0
+
+                    # Write the pattern match immediately and return
+                    # (Don't continue to normal flow which would try to consume bytes again)
+                    if self.v2 and match_size > (self.min_pattern_size + 11):
+                        self._extended_match_position = search_i
+                        self._extended_match_count = match_size
+                        bytes_written += self._write_extended_match()
+                    else:
+                        bytes_written += self._write_match(search_i, match)
                     return bytes_written
                 else:
-                    # We'll see if pattern-matching offers a better encoding.
-                    target = bytes([self._window_buffer.last_written_byte]) * self._rle_count
-
-        # Check if we have a cached match from lazy matching
-        if self.lazy_matching and self._cached_match_index >= 0:
-            search_i = self._cached_match_index
-            match_size = self._cached_match_size
-            match = self._window_buffer.get(search_i, match_size)
-            self._cached_match_index = -1  # Clear cache after using
-        else:
-            # Perform normal pattern-matching
-            search_i, match = self._search(target, start=0)
-            match_size = len(match)
-
-        if self._rle_count:
-            # Check to see if the found pattern-match is more efficient than the RLE encoding.
-            assert self._rle_count >= 2  # noqa: S101
-            if match_size >= self._rle_count:
-                # Pattern is better than RLE
-                bytes_written += self._write_match(search_i, match)
+                    # RLE wins - commit RLE
+                    for _ in range(new_rle_bytes):
+                        self._input_buffer.popleft()
+                    self._rle_count = total_rle_count
+                    bytes_written += self._write_rle()
+                    return bytes_written
+            elif self._rle_count == 1:
+                # Single byte isn't worth RLE encoding
                 self._rle_count = 0
-                return bytes_written
+                # Fall through to normal pattern matching
+
+        # Normal pattern matching (when no RLE or RLE was abandoned for pattern)
+        if not have_match_from_rle:
+            target = bytes(self._input_buffer)
+
+            if self.lazy_matching and self._cached_match_index >= 0:
+                search_i = self._cached_match_index
+                match_size = self._cached_match_size
+                match = self._window_buffer.get(search_i, match_size)
+                self._cached_match_index = -1
             else:
-                # RLE is better than pattern
-                return self._write_rle()
+                search_i, match = self._search(target, start=0)
+                match_size = len(match)
 
         # Lazy matching logic
         if (
@@ -476,7 +492,7 @@ def _write_rle(self) -> int:
             bytes_written += self._bit_writer.write_huffman_and_literal_flag(_RLE_SYMBOL)
             bytes_written += self._write_extended_huffman(self._rle_count - 2, _LEADING_RLE_HUFFMAN_BITS)
 
-            # Write up to 8 bytes (or until end of buffer) to the window.
+            # Write up to 8 bytes to the window (up to end of buffer, no wrap).
             remaining = self._window_buffer.size - self._window_buffer.pos
             window_write = min(self._rle_count, _RLE_MAX_WINDOW, remaining)
             self._window_buffer.write_bytes(bytes([last_written_byte]) * window_write)

From 5ae02824b7c631ff8d3e8c4541f23d918c543693 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sun, 1 Feb 2026 10:39:26 -0500
Subject: [PATCH 028/109] simplify window_copy call

---
 tamp/_c_src/tamp/decompressor.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 35d2c8a3..d4b61f7a 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -263,13 +263,10 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
     }
     *output_written_size += to_write;
 
-    /* Update window only on complete decode.
-     * Write up to end of buffer (no wrap). */
+    /* Update window only on complete decode. */
     if (d->token_state == TOKEN_NONE) {
-        uint16_t remaining = window_size - d->window_pos;
-        uint8_t window_write = (match_size < remaining) ? match_size : remaining; /* max 126 */
         uint16_t wp = d->window_pos;
-        window_copy(d->window, &wp, window_offset, window_write, window_size - 1);
+        window_copy(d->window, &wp, window_offset, match_size, window_size - 1);
         d->window_pos = wp;
     }
 

From 983532c35c59a2f6d7c5ec9d9ef3406d153a12b7 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sun, 1 Feb 2026 11:10:01 -0500
Subject: [PATCH 029/109] move window_copy to a better location.

---
 tamp/_c_src/tamp/decompressor.c | 48 ++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index d4b61f7a..d419df11 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -7,30 +7,6 @@
 
 #define FLUSH 15
 
-/**
- * @brief Copy pattern from window to window, updating window_pos.
- *
- * Handles potential overlap between source and destination regions by
- * copying backwards when the destination would "catch up" to the source.
- */
-TAMP_NOINLINE static void window_copy(unsigned char* window, uint16_t* window_pos, uint16_t window_offset,
-                                      uint8_t match_size, uint16_t window_mask) {
-    const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask;
-
-    if (TAMP_UNLIKELY(src_to_dst < match_size && src_to_dst > 0)) {
-        /* Overlap with dst > src: copy backwards to avoid corruption. */
-        for (uint8_t i = match_size; i-- > 0;) {
-            window[(*window_pos + i) & window_mask] = window[window_offset + i];
-        }
-        *window_pos = (*window_pos + match_size) & window_mask;
-    } else {
-        for (uint8_t i = 0; i < match_size; i++) {
-            window[*window_pos] = window[window_offset + i];
-            *window_pos = (*window_pos + 1) & window_mask;
-        }
-    }
-}
-
 #if TAMP_V2_DECOMPRESS
 /* Token state for v2 decode suspend/resume (2 bits).
  * TOKEN_RLE and TOKEN_EXT_MATCH_FRESH are arranged so that:
@@ -57,6 +33,30 @@ static const uint8_t HUFFMAN_TABLE[128] = {
     17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  17,  17,  17,  17, 17, 17,  17,  17,  17,  17, 17,
     17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  17,  17,  17,  17, 17, 17,  17,  17,  17};
 
+/**
+ * @brief Copy pattern from window to window, updating window_pos.
+ *
+ * Handles potential overlap between source and destination regions by
+ * copying backwards when the destination would "catch up" to the source.
+ */
+TAMP_NOINLINE static void window_copy(unsigned char* window, uint16_t* window_pos, uint16_t window_offset,
+                                      uint8_t match_size, uint16_t window_mask) {
+    const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask;
+
+    if (TAMP_UNLIKELY(src_to_dst < match_size && src_to_dst > 0)) {
+        /* Overlap with dst > src: copy backwards to avoid corruption. */
+        for (uint8_t i = match_size; i-- > 0;) {
+            window[(*window_pos + i) & window_mask] = window[window_offset + i];
+        }
+        *window_pos = (*window_pos + match_size) & window_mask;
+    } else {
+        for (uint8_t i = 0; i < match_size; i++) {
+            window[*window_pos] = window[window_offset + i];
+            *window_pos = (*window_pos + 1) & window_mask;
+        }
+    }
+}
+
 /**
  * @brief Decode huffman symbol + optional trailing bits from bit buffer.
  *

From 131c25a8552329892b9382d5910d77634c627db7 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sun, 1 Feb 2026 12:29:02 -0500
Subject: [PATCH 030/109] more comments

---
 tamp/_c_src/tamp/decompressor.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index d419df11..5a289232 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -38,13 +38,36 @@ static const uint8_t HUFFMAN_TABLE[128] = {
  *
  * Handles potential overlap between source and destination regions by
  * copying backwards when the destination would "catch up" to the source.
+ *
+ * IMPORTANT: Caller must validate that (window_offset + match_size) does not
+ * exceed window bounds before calling this function. This function assumes
+ * window_offset and match_size are pre-validated and does not perform
+ * bounds checking on source reads.
+ *
+ * @param window Circular buffer (size must be power of 2)
+ * @param window_pos Current write position (updated by this function)
+ * @param window_offset Source position to copy from
+ * @param match_size Number of bytes to copy
+ * @param window_mask Bitmask for wrapping (window_size - 1)
  */
 TAMP_NOINLINE static void window_copy(unsigned char* window, uint16_t* window_pos, uint16_t window_offset,
                                       uint8_t match_size, uint16_t window_mask) {
+    /* Calculate distance from source to destination in circular buffer.
+     * src_to_dst = (dst - src) & mask gives the forward distance. */
     const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask;
 
+    /* Critical overlap case: destination is AHEAD of source and they overlap.
+     * When dst > src by less than match_size, a forward copy corrupts data because
+     * we write to positions before reading from them.
+     *
+     * Example: src=100, dst=105, match_size=8
+     *   - Forward copy at i=5 would read window[105], but we already overwrote it at i=0!
+     *   - Must copy in REVERSE order (end to start) to read source bytes before overwriting.
+     */
     if (TAMP_UNLIKELY(src_to_dst < match_size && src_to_dst > 0)) {
-        /* Overlap with dst > src: copy backwards to avoid corruption. */
+        /* Copy in reverse order: start from last byte, work backwards to first byte.
+         * This ensures we read all overlapping source bytes before they're overwritten.
+         * Destination wraps via mask; source doesn't need wrapping (pre-validated bounds). */
         for (uint8_t i = match_size; i-- > 0;) {
             window[(*window_pos + i) & window_mask] = window[window_offset + i];
         }

From 57aeacb801f549601d1a846f6e607e299ee4eca8 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sun, 1 Feb 2026 13:56:41 -0500
Subject: [PATCH 031/109] cleanup decompressor v2 flag check.

---
 tamp/_c_src/tamp/decompressor.c | 12 +++++++-----
 tamp/_c_src/tamp/decompressor.h |  4 +---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 5a289232..5222f4fb 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -286,10 +286,13 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
     }
     *output_written_size += to_write;
 
-    /* Update window only on complete decode. */
+    /* Update window only on complete decode.
+     * Write up to end of buffer (no wrap), matching RLE behavior. */
     if (d->token_state == TOKEN_NONE) {
         uint16_t wp = d->window_pos;
-        window_copy(d->window, &wp, window_offset, match_size, window_size - 1);
+        uint16_t remaining = window_size - wp;
+        uint8_t window_write = (match_size < remaining) ? match_size : remaining;
+        window_copy(d->window, &wp, window_offset, window_write, window_size - 1);
         d->window_pos = wp;
     }
 
@@ -329,10 +332,9 @@ static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor* decompres
     decompressor->conf_literal = conf_literal;
     decompressor->min_pattern_size = tamp_compute_min_pattern_size(conf_window, conf_literal);
     decompressor->configured = true;
-#if TAMP_V2_DECOMPRESS
     decompressor->conf_v2 = conf_v2;
-#else
-    (void)conf_v2;
+#if !TAMP_V2_DECOMPRESS
+    if (conf_v2) return TAMP_INVALID_CONF;  // v2 stream but v2 support not compiled in
 #endif
 
     return TAMP_OK;
diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h
index 8e333ba8..7f1b5843 100644
--- a/tamp/_c_src/tamp/decompressor.h
+++ b/tamp/_c_src/tamp/decompressor.h
@@ -40,9 +40,7 @@ typedef struct {
     uint8_t conf_window : 4;       // Window bits from config
     uint8_t conf_literal : 4;      // Literal bits from config
     uint8_t min_pattern_size : 2;  // Minimum pattern size, 2 or 3
-#if TAMP_V2_DECOMPRESS
-    uint8_t conf_v2 : 1;  // v2 format enabled (from header)
-#endif
+    uint8_t conf_v2 : 1;           // v2 format enabled (from header)
 
     /* COLD: rarely accessed (init or edge cases).
      * Bitfields save space; add new cold fields here. */

From edb6ed41884d178a6964d6a6b396ece0724b8254 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sun, 1 Feb 2026 18:17:41 -0500
Subject: [PATCH 032/109] decompressor: retry decode after refill instead of returning early

---
 tamp/_c_src/tamp/decompressor.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 5222f4fb..29e4fb7f 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -431,7 +431,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
             }
             if (res == TAMP_INPUT_EXHAUSTED) {
                 refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
-                if (input == input_end) return TAMP_INPUT_EXHAUSTED;
+                /* Don't return yet - we may have refilled enough bits to continue.
+                 * The loop condition will exit when both input is exhausted AND
+                 * pos_and_state (including token_state) is zero. */
                 continue;
             }
             if (res != TAMP_OK) return res;

From d2a6f3c4c6f7de55d075261a4f0af9fd2b2e4dfb Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 09:05:09 -0500
Subject: [PATCH 033/109] fix infinite loop

---
 tamp/_c_src/tamp/decompressor.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 29e4fb7f..00169998 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -430,10 +430,13 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
                 res = decode_extended_match(decompressor, &output, output_end, output_written_size);
             }
             if (res == TAMP_INPUT_EXHAUSTED) {
+                uint8_t old_bit_pos = decompressor->bit_buffer_pos;
                 refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
-                /* Don't return yet - we may have refilled enough bits to continue.
-                 * The loop condition will exit when both input is exhausted AND
-                 * pos_and_state (including token_state) is zero. */
+                /* If we couldn't get more bits and input is exhausted, stop.
+                 * Otherwise the loop would run forever with token_state set. */
+                if (decompressor->bit_buffer_pos == old_bit_pos && input == input_end) {
+                    return TAMP_INPUT_EXHAUSTED;
+                }
                 continue;
             }
             if (res != TAMP_OK) return res;

From f55772bdc9d15a986b9a50e2227bab6defea837a Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 09:52:44 -0500
Subject: [PATCH 034/109] configure pytest in pyproject.toml

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index cf5b40ed..d2b2b717 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,6 +102,9 @@ unittest = [
 ]
 uprofiler = "https://github.com/BrianPugh/micropython-libs/blob/main/lib/uprofiler.py"
 
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
 [tool.coverage.run]
 branch = true
 omit = [

From 9d1b7858398fcb00692c122e13ee4c0ce7a5dd64 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 09:56:32 -0500
Subject: [PATCH 035/109] initial v2 c compressor implementation; needs
 optimizing.

---
 tamp/_c_src/tamp/compressor.c | 363 +++++++++++++++++++++++++++++++++-
 tamp/_c_src/tamp/compressor.h |  20 ++
 2 files changed, 377 insertions(+), 6 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 0e06100f..7347bfa0 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -9,7 +9,13 @@
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))
 
+#if TAMP_V2_COMPRESS
+// V2 max pattern: min_pattern_size + 11 + 112 = min_pattern_size + 123
+#define MAX_PATTERN_SIZE_V2 (compressor->min_pattern_size + 123)
+#define MAX_PATTERN_SIZE (compressor->conf_v2 ? MAX_PATTERN_SIZE_V2 : (compressor->min_pattern_size + 13))
+#else
 #define MAX_PATTERN_SIZE (compressor->min_pattern_size + 13)
+#endif
 #define WINDOW_SIZE (1 << compressor->conf_window)
 // 0xF because sizeof(TampCompressor.input) == 16;
 #define input_add(offset) ((compressor->input_pos + offset) & 0xF)
@@ -23,23 +29,64 @@ static const uint8_t huffman_codes[] = {0x0, 0x3, 0x8, 0xb, 0x14, 0x24, 0x26, 0x
 // These bit lengths pre-add the 1 bit for the 0-value is_literal flag.
 static const uint8_t huffman_bits[] = {0x2, 0x3, 0x5, 0x5, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0x7};
 
+#if TAMP_V2_COMPRESS
+// V2: Maximum RLE count = (13 << 4) + 15 + 2 = 225
+#define RLE_MAX_COUNT 225
+// V2: Maximum extended match extra = (13 << 3) + 7 + 1 = 112
+// Total max match = min_pattern_size + 11 + 112 = min_pattern_size + 123
+#define EXTENDED_MATCH_MAX_EXTRA 112
+#endif
+
 static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, uint8_t n_bits) {
     compressor->bit_buffer_pos += n_bits;
     compressor->bit_buffer |= bits << (32 - compressor->bit_buffer_pos);
 }
 
+#if TAMP_V2_COMPRESS
+/**
+ * @brief Write an extended huffman value: huffman-coded high bits followed by raw low bits.
+ *
+ * Used for both RLE count and extended match size encoding.
+ *
+ * @param[in,out] compressor Compressor whose bit buffer is appended to; no room check or flush happens here.
+ * @param[in] value The value to encode; (value >> trailing_bits) is used as the huffman table index.
+ * @param[in] trailing_bits Number of low bits written verbatim (3 for extended match, 4 for RLE).
+ */
+static inline void write_extended_huffman(TampCompressor *compressor, uint8_t value, uint8_t trailing_bits) {
+    uint8_t mask = (1 << trailing_bits) - 1;
+    uint8_t code_index = value >> trailing_bits;
+    // huffman_bits[] pre-adds 1 bit for the is_literal flag; no flag is written here, so subtract 1
+    write_to_bit_buffer(compressor, huffman_codes[code_index], huffman_bits[code_index] - 1);
+    write_to_bit_buffer(compressor, value & mask, trailing_bits);
+}
+
+#endif  // TAMP_V2_COMPRESS
+
 /**
  * @brief Partially flush the internal bit buffer.
  *
- * Up to 7 bits may remain in the internal bit buffer.
+ * @param[in] min_threshold Minimum bit_buffer_pos to trigger a flush; partial_flush() passes 8 (whole bytes).
+ *                          Must be >= 8: every iteration emits a full byte and subtracts 8 from bit_buffer_pos.
+ *
+ * Up to (min_threshold - 1) bits may remain in the internal bit buffer.
  */
-static inline tamp_res partial_flush(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                                     size_t *output_written_size) {
-    for (*output_written_size = output_size; compressor->bit_buffer_pos >= 8 && output_size;
+static inline tamp_res partial_flush_threshold(TampCompressor *compressor, unsigned char *output, size_t output_size,
+                                               size_t *output_written_size, uint8_t min_threshold) {
+    for (*output_written_size = output_size; compressor->bit_buffer_pos >= min_threshold && output_size;
          output_size--, compressor->bit_buffer_pos -= 8, compressor->bit_buffer <<= 8)
         *output++ = compressor->bit_buffer >> 24;
     *output_written_size -= output_size;
-    return (compressor->bit_buffer_pos >= 8) ? TAMP_OUTPUT_FULL : TAMP_OK;
+    return (compressor->bit_buffer_pos >= min_threshold) ? TAMP_OUTPUT_FULL : TAMP_OK;
+}
+
+/**
+ * @brief Flush every complete byte of the internal bit buffer to output.
+ *
+ * On TAMP_OK at most 7 bits remain buffered; TAMP_OUTPUT_FULL means output filled with >= 8 bits left.
+ */
+static inline tamp_res partial_flush(TampCompressor *compressor, unsigned char *output, size_t output_size,
+                                     size_t *output_written_size) {
+    return partial_flush_threshold(compressor, output, output_size, output_written_size, 8);
 }
 
 inline bool tamp_compressor_full(const TampCompressor *compressor) {
@@ -142,6 +189,9 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
         .use_custom_dictionary = false,
 #if TAMP_LAZY_MATCHING
         .lazy_matching = false,
+#endif
+#if TAMP_V2_COMPRESS
+        .v2 = true,  // Default to v2 format
 #endif
     };
     if (!conf) conf = &conf_default;
@@ -157,6 +207,9 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
 #if TAMP_LAZY_MATCHING
     compressor->conf_lazy_matching = conf->lazy_matching;
 #endif
+#if TAMP_V2_COMPRESS
+    compressor->conf_v2 = conf->v2;
+#endif
 
     compressor->window = window;
     compressor->min_pattern_size = tamp_compute_min_pattern_size(conf->window, conf->literal);
@@ -171,12 +224,120 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     write_to_bit_buffer(compressor, conf->window - 8, 3);
     write_to_bit_buffer(compressor, conf->literal - 5, 2);
     write_to_bit_buffer(compressor, conf->use_custom_dictionary, 1);
-    write_to_bit_buffer(compressor, 0, 1);  // Reserved
+#if TAMP_V2_COMPRESS
+    write_to_bit_buffer(compressor, conf->v2, 1);  // v2 format flag
+#else
+    write_to_bit_buffer(compressor, 0, 1);  // Reserved (v1 only)
+#endif
     write_to_bit_buffer(compressor, 0, 1);  // No more header bytes
 
     return TAMP_OK;
 }
 
+#if TAMP_V2_COMPRESS
+/**
+ * @brief Get the byte at (window_pos - 1), wrapping at the window size: the last byte written to the window.
+ */
+static inline uint8_t get_last_window_byte(TampCompressor *compressor) {
+    uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf_window) - 1);
+    return compressor->window[prev_pos];
+}
+
+/**
+ * @brief Append an RLE token (a repeat of the last window byte) to the bit buffer and update the window.
+ *
+ * @param[in,out] compressor Compressor state. Nothing is flushed here; the bit buffer must have room.
+ * @param[in] count Number of repeated bytes (must be >= 2; count - 2 is the value that gets encoded).
+ */
+static void write_rle_token(TampCompressor *compressor, uint8_t count) {
+    const uint16_t window_mask = (1 << compressor->conf_window) - 1;
+    uint8_t symbol = get_last_window_byte(compressor);
+
+    // Write RLE symbol (12) with literal flag
+    // Note: symbols 12 and 13 are at indices 12 and 13 in huffman table (not offset by min_pattern_size)
+    write_to_bit_buffer(compressor, huffman_codes[TAMP_RLE_SYMBOL], huffman_bits[TAMP_RLE_SYMBOL]);
+    // Write extended huffman for count-2 (count >= 2, so the encoded value starts at 0)
+    write_extended_huffman(compressor, count - 2, TAMP_LEADING_RLE_BITS);
+
+    // Write up to TAMP_RLE_MAX_WINDOW bytes to window, stopping at the buffer end (window_pos then wraps to 0)
+    uint16_t remaining = WINDOW_SIZE - compressor->window_pos;
+    uint8_t window_write = MIN(MIN(count, TAMP_RLE_MAX_WINDOW), remaining);
+    for (uint8_t i = 0; i < window_write; i++) {
+        compressor->window[compressor->window_pos] = symbol;
+        compressor->window_pos = (compressor->window_pos + 1) & window_mask;
+    }
+}
+
+/**
+ * @brief Write extended match token to bit buffer and update window.
+ *
+ * The token is written in three parts (symbol, extended huffman size, window position) with a partial
+ * flush between them, which keeps the 32-bit bit buffer from overflowing when up to 7 bits are pending.
+ * NOTE(review): on TAMP_OUTPUT_FULL the parts already written are not rolled back - confirm retry is safe.
+ *
+ * @param[in,out] compressor Compressor state.
+ * @param[out] output Output buffer for flushed bytes.
+ * @param[in] output_size Available space in output buffer.
+ * @param[out] output_written_size Bytes written to output.
+ * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small.
+ */
+static tamp_res write_extended_match_token(TampCompressor *compressor, unsigned char *output, size_t output_size,
+                                           size_t *output_written_size) {
+    const uint16_t window_mask = (1 << compressor->conf_window) - 1;
+    const uint8_t count = compressor->extended_match_count;
+    const uint16_t position = compressor->extended_match_position;
+    tamp_res res;
+    size_t flush_bytes;
+
+    *output_written_size = 0;
+
+    // Write extended match symbol (13) with literal flag (7 bits)
+    // Note: symbols 12 and 13 are at indices 12 and 13 in huffman table (not offset by min_pattern_size)
+    write_to_bit_buffer(compressor, huffman_codes[TAMP_EXTENDED_MATCH_SYMBOL],
+                        huffman_bits[TAMP_EXTENDED_MATCH_SYMBOL]);
+
+    // Flush to make room for extended huffman (huffman code without flag bit + trailing bits)
+    res = partial_flush(compressor, output, output_size, &flush_bytes);
+    *output_written_size += flush_bytes;
+    output += flush_bytes;
+    output_size -= flush_bytes;
+    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+
+    // Write extended huffman for (count - min_pattern_size - 11 - 1)
+    write_extended_huffman(compressor, count - compressor->min_pattern_size - 11 - 1, TAMP_LEADING_EXTENDED_MATCH_BITS);
+
+    // Flush to make room for window position (conf_window bits)
+    res = partial_flush(compressor, output, output_size, &flush_bytes);
+    *output_written_size += flush_bytes;
+    output += flush_bytes;
+    output_size -= flush_bytes;
+    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+
+    // Write window position
+    write_to_bit_buffer(compressor, position, compressor->conf_window);
+
+    // Flush any remaining complete bytes (an early return here skips the window update and state reset below)
+    res = partial_flush(compressor, output, output_size, &flush_bytes);
+    *output_written_size += flush_bytes;
+    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+
+    // Write to window (up to end of buffer, no wrap)
+    uint16_t remaining = WINDOW_SIZE - compressor->window_pos;
+    uint8_t window_write = MIN(count, remaining);
+    for (uint8_t i = 0; i < window_write; i++) {
+        compressor->window[compressor->window_pos] = compressor->window[position + i];
+        compressor->window_pos++;
+    }
+    compressor->window_pos &= window_mask;
+
+    // Reset extended match state
+    compressor->extended_match_count = 0;
+    compressor->extended_match_position = 0;
+
+    return TAMP_OK;
+}
+#endif  // TAMP_V2_COMPRESS
+
 TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned char *output, size_t output_size,
                                             size_t *output_written_size) {
     tamp_res res;
@@ -200,9 +361,132 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 
     if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL;
 
+#if TAMP_V2_COMPRESS
+    // V2: Handle extended match continuation
+    if (TAMP_UNLIKELY(compressor->conf_v2 && compressor->extended_match_count)) {
+        // We're in extended match mode - try to extend the match using search-based extension
+        // (matching Python's behavior: search for current_pattern + next_byte in window)
+        const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
+        const unsigned char *window = compressor->window;
+
+        while (compressor->input_size > 0) {
+            // Check if extending would go beyond window buffer boundary (no wrap-around)
+            if (compressor->extended_match_position + compressor->extended_match_count >= WINDOW_SIZE) {
+                size_t token_bytes;
+                res = write_extended_match_token(compressor, output, output_size, &token_bytes);
+                (*output_written_size) += token_bytes;
+                if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+                return TAMP_OK;
+            }
+
+            // Check if we've reached max extended match size
+            if (compressor->extended_match_count >= max_ext_match) {
+                size_t token_bytes;
+                res = write_extended_match_token(compressor, output, output_size, &token_bytes);
+                (*output_written_size) += token_bytes;
+                if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+                return TAMP_OK;
+            }
+
+            // Search-based extension: search for pattern + next_byte in window
+            // This matches Python's _search behavior for extended match extension
+            const uint8_t current_count = compressor->extended_match_count;
+            const uint16_t current_pos = compressor->extended_match_position;
+            const uint8_t next_byte = read_input(0);
+            const uint8_t target_len = current_count + 1;
+
+            // Search window for a position where target_len bytes match
+            // Start from current_pos and search forward
+            uint16_t best_pos = current_pos;
+            uint8_t best_len = 0;
+
+            for (uint16_t search_pos = current_pos; search_pos + target_len <= WINDOW_SIZE; search_pos++) {
+                // Check if prefix matches
+                uint8_t match_len = 0;
+                for (uint8_t i = 0; i < current_count && match_len == i; i++) {
+                    if (window[search_pos + i] == window[current_pos + i]) {
+                        match_len = i + 1;
+                    }
+                }
+                if (match_len < current_count) continue;  // Current pattern doesn't match here
+
+                // Check if next_byte also matches
+                if (window[search_pos + current_count] == next_byte) {
+                    // Found a match of target_len bytes
+                    best_pos = search_pos;
+                    best_len = target_len;
+                    break;  // Take first match (same as Python's index())
+                }
+            }
+
+            if (best_len > current_count) {
+                // Found longer match - update position and count
+                compressor->extended_match_count = best_len;
+                compressor->extended_match_position = best_pos;
+                compressor->input_pos = input_add(1);
+                compressor->input_size--;
+
+                if (compressor->extended_match_count >= max_ext_match) {
+                    size_t token_bytes;
+                    res = write_extended_match_token(compressor, output, output_size, &token_bytes);
+                    (*output_written_size) += token_bytes;
+                    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+                    return TAMP_OK;
+                }
+            } else {
+                // Match ended - emit current match
+                size_t token_bytes;
+                res = write_extended_match_token(compressor, output, output_size, &token_bytes);
+                (*output_written_size) += token_bytes;
+                if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+                return TAMP_OK;
+            }
+        }
+        // Ran out of input while extending - return and wait for more
+        return TAMP_OK;
+    }
+#endif  // TAMP_V2_COMPRESS
+
     uint8_t match_size = 0;
     uint16_t match_index = 0;
 
+#if TAMP_V2_COMPRESS
+    // V2: Handle RLE accumulation with persistent state
+    // For simplicity in C, we commit RLE immediately when the run ends
+    if (TAMP_UNLIKELY(compressor->conf_v2)) {
+        uint8_t last_byte = get_last_window_byte(compressor);
+
+        // Count and CONSUME matching bytes
+        while (compressor->input_size > 0 && compressor->rle_count < RLE_MAX_COUNT) {
+            if (read_input(0) == last_byte) {
+                compressor->rle_count++;
+                compressor->input_pos = input_add(1);
+                compressor->input_size--;
+            } else {
+                break;
+            }
+        }
+
+        // If we consumed whole buffer and haven't hit max, return (accumulate more)
+        if (compressor->input_size == 0 && compressor->rle_count < RLE_MAX_COUNT && compressor->rle_count > 0) {
+            return TAMP_OK;
+        }
+
+        // RLE run has ended
+        if (compressor->rle_count >= 2) {
+            // Commit the RLE (simplified approach for C)
+            write_rle_token(compressor, compressor->rle_count);
+            compressor->rle_count = 0;
+            return TAMP_OK;
+        } else if (compressor->rle_count == 1) {
+            // Single byte - push it back to input for normal literal encoding
+            compressor->input_pos = input_add(-1);
+            compressor->input_size++;
+            compressor->rle_count = 0;
+        }
+    }
+#endif  // TAMP_V2_COMPRESS
+
 #if TAMP_LAZY_MATCHING
     if (compressor->conf_lazy_matching) {
         // Check if we have a cached match from lazy matching
@@ -250,6 +534,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
             } else {
                 // Use current match, clear cache
                 compressor->cached_match_index = -1;
+                // Note: No V2 extended match check here - we're in the match_size <= 8 branch,
+                // so extended matches (which require match_size > min_pattern_size + 11) are impossible.
                 uint8_t huffman_index = match_size - compressor->min_pattern_size;
                 write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]);
                 write_to_bit_buffer(compressor, match_index, compressor->conf_window);
@@ -266,6 +552,17 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
         } else {
             // Write TOKEN
             compressor->cached_match_index = -1;  // Clear cache
+#if TAMP_V2_COMPRESS
+            // V2: Check for extended match
+            if (compressor->conf_v2 && match_size > compressor->min_pattern_size + 11) {
+                compressor->extended_match_count = match_size;
+                compressor->extended_match_position = match_index;
+                // Consume matched bytes from input
+                compressor->input_pos = input_add(match_size);
+                compressor->input_size -= match_size;
+                return TAMP_OK;
+            }
+#endif
             uint8_t huffman_index = match_size - compressor->min_pattern_size;
             write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]);
             write_to_bit_buffer(compressor, match_index, compressor->conf_window);
@@ -283,6 +580,17 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
             }
             write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1);
         } else {
+#if TAMP_V2_COMPRESS
+            // V2: Check for extended match
+            if (compressor->conf_v2 && match_size > compressor->min_pattern_size + 11) {
+                compressor->extended_match_count = match_size;
+                compressor->extended_match_position = match_index;
+                // Consume matched bytes from input
+                compressor->input_pos = input_add(match_size);
+                compressor->input_size -= match_size;
+                return TAMP_OK;
+            }
+#endif
             // Write TOKEN
             uint8_t huffman_index = match_size - compressor->min_pattern_size;
             write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]);
@@ -375,6 +683,49 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
         output += chunk_output_written_size;
     }
 
+#if TAMP_V2_COMPRESS
+    // V2: Flush any pending RLE
+    if (compressor->conf_v2 && compressor->rle_count >= 1) {
+        // Partial flush first to make room
+        res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
+        (*output_written_size) += chunk_output_written_size;
+        if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+        output_size -= chunk_output_written_size;
+        output += chunk_output_written_size;
+
+        if (compressor->rle_count == 1) {
+            // Single byte - write as literal (can't use RLE token for count < 2)
+            uint8_t literal = get_last_window_byte(compressor);
+            write_to_bit_buffer(compressor, IS_LITERAL_FLAG | literal, compressor->conf_literal + 1);
+
+            // Write to window
+            const uint16_t window_mask = (1 << compressor->conf_window) - 1;
+            compressor->window[compressor->window_pos] = literal;
+            compressor->window_pos = (compressor->window_pos + 1) & window_mask;
+        } else {
+            // count >= 2: write as RLE token
+            write_rle_token(compressor, compressor->rle_count);
+        }
+        compressor->rle_count = 0;
+
+        // Partial flush again after writing token
+        res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
+        (*output_written_size) += chunk_output_written_size;
+        if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+        output_size -= chunk_output_written_size;
+        output += chunk_output_written_size;
+    }
+
+    // V2: Flush any pending extended match
+    if (compressor->conf_v2 && compressor->extended_match_count) {
+        res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
+        (*output_written_size) += chunk_output_written_size;
+        if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+        output_size -= chunk_output_written_size;
+        output += chunk_output_written_size;
+    }
+#endif
+
     // Perform partial flush to see if we need a FLUSH token (check if output buffer in not empty),
     // and to subsequently make room for the FLUSH token.
     res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
diff --git a/tamp/_c_src/tamp/compressor.h b/tamp/_c_src/tamp/compressor.h
index cd6ca1f0..91de900f 100644
--- a/tamp/_c_src/tamp/compressor.h
+++ b/tamp/_c_src/tamp/compressor.h
@@ -24,6 +24,9 @@ typedef struct TampCompressor {
     uint8_t conf_use_custom_dictionary;  // Use a custom initialized dictionary.
 #if TAMP_LAZY_MATCHING
     uint8_t conf_lazy_matching;  // Use lazy matching for better compression
+#endif
+#if TAMP_V2_COMPRESS
+    uint8_t conf_v2;  // Use v2 format (RLE, extended match)
 #endif
     uint8_t min_pattern_size;
 
@@ -32,6 +35,13 @@ typedef struct TampCompressor {
     int16_t cached_match_index;
     uint8_t cached_match_size;
 #endif
+
+#if TAMP_V2_COMPRESS
+    /* V2 state */
+    uint8_t rle_count;                 // Current RLE run length (max 225)
+    uint8_t extended_match_count;      // Current extended match size (max ~126)
+    uint16_t extended_match_position;  // Window position for extended match
+#endif
 #else  // Use bitfields for reduced memory-usage
     /* Conf attributes */
     uint32_t conf_window : 4;                 // number of window bits
@@ -40,6 +50,9 @@ typedef struct TampCompressor {
 #if TAMP_LAZY_MATCHING
     uint32_t conf_lazy_matching : 1;  // Use lazy matching for better compression
 #endif
+#if TAMP_V2_COMPRESS
+    uint32_t conf_v2 : 1;  // Use v2 format (RLE, extended match)
+#endif
 
     /* Other small attributes */
     uint32_t window_pos : 15;
@@ -54,6 +67,13 @@ typedef struct TampCompressor {
     int16_t cached_match_index;
     uint8_t cached_match_size;
 #endif
+
+#if TAMP_V2_COMPRESS
+    /* V2 state */
+    uint8_t rle_count;                 // Current RLE run length (max 225)
+    uint8_t extended_match_count;      // Current extended match size (max ~126)
+    uint16_t extended_match_position;  // Window position for extended match
+#endif
 #endif  // TAMP_ESP32
     unsigned char input[16] /* __attribute__ ((aligned (16)))*/;
     uint32_t bit_buffer;

From 775ee58f0f815d6c083f5fd3725dadc14f56362c Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 10:26:29 -0500
Subject: [PATCH 036/109] v2-compressed datasets

---
 .gitattributes                                |  1 +
 .gitignore                                    |  2 +
 .../RPI_PICO-20250415-v1.25.0.uf2.tamp        |  3 +
 datasets/v2-compressed/dickens.tamp           |  3 +
 datasets/v2-compressed/enwik8.tamp            |  3 +
 datasets/v2-compressed/mozilla.tamp           |  3 +
 datasets/v2-compressed/mr.tamp                |  3 +
 datasets/v2-compressed/nci.tamp               |  3 +
 datasets/v2-compressed/ooffice.tamp           |  3 +
 datasets/v2-compressed/osdb.tamp              |  3 +
 datasets/v2-compressed/reymont.tamp           |  3 +
 datasets/v2-compressed/samba.tamp             |  3 +
 datasets/v2-compressed/sao.tamp               |  3 +
 datasets/v2-compressed/webster.tamp           |  3 +
 datasets/v2-compressed/x-ray.tamp             |  3 +
 datasets/v2-compressed/xml.tamp               |  3 +
 tests/test_dataset_regression.py              | 76 +++++++++++++++++++
 17 files changed, 121 insertions(+)
 create mode 100644 datasets/v2-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp
 create mode 100644 datasets/v2-compressed/dickens.tamp
 create mode 100644 datasets/v2-compressed/enwik8.tamp
 create mode 100644 datasets/v2-compressed/mozilla.tamp
 create mode 100644 datasets/v2-compressed/mr.tamp
 create mode 100644 datasets/v2-compressed/nci.tamp
 create mode 100644 datasets/v2-compressed/ooffice.tamp
 create mode 100644 datasets/v2-compressed/osdb.tamp
 create mode 100644 datasets/v2-compressed/reymont.tamp
 create mode 100644 datasets/v2-compressed/samba.tamp
 create mode 100644 datasets/v2-compressed/sao.tamp
 create mode 100644 datasets/v2-compressed/webster.tamp
 create mode 100644 datasets/v2-compressed/x-ray.tamp
 create mode 100644 datasets/v2-compressed/xml.tamp

diff --git a/.gitattributes b/.gitattributes
index 5f6c2223..530b7adf 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,2 @@
 datasets/v1-compressed/** filter=lfs diff=lfs merge=lfs -text
+datasets/v2-compressed/** filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index 4344bd22..34568b58 100644
--- a/.gitignore
+++ b/.gitignore
@@ -249,10 +249,12 @@ Temporary Items
 # Compression benchmark datasets
 datasets/*
 !datasets/v1-compressed/
+!datasets/v2-compressed/
 enwik8*
 *.pkl
 *.tamp
 !datasets/v1-compressed/**
+!datasets/v2-compressed/**
 
 # Cython-generated files
 tamp/_c_compressor.c
diff --git a/datasets/v2-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp b/datasets/v2-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp
new file mode 100644
index 00000000..1232a502
--- /dev/null
+++ b/datasets/v2-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb7efbdc484cbf400817074910275c3f2a89aec0ab96c8984fd58423e5e9290
+size 291036
diff --git a/datasets/v2-compressed/dickens.tamp b/datasets/v2-compressed/dickens.tamp
new file mode 100644
index 00000000..a1ed82b9
--- /dev/null
+++ b/datasets/v2-compressed/dickens.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db20dcfa7e76829a809a4c9d253f1b4e53b294e86db789490bc4fadb19ab5dc0
+size 5538332
diff --git a/datasets/v2-compressed/enwik8.tamp b/datasets/v2-compressed/enwik8.tamp
new file mode 100644
index 00000000..edf7491b
--- /dev/null
+++ b/datasets/v2-compressed/enwik8.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24f3e70e49e7344ebbe3ab23d274910f7aff5bb1fb1212658b1f136a99d244f4
+size 51019055
diff --git a/datasets/v2-compressed/mozilla.tamp b/datasets/v2-compressed/mozilla.tamp
new file mode 100644
index 00000000..b1ee2c59
--- /dev/null
+++ b/datasets/v2-compressed/mozilla.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cedba7ad7710757f3c5e7bf8176fd92eede9b8e5be2e8e697f9a6dc15d45718
+size 24415401
diff --git a/datasets/v2-compressed/mr.tamp b/datasets/v2-compressed/mr.tamp
new file mode 100644
index 00000000..404f4170
--- /dev/null
+++ b/datasets/v2-compressed/mr.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a619cf3440c067f6bd5eebcf6b2145d583feca83067244c0c2585aecb4b3cae
+size 4519929
diff --git a/datasets/v2-compressed/nci.tamp b/datasets/v2-compressed/nci.tamp
new file mode 100644
index 00000000..085b3bfa
--- /dev/null
+++ b/datasets/v2-compressed/nci.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:adf1eac94e48d44092a9e64a7e8d14e1452b357f176e2aba7ed26eb2b7340946
+size 6855616
diff --git a/datasets/v2-compressed/ooffice.tamp b/datasets/v2-compressed/ooffice.tamp
new file mode 100644
index 00000000..d8243440
--- /dev/null
+++ b/datasets/v2-compressed/ooffice.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9376fe3e9f0286c1edbfafeadb1a8d997dd8524a32dbce7438636f270d61789
+size 3773219
diff --git a/datasets/v2-compressed/osdb.tamp b/datasets/v2-compressed/osdb.tamp
new file mode 100644
index 00000000..5dac28be
--- /dev/null
+++ b/datasets/v2-compressed/osdb.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b3188714fc3361691b684fecbff75b2d2cb9e6e690887aec456469d7505a586
+size 8466736
diff --git a/datasets/v2-compressed/reymont.tamp b/datasets/v2-compressed/reymont.tamp
new file mode 100644
index 00000000..14072341
--- /dev/null
+++ b/datasets/v2-compressed/reymont.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e45bf12f0ca5971b47cfca38b2cc47216c93e13915b9b3ac19aa4195b9e87d5
+size 2818601
diff --git a/datasets/v2-compressed/samba.tamp b/datasets/v2-compressed/samba.tamp
new file mode 100644
index 00000000..259d6ccc
--- /dev/null
+++ b/datasets/v2-compressed/samba.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac74af80542ad88dd617b95047e2a77e820cfcac3ef17abb8904949b87fd605f
+size 8386303
diff --git a/datasets/v2-compressed/sao.tamp b/datasets/v2-compressed/sao.tamp
new file mode 100644
index 00000000..46c39e26
--- /dev/null
+++ b/datasets/v2-compressed/sao.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c05ac1c7d78b04874f07e10265cd254ecf9d6dcf1a3f0d1ea695815509ff0b1
+size 6136077
diff --git a/datasets/v2-compressed/webster.tamp b/datasets/v2-compressed/webster.tamp
new file mode 100644
index 00000000..6c6835a9
--- /dev/null
+++ b/datasets/v2-compressed/webster.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1227df26234f9c9cf53d9be82d6a38ecd344db70bd6b25109a6e5ae1d4c1673f
+size 18146647
diff --git a/datasets/v2-compressed/x-ray.tamp b/datasets/v2-compressed/x-ray.tamp
new file mode 100644
index 00000000..981eb1dd
--- /dev/null
+++ b/datasets/v2-compressed/x-ray.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ba0c1fb79addae24888c12a466e84b73c32ca608836c458487226d224a63fc3
+size 7509449
diff --git a/datasets/v2-compressed/xml.tamp b/datasets/v2-compressed/xml.tamp
new file mode 100644
index 00000000..67e68827
--- /dev/null
+++ b/datasets/v2-compressed/xml.tamp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6adb3788d5644d28c56ea785ea4149a6f0b8ba5562c4f8dcc4a4ba8371920e97
+size 1473552
diff --git a/tests/test_dataset_regression.py b/tests/test_dataset_regression.py
index 734401e6..448619f6 100644
--- a/tests/test_dataset_regression.py
+++ b/tests/test_dataset_regression.py
@@ -73,6 +73,66 @@
     ),
 ]
 
+# V2 format datasets (uses RLE and Extended Match encoding)
+V2_DATASETS = [
+    (
+        "datasets/v2-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp",
+        "e0c40eacf1afc550a6add74888c48bb981b28788a6d75a62a0e2444e997b9864",
+    ),
+    (
+        "datasets/v2-compressed/dickens.tamp",
+        "b24c37886142e11d0ee687db6ab06f936207aa7f2ea1fd1d9a36763c7a507e6a",
+    ),
+    (
+        "datasets/v2-compressed/mr.tamp",
+        "68637ed52e3e4860174ed2dc0840ac77d5f1a60abbcb13770d5754e3774d53e6",
+    ),
+    (
+        "datasets/v2-compressed/ooffice.tamp",
+        "e7ee013880d34dd5208283d0d3d91b07f442e067454276095ded14f322a656eb",
+    ),
+    (
+        "datasets/v2-compressed/osdb.tamp",
+        "60f027179302ca3ad87c58ac90b6be72ec23588aaa7a3b7fe8ecc0f11def3fa3",
+    ),
+    (
+        "datasets/v2-compressed/reymont.tamp",
+        "0eac0114a3dfe6e2ee1f345a0f79d653cb26c3bc9f0ed79238af4933422b7578",
+    ),
+    (
+        "datasets/v2-compressed/sao.tamp",
+        "c2d0ea2cc59d4c21b7fe43a71499342a00cbe530a1d5548770e91ecd6214adcc",
+    ),
+    (
+        "datasets/v2-compressed/x-ray.tamp",
+        "7de9fce1405dc44ae5e6813ed21cd5751e761bd4265655a005d39b9685d1c9ad",
+    ),
+    (
+        "datasets/v2-compressed/xml.tamp",
+        "0e82e54e695c1938e4193448022543845b33020c8be6bf3bf3ead2224903e08c",
+    ),
+    (
+        "datasets/v2-compressed/samba.tamp",
+        "93ba07bc44d8267789c1d911992f40b089ffa2140b4a160fac11ccae9a40e7b2",
+    ),
+    (
+        "datasets/v2-compressed/nci.tamp",
+        "fc63a31770947b8c2062d3b19ca94c00485a232bb91b502021948fee983e1635",
+    ),
+    (
+        "datasets/v2-compressed/webster.tamp",
+        "6a68f69b26daf09f9dd84f7470368553194a0b294fcfa80f1604efb11143a383",
+    ),
+    (
+        "datasets/v2-compressed/mozilla.tamp",
+        "657fc3764b0c75ac9de9623125705831ebbfbe08fed248df73bc2dc66e2a963b",
+    ),
+    (
+        "datasets/v2-compressed/enwik8.tamp",
+        "2b49720ec4d78c3c9fabaee6e4179a5e997302b3a70029f30f2d582218c024a8",
+    ),
+]
+
 
 class TestV1Decompression(unittest.TestCase):
     @pytest.mark.dataset
@@ -90,5 +150,21 @@ def test_v1_decompress(self):
                     self.assertEqual(actual, expected_sha256, f"SHA256 mismatch for {rel_path} using {impl_name}")
 
 
+class TestV2Decompression(unittest.TestCase):
+    @pytest.mark.dataset
+    def test_v2_decompress(self):
+        for impl_name, decompress_func in DECOMPRESSOR_IMPLEMENTATIONS:
+            for rel_path, expected_sha256 in V2_DATASETS:
+                with self.subTest(implementation=impl_name, dataset=rel_path):
+                    path = PROJECT_DIR / rel_path
+
+                    with open(path, "rb") as f:
+                        data = f.read()
+
+                    decompressed = decompress_func(data)
+                    actual = hashlib.sha256(decompressed).hexdigest()
+                    self.assertEqual(actual, expected_sha256, f"SHA256 mismatch for {rel_path} using {impl_name}")
+
+
 if __name__ == "__main__":
     unittest.main()

From b573f3aeca42a75c9b34ba9754e96136923165d4 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 13:07:52 -0500
Subject: [PATCH 037/109] simplify pattern-extending.

---
 tamp/_c_src/tamp/compressor.c | 58 +++++++----------------------------
 1 file changed, 11 insertions(+), 47 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 7347bfa0..ed6791d8 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -364,14 +364,16 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 #if TAMP_V2_COMPRESS
     // V2: Handle extended match continuation
     if (TAMP_UNLIKELY(compressor->conf_v2 && compressor->extended_match_count)) {
-        // We're in extended match mode - try to extend the match using search-based extension
-        // (matching Python's behavior: search for current_pattern + next_byte in window)
+        // We're in extended match mode - try to extend the match at the current position
         const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
         const unsigned char *window = compressor->window;
 
         while (compressor->input_size > 0) {
+            const uint16_t current_pos = compressor->extended_match_position;
+            const uint8_t current_count = compressor->extended_match_count;
+
             // Check if extending would go beyond window buffer boundary (no wrap-around)
-            if (compressor->extended_match_position + compressor->extended_match_count >= WINDOW_SIZE) {
+            if (current_pos + current_count >= WINDOW_SIZE) {
                 size_t token_bytes;
                 res = write_extended_match_token(compressor, output, output_size, &token_bytes);
                 (*output_written_size) += token_bytes;
@@ -380,7 +382,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
             }
 
             // Check if we've reached max extended match size
-            if (compressor->extended_match_count >= max_ext_match) {
+            if (current_count >= max_ext_match) {
                 size_t token_bytes;
                 res = write_extended_match_token(compressor, output, output_size, &token_bytes);
                 (*output_written_size) += token_bytes;
@@ -388,51 +390,13 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
                 return TAMP_OK;
             }
 
-            // Search-based extension: search for pattern + next_byte in window
-            // This matches Python's _search behavior for extended match extension
-            const uint8_t current_count = compressor->extended_match_count;
-            const uint16_t current_pos = compressor->extended_match_position;
-            const uint8_t next_byte = read_input(0);
-            const uint8_t target_len = current_count + 1;
-
-            // Search window for a position where target_len bytes match
-            // Start from current_pos and search forward
-            uint16_t best_pos = current_pos;
-            uint8_t best_len = 0;
-
-            for (uint16_t search_pos = current_pos; search_pos + target_len <= WINDOW_SIZE; search_pos++) {
-                // Check if prefix matches
-                uint8_t match_len = 0;
-                for (uint8_t i = 0; i < current_count && match_len == i; i++) {
-                    if (window[search_pos + i] == window[current_pos + i]) {
-                        match_len = i + 1;
-                    }
-                }
-                if (match_len < current_count) continue;  // Current pattern doesn't match here
-
-                // Check if next_byte also matches
-                if (window[search_pos + current_count] == next_byte) {
-                    // Found a match of target_len bytes
-                    best_pos = search_pos;
-                    best_len = target_len;
-                    break;  // Take first match (same as Python's index())
-                }
-            }
-
-            if (best_len > current_count) {
-                // Found longer match - update position and count
-                compressor->extended_match_count = best_len;
-                compressor->extended_match_position = best_pos;
+            // O(1) extension check: does the next byte at current position match input?
+            if (window[current_pos + current_count] == read_input(0)) {
+                // Extension successful - consume input byte and increment count
+                compressor->extended_match_count++;
                 compressor->input_pos = input_add(1);
                 compressor->input_size--;
-
-                if (compressor->extended_match_count >= max_ext_match) {
-                    size_t token_bytes;
-                    res = write_extended_match_token(compressor, output, output_size, &token_bytes);
-                    (*output_written_size) += token_bytes;
-                    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-                    return TAMP_OK;
-                }
+                // Continue to next iteration to try extending further
             } else {
                 // Match ended - emit current match
                 size_t token_bytes;

From 391efdc6621fed18633d6fbe7af3209a7ba67fce Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 13:28:46 -0500
Subject: [PATCH 038/109] dead code cleanup

---
 tamp/_c_src/tamp/compressor.c | 47 +++++++++--------------------------
 1 file changed, 12 insertions(+), 35 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index ed6791d8..5bb2ef9a 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -65,28 +65,15 @@ static inline void write_extended_huffman(TampCompressor *compressor, uint8_t va
 /**
  * @brief Partially flush the internal bit buffer.
  *
- * @param[in] min_threshold Minimum bit_buffer_pos to trigger flush. Default 8 flushes whole bytes.
- *                          Use 7 for extended match tokens which need up to 26 bits (32 - 26 = 6 max remaining).
- *
- * Up to (min_threshold - 1) bits may remain in the internal bit buffer.
+ * Flushes complete bytes from the bit buffer. Up to 7 bits may remain.
  */
-static inline tamp_res partial_flush_threshold(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                                               size_t *output_written_size, uint8_t min_threshold) {
-    for (*output_written_size = output_size; compressor->bit_buffer_pos >= min_threshold && output_size;
+static inline tamp_res partial_flush(TampCompressor *compressor, unsigned char *output, size_t output_size,
+                                     size_t *output_written_size) {
+    for (*output_written_size = output_size; compressor->bit_buffer_pos >= 8 && output_size;
          output_size--, compressor->bit_buffer_pos -= 8, compressor->bit_buffer <<= 8)
         *output++ = compressor->bit_buffer >> 24;
     *output_written_size -= output_size;
-    return (compressor->bit_buffer_pos >= min_threshold) ? TAMP_OUTPUT_FULL : TAMP_OK;
-}
-
-/**
- * @brief Partially flush the internal bit buffer.
- *
- * Up to 7 bits may remain in the internal bit buffer.
- */
-static inline tamp_res partial_flush(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                                     size_t *output_written_size) {
-    return partial_flush_threshold(compressor, output, output_size, output_written_size, 8);
+    return (compressor->bit_buffer_pos >= 8) ? TAMP_OUTPUT_FULL : TAMP_OK;
 }
 
 inline bool tamp_compressor_full(const TampCompressor *compressor) {
@@ -271,9 +258,8 @@ static void write_rle_token(TampCompressor *compressor, uint8_t count) {
 /**
  * @brief Write extended match token to bit buffer and update window.
  *
- * Extended match tokens can be up to 26 bits (7 symbol + 9 extended_huffman + 10 window).
- * To avoid overflowing the 32-bit buffer when starting with up to 7 bits remaining,
- * we do intermediate flushes between token parts.
+ * Token format: symbol (7 bits) + extended_huffman (up to 11 bits) + window_pos (up to 15 bits)
+ * Total: up to 33 bits. We flush after symbol+huffman (18 bits max) to ensure window_pos fits.
  *
  * @param[in,out] compressor Compressor state.
  * @param[out] output Output buffer for flushed bytes.
@@ -291,32 +277,23 @@ static tamp_res write_extended_match_token(TampCompressor *compressor, unsigned
 
     *output_written_size = 0;
 
-    // Write extended match symbol (13) with literal flag (7 bits)
-    // Note: symbols 12 and 13 are at indices 12 and 13 in huffman table (not offset by min_pattern_size)
+    // Write symbol (7 bits) + extended huffman (up to 11 bits) = 18 bits max
+    // With ≤7 bits already in buffer, total ≤25 bits - fits in 32-bit buffer
     write_to_bit_buffer(compressor, huffman_codes[TAMP_EXTENDED_MATCH_SYMBOL],
                         huffman_bits[TAMP_EXTENDED_MATCH_SYMBOL]);
-
-    // Flush to make room for extended huffman (up to 9 bits)
-    res = partial_flush(compressor, output, output_size, &flush_bytes);
-    *output_written_size += flush_bytes;
-    output += flush_bytes;
-    output_size -= flush_bytes;
-    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-
-    // Write extended huffman for (count - min_pattern_size - 11 - 1)
     write_extended_huffman(compressor, count - compressor->min_pattern_size - 11 - 1, TAMP_LEADING_EXTENDED_MATCH_BITS);
 
-    // Flush to make room for window position (10 bits)
+    // Flush to make room for window position (up to 15 bits)
     res = partial_flush(compressor, output, output_size, &flush_bytes);
     *output_written_size += flush_bytes;
     output += flush_bytes;
     output_size -= flush_bytes;
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
-    // Write window position
+    // Write window position - with ≤7 bits remaining, up to 22 bits total - fits
     write_to_bit_buffer(compressor, position, compressor->conf_window);
 
-    // Flush any remaining complete bytes
+    // Final flush
     res = partial_flush(compressor, output, output_size, &flush_bytes);
     *output_written_size += flush_bytes;
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;

From 73660854123d73a25624193d0acd95429f2ea4ce Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 13:45:46 -0500
Subject: [PATCH 039/109] more simplification

---
 tamp/_c_src/tamp/compressor.c | 88 +++++++++++------------------------
 1 file changed, 27 insertions(+), 61 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 5bb2ef9a..3ecbb033 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -465,78 +465,44 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
             // literal and cache the next match
             if (next_match_size > match_size &&
                 validate_no_match_overlap(compressor->window_pos, next_match_index, next_match_size)) {
-                // Write LITERAL at current position
-                match_size = 1;
-                unsigned char c = read_input(0);
-                if (TAMP_UNLIKELY(c >> compressor->conf_literal)) {
-                    return TAMP_EXCESS_BITS;
-                }
-                write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1);
+                // Force literal at current position, cache next match
+                match_size = 0;  // Will trigger literal write below
             } else {
-                // Use current match, clear cache
                 compressor->cached_match_index = -1;
                 // Note: No V2 extended match check here - we're in the match_size <= 8 branch,
                 // so extended matches (which require match_size > min_pattern_size + 11) are impossible.
-                uint8_t huffman_index = match_size - compressor->min_pattern_size;
-                write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]);
-                write_to_bit_buffer(compressor, match_index, compressor->conf_window);
-            }
-        } else if (TAMP_UNLIKELY(match_size < compressor->min_pattern_size)) {
-            // Write LITERAL
-            compressor->cached_match_index = -1;  // Clear cache
-            match_size = 1;
-            unsigned char c = read_input(0);
-            if (TAMP_UNLIKELY(c >> compressor->conf_literal)) {
-                return TAMP_EXCESS_BITS;
             }
-            write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1);
         } else {
-            // Write TOKEN
             compressor->cached_match_index = -1;  // Clear cache
-#if TAMP_V2_COMPRESS
-            // V2: Check for extended match
-            if (compressor->conf_v2 && match_size > compressor->min_pattern_size + 11) {
-                compressor->extended_match_count = match_size;
-                compressor->extended_match_position = match_index;
-                // Consume matched bytes from input
-                compressor->input_pos = input_add(match_size);
-                compressor->input_size -= match_size;
-                return TAMP_OK;
-            }
-#endif
-            uint8_t huffman_index = match_size - compressor->min_pattern_size;
-            write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]);
-            write_to_bit_buffer(compressor, match_index, compressor->conf_window);
         }
-    } else
+    }
 #endif
-    {
-        // Non-lazy matching path
-        if (TAMP_UNLIKELY(match_size < compressor->min_pattern_size)) {
-            // Write LITERAL
-            match_size = 1;
-            unsigned char c = read_input(0);
-            if (TAMP_UNLIKELY(c >> compressor->conf_literal)) {
-                return TAMP_EXCESS_BITS;
-            }
-            write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1);
-        } else {
+
+    // Shared token/literal writing logic
+    if (TAMP_UNLIKELY(match_size < compressor->min_pattern_size)) {
+        // Write LITERAL
+        match_size = 1;
+        unsigned char c = read_input(0);
+        if (TAMP_UNLIKELY(c >> compressor->conf_literal)) {
+            return TAMP_EXCESS_BITS;
+        }
+        write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1);
+    } else {
 #if TAMP_V2_COMPRESS
-            // V2: Check for extended match
-            if (compressor->conf_v2 && match_size > compressor->min_pattern_size + 11) {
-                compressor->extended_match_count = match_size;
-                compressor->extended_match_position = match_index;
-                // Consume matched bytes from input
-                compressor->input_pos = input_add(match_size);
-                compressor->input_size -= match_size;
-                return TAMP_OK;
-            }
-#endif
-            // Write TOKEN
-            uint8_t huffman_index = match_size - compressor->min_pattern_size;
-            write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]);
-            write_to_bit_buffer(compressor, match_index, compressor->conf_window);
+        // V2: Check for extended match
+        if (compressor->conf_v2 && match_size > compressor->min_pattern_size + 11) {
+            compressor->extended_match_count = match_size;
+            compressor->extended_match_position = match_index;
+            // Consume matched bytes from input
+            compressor->input_pos = input_add(match_size);
+            compressor->input_size -= match_size;
+            return TAMP_OK;
         }
+#endif
+        // Write TOKEN
+        uint8_t huffman_index = match_size - compressor->min_pattern_size;
+        write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]);
+        write_to_bit_buffer(compressor, match_index, compressor->conf_window);
     }
     // Populate Window
     for (uint8_t i = 0; i < match_size; i++) {

From 8c222e39f1609b541c098370086571b2faf724b1 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 14:00:32 -0500
Subject: [PATCH 040/109] make window_pos a uint16 instead of a bitfield.

---
 tamp/_c_src/tamp/compressor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.h b/tamp/_c_src/tamp/compressor.h
index 91de900f..4b3ccf54 100644
--- a/tamp/_c_src/tamp/compressor.h
+++ b/tamp/_c_src/tamp/compressor.h
@@ -55,13 +55,13 @@ typedef struct TampCompressor {
 #endif
 
     /* Other small attributes */
-    uint32_t window_pos : 15;
     uint32_t bit_buffer_pos : 6;
     uint32_t min_pattern_size : 2;
-
     uint32_t input_size : 5;
     uint32_t input_pos : 4;
 
+    uint16_t window_pos;  // Window position (0 to window_size-1)
+
 #if TAMP_LAZY_MATCHING
     /* Lazy matching cache */
     int16_t cached_match_index;

From c8c39c2e90dd6f92e9dad01816431e819e55300d Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 14:13:34 -0500
Subject: [PATCH 041/109] decompress on-device-compression-benchmark results to
 check validity.

---
 Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 0949b15b..ad7a44be 100644
--- a/Makefile
+++ b/Makefile
@@ -227,7 +227,7 @@ define mpremote-sync
 	fi
 endef
 
-on-device-compression-benchmark: mpy build/enwik8-100kb build/enwik8-100kb.tamp
+on-device-compression-benchmark: mpy build/enwik8-100kb
 	$(MPREMOTE) rm :enwik8-100kb.tamp || true
 	@# Remove any viper implementation that may exist from previous belay syncs
 	$(MPREMOTE) rm :tamp/__init__.py :tamp/compressor_viper.py :tamp/decompressor_viper.py :tamp/compressor.py :tamp/decompressor.py :tamp/__main__.py :tamp/py.typed 2>/dev/null || true
@@ -238,7 +238,8 @@ on-device-compression-benchmark: mpy build/enwik8-100kb build/enwik8-100kb.tamp
 	$(MPREMOTE) soft-reset
 	$(MPREMOTE) run tools/on-device-compression-benchmark.py
 	$(MPREMOTE) cp :enwik8-100kb.tamp build/on-device-enwik8-100kb.tamp
-	cmp build/enwik8-100kb.tamp build/on-device-enwik8-100kb.tamp
+	poetry run tamp decompress build/on-device-enwik8-100kb.tamp -o build/on-device-enwik8-100kb-decompressed
+	cmp build/enwik8-100kb build/on-device-enwik8-100kb-decompressed
 	@echo "Success!"
 
 on-device-decompression-benchmark: mpy build/enwik8-100kb.tamp

From 0cba4eac0eb9ff9cbdb87f2ccfb4d8dd0f8cf8fa Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 14:20:05 -0500
Subject: [PATCH 042/109] more firmware optimizations

---
 tamp/_c_src/tamp/compressor.c | 4 ++--
 tamp/_c_src/tamp/compressor.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 3ecbb033..fa853df1 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -67,8 +67,8 @@ static inline void write_extended_huffman(TampCompressor *compressor, uint8_t va
  *
  * Flushes complete bytes from the bit buffer. Up to 7 bits may remain.
  */
-static inline tamp_res partial_flush(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                                     size_t *output_written_size) {
+static TAMP_NOINLINE tamp_res partial_flush(TampCompressor *compressor, unsigned char *output, size_t output_size,
+                                            size_t *output_written_size) {
     for (*output_written_size = output_size; compressor->bit_buffer_pos >= 8 && output_size;
          output_size--, compressor->bit_buffer_pos -= 8, compressor->bit_buffer <<= 8)
         *output++ = compressor->bit_buffer >> 24;
diff --git a/tamp/_c_src/tamp/compressor.h b/tamp/_c_src/tamp/compressor.h
index 4b3ccf54..aea698cc 100644
--- a/tamp/_c_src/tamp/compressor.h
+++ b/tamp/_c_src/tamp/compressor.h
@@ -55,12 +55,12 @@ typedef struct TampCompressor {
 #endif
 
     /* Other small attributes */
-    uint32_t bit_buffer_pos : 6;
     uint32_t min_pattern_size : 2;
     uint32_t input_size : 5;
     uint32_t input_pos : 4;
 
-    uint16_t window_pos;  // Window position (0 to window_size-1)
+    uint8_t bit_buffer_pos;  // 6 bits used; bits pending in bit_buffer (0-32)
+    uint16_t window_pos;     // 15 bits used; position in window (0 to window_size-1)
 
 #if TAMP_LAZY_MATCHING
     /* Lazy matching cache */

From 1d0649598d7be133acaa171316fd0c5373f56f3a Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 14:54:27 -0500
Subject: [PATCH 043/109] rework compressor struct.

---
 tamp/_c_src/tamp/compressor.c | 19 ++++----
 tamp/_c_src/tamp/compressor.h | 88 ++++++++++++++++-------------------
 2 files changed, 49 insertions(+), 58 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index fa853df1..a92c1704 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -184,6 +184,9 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     if (!conf) conf = &conf_default;
     if (conf->window < 8 || conf->window > 15) return TAMP_INVALID_CONF;
     if (conf->literal < 5 || conf->literal > 8) return TAMP_INVALID_CONF;
+#if !TAMP_V2_COMPRESS
+    if (conf->v2) return TAMP_INVALID_CONF;  // V2 requested but not compiled in
+#endif
 
     for (uint8_t i = 0; i < sizeof(TampCompressor); i++)  // Zero-out the struct
         ((unsigned char *)compressor)[i] = 0;
@@ -191,12 +194,10 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     compressor->conf_literal = conf->literal;
     compressor->conf_window = conf->window;
     compressor->conf_use_custom_dictionary = conf->use_custom_dictionary;
+    compressor->conf_v2 = conf->v2;
 #if TAMP_LAZY_MATCHING
     compressor->conf_lazy_matching = conf->lazy_matching;
 #endif
-#if TAMP_V2_COMPRESS
-    compressor->conf_v2 = conf->v2;
-#endif
 
     compressor->window = window;
     compressor->min_pattern_size = tamp_compute_min_pattern_size(conf->window, conf->literal);
@@ -208,14 +209,10 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     if (!compressor->conf_use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window));
 
     // Write header to bit buffer
-    write_to_bit_buffer(compressor, conf->window - 8, 3);
-    write_to_bit_buffer(compressor, conf->literal - 5, 2);
-    write_to_bit_buffer(compressor, conf->use_custom_dictionary, 1);
-#if TAMP_V2_COMPRESS
-    write_to_bit_buffer(compressor, conf->v2, 1);  // v2 format flag
-#else
-    write_to_bit_buffer(compressor, 0, 1);  // Reserved (v1 only)
-#endif
+    write_to_bit_buffer(compressor, compressor->conf_window - 8, 3);
+    write_to_bit_buffer(compressor, compressor->conf_literal - 5, 2);
+    write_to_bit_buffer(compressor, compressor->conf_use_custom_dictionary, 1);
+    write_to_bit_buffer(compressor, compressor->conf_v2, 1);
     write_to_bit_buffer(compressor, 0, 1);  // No more header bytes
 
     return TAMP_OK;
diff --git a/tamp/_c_src/tamp/compressor.h b/tamp/_c_src/tamp/compressor.h
index aea698cc..361bf112 100644
--- a/tamp/_c_src/tamp/compressor.h
+++ b/tamp/_c_src/tamp/compressor.h
@@ -7,78 +7,72 @@ extern "C" {
 
 #include "common.h"
 
-/* Externally, do not directly edit ANY of these attributes */
+/* Externally, do not directly edit ANY of these attributes.
+ * Fields are ordered by access frequency for cache efficiency.
+ */
 typedef struct TampCompressor {
-    /* nicely aligned attributes */
-
 #if TAMP_ESP32  // Avoid bitfields for speed.
-    uint32_t window_pos;
-    uint32_t bit_buffer_pos;
-
-    uint32_t input_size;
-    uint32_t input_pos;
-
-    /* Conf attributes */
-    uint8_t conf_window;                 // number of window bits
-    uint8_t conf_literal;                // number of literal bits
-    uint8_t conf_use_custom_dictionary;  // Use a custom initialized dictionary.
-#if TAMP_LAZY_MATCHING
-    uint8_t conf_lazy_matching;  // Use lazy matching for better compression
-#endif
-#if TAMP_V2_COMPRESS
-    uint8_t conf_v2;  // Use v2 format (RLE, extended match)
-#endif
-    uint8_t min_pattern_size;
+    /* HOT: accessed every iteration of the compression loop */
+    unsigned char *window;    // Pointer to window buffer
+    uint32_t bit_buffer;      // Bit buffer for output (32 bits)
+    uint32_t window_pos;      // Current position in window (15 bits used)
+    uint32_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits used)
+    uint32_t input_size;      // Bytes in input buffer (5 bits used; 0-16)
+    uint32_t input_pos;       // Current position in input buffer (4 bits used; 0-15)
+    unsigned char input[16];  // Input ring buffer
 
+    /* WARM: read frequently, often cached in locals */
+    uint8_t min_pattern_size;            // Minimum pattern size (2 bits used; 2 or 3)
+    uint8_t conf_window;                 // Window bits (4 bits used; 8-15)
+    uint8_t conf_literal;                // Literal bits (4 bits used; 5-8)
+    uint8_t conf_use_custom_dictionary;  // Custom dictionary (1 bit used; init only)
+    uint8_t conf_v2;                     // V2 format enabled (1 bit used)
 #if TAMP_LAZY_MATCHING
-    /* Lazy matching cache */
-    int16_t cached_match_index;
+    uint8_t conf_lazy_matching;  // Lazy matching enabled (1 bit used)
+    int16_t cached_match_index;  // Lazy matching cache
     uint8_t cached_match_size;
 #endif
 
 #if TAMP_V2_COMPRESS
-    /* V2 state */
+    /* V2 state (only needed when V2 compression is enabled) */
     uint8_t rle_count;                 // Current RLE run length (max 225)
     uint8_t extended_match_count;      // Current extended match size (max ~126)
     uint16_t extended_match_position;  // Window position for extended match
 #endif
-#else  // Use bitfields for reduced memory-usage
-    /* Conf attributes */
-    uint32_t conf_window : 4;                 // number of window bits
-    uint32_t conf_literal : 4;                // number of literal bits
-    uint32_t conf_use_custom_dictionary : 1;  // Use a custom initialized dictionary.
-#if TAMP_LAZY_MATCHING
-    uint32_t conf_lazy_matching : 1;  // Use lazy matching for better compression
-#endif
-#if TAMP_V2_COMPRESS
-    uint32_t conf_v2 : 1;  // Use v2 format (RLE, extended match)
-#endif
 
-    /* Other small attributes */
-    uint32_t min_pattern_size : 2;
-    uint32_t input_size : 5;
-    uint32_t input_pos : 4;
+#else  // Use bitfields for reduced memory-usage
 
-    uint8_t bit_buffer_pos;  // 6 bits used; bits pending in bit_buffer (0-32)
-    uint16_t window_pos;     // 15 bits used; position in window (0 to window_size-1)
+    /* HOT: accessed every iteration of the compression loop */
+    unsigned char *window;    // Pointer to window buffer
+    uint32_t bit_buffer;      // Bit buffer for output (32 bits)
+    uint16_t window_pos;      // Current position in window (15 bits used)
+    uint8_t bit_buffer_pos;   // Bits currently in bit_buffer (6 bits used)
+    uint8_t input_size;       // Bytes in input buffer (5 bits used; 0-16)
+    uint8_t input_pos;        // Current position in input buffer (4 bits used; 0-15)
+    unsigned char input[16];  // Input ring buffer
 
+    /* WARM: read frequently, often cached in locals.
+     * Bitfields: min_pattern_size(2) + conf_window(4) + conf_literal(4) +
+     *            conf_use_custom_dictionary(1) + conf_v2(1) + conf_lazy_matching(1) = 13 bits
+     */
+    uint8_t min_pattern_size : 2;            // Minimum pattern size (2 or 3)
+    uint8_t conf_window : 4;                 // Window bits (8-15)
+    uint8_t conf_literal : 4;                // Literal bits (5-8)
+    uint8_t conf_use_custom_dictionary : 1;  // Custom dictionary (init only)
+    uint8_t conf_v2 : 1;                     // V2 format enabled
 #if TAMP_LAZY_MATCHING
-    /* Lazy matching cache */
-    int16_t cached_match_index;
+    uint8_t conf_lazy_matching : 1;  // Lazy matching enabled
+    int16_t cached_match_index;      // Lazy matching cache
     uint8_t cached_match_size;
 #endif
 
 #if TAMP_V2_COMPRESS
-    /* V2 state */
     uint8_t rle_count;                 // Current RLE run length (max 225)
     uint8_t extended_match_count;      // Current extended match size (max ~126)
     uint16_t extended_match_position;  // Window position for extended match
 #endif
-#endif  // TAMP_ESP32
-    unsigned char input[16] /* __attribute__ ((aligned (16)))*/;
-    uint32_t bit_buffer;
 
-    unsigned char *window;
+#endif  // TAMP_ESP32
 } TampCompressor;
 
 /**

From e51f184770e339519f2b1a4c9a6d521057645044 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 14:58:37 -0500
Subject: [PATCH 044/109] update mpy_bindings with v2 flag.

---
 mpy_bindings/bindings.c             | 1 +
 mpy_bindings/bindings_compressor.py | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/mpy_bindings/bindings.c b/mpy_bindings/bindings.c
index d75d559e..2841e5cc 100644
--- a/mpy_bindings/bindings.c
+++ b/mpy_bindings/bindings.c
@@ -67,6 +67,7 @@ static mp_obj_t compressor_make_new(const mp_obj_type_t *type, size_t n_args, si
         .window = mp_obj_get_int(args_in[1]),
         .literal = mp_obj_get_int(args_in[2]),
         .use_custom_dictionary = mp_obj_get_int(args_in[4]),
+        .v2 = mp_obj_get_int(args_in[5]),
     };
 
     mp_obj_compressor_t *o = mp_obj_malloc(mp_obj_compressor_t, type);
diff --git a/mpy_bindings/bindings_compressor.py b/mpy_bindings/bindings_compressor.py
index 414ae6a4..c40b2351 100644
--- a/mpy_bindings/bindings_compressor.py
+++ b/mpy_bindings/bindings_compressor.py
@@ -9,6 +9,7 @@ def __init__(
         window=10,
         literal=8,
         dictionary=None,
+        v2=True,
     ):
         self._cf = False  # shorter name to save binary space
         if not hasattr(f, "write"):  # It's probably a path-like object.
@@ -18,7 +19,7 @@ def __init__(
         custom = dictionary is not None
         if not dictionary:
             dictionary = bytearray(1 << window)
-        self._c = _C(f, window, literal, dictionary, custom)
+        self._c = _C(f, window, literal, dictionary, custom, v2)
 
         self.write = self._c.write
 

From 961803a55da94c621b5e1f0c63f9d78ba90a4fb4 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 18:38:36 -0500
Subject: [PATCH 045/109] prevent mid-token TAMP_OUTPUT_FULL for extended-match

---
 mpy_bindings/bindings.c       |  3 ++-
 tamp/_c_compressor.pyx        |  2 +-
 tamp/_c_src/tamp/compressor.c | 14 ++++++++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/mpy_bindings/bindings.c b/mpy_bindings/bindings.c
index 2841e5cc..9f07ae39 100644
--- a/mpy_bindings/bindings.c
+++ b/mpy_bindings/bindings.c
@@ -9,7 +9,8 @@
  **********/
 
 #include "tamp/common.h"
-#define CHUNK_SIZE 32  // Must be <= 65535
+#define CHUNK_SIZE 32  // Must be >= 32 and <= 65535
+_Static_assert(CHUNK_SIZE >= 32, "CHUNK_SIZE must be >= 32 to hold flush output");
 #define mp_type_bytearray (*(mp_obj_type_t *)(mp_load_global(MP_QSTR_bytearray)))
 
 static void TAMP_CHECK(tamp_res res) {
diff --git a/tamp/_c_compressor.pyx b/tamp/_c_compressor.pyx
index 8655d085..eff25276 100644
--- a/tamp/_c_compressor.pyx
+++ b/tamp/_c_compressor.pyx
@@ -104,7 +104,7 @@ cdef class Compressor:
 
     cpdef int flush(self, bool write_token = True) except -1:
         cdef ctamp.tamp_res res
-        cdef bytearray buffer = bytearray(24)
+        cdef bytearray buffer = bytearray(32)
         cdef size_t output_written_size = 0
 
         res = ctamp.tamp_compressor_flush(
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index a92c1704..9b282a15 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -35,6 +35,12 @@ static const uint8_t huffman_bits[] = {0x2, 0x3, 0x5, 0x5, 0x6, 0x7, 0x7, 0x7, 0
 // V2: Maximum extended match extra = (13 << 3) + 7 + 1 = 112
 // Total max match = min_pattern_size + 11 + 112 = min_pattern_size + 123
 #define EXTENDED_MATCH_MAX_EXTRA 112
+
+// Minimum output buffer space required for extended match token.
+// Extended match: symbol (7 bits) + extended huffman (11 bits) + window pos (15 bits) = 33 bits.
+// With 7 bits in bit buffer, need up to 40 bits = 5 bytes. Add 1 byte margin.
+// Pre-checking prevents OUTPUT_FULL mid-token, which would corrupt bit_buffer on retry.
+#define EXTENDED_MATCH_MIN_OUTPUT_BYTES 6
 #endif
 
 static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, uint8_t n_bits) {
@@ -348,6 +354,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 
             // Check if extending would go beyond window buffer boundary (no wrap-around)
             if (current_pos + current_count >= WINDOW_SIZE) {
+                // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+                if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
                 size_t token_bytes;
                 res = write_extended_match_token(compressor, output, output_size, &token_bytes);
                 (*output_written_size) += token_bytes;
@@ -357,6 +365,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 
             // Check if we've reached max extended match size
             if (current_count >= max_ext_match) {
+                // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+                if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
                 size_t token_bytes;
                 res = write_extended_match_token(compressor, output, output_size, &token_bytes);
                 (*output_written_size) += token_bytes;
@@ -373,6 +383,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
                 // Continue to next iteration to try extending further
             } else {
                 // Match ended - emit current match
+                // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+                if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
                 size_t token_bytes;
                 res = write_extended_match_token(compressor, output, output_size, &token_bytes);
                 (*output_written_size) += token_bytes;
@@ -622,6 +634,8 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
 
     // V2: Flush any pending extended match
     if (compressor->conf_v2 && compressor->extended_match_count) {
+        // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+        if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
         res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
         (*output_written_size) += chunk_output_written_size;
         if (TAMP_UNLIKELY(res != TAMP_OK)) return res;

From 3f4666322ceaf3892071798dfd6b874e6d3abad4 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 18:43:59 -0500
Subject: [PATCH 046/109] don't inline rle/extended-match

---
 tamp/_c_src/tamp/compressor.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 9b282a15..ad9768aa 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -58,7 +58,7 @@ static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits
  * @param[in] value The value to encode.
  * @param[in] trailing_bits Number of trailing bits (3 for extended match, 4 for RLE).
  */
-static inline void write_extended_huffman(TampCompressor *compressor, uint8_t value, uint8_t trailing_bits) {
+static TAMP_NOINLINE void write_extended_huffman(TampCompressor *compressor, uint8_t value, uint8_t trailing_bits) {
     uint8_t mask = (1 << trailing_bits) - 1;
     uint8_t code_index = value >> trailing_bits;
     // Write huffman code without literal flag (subtract 1 from bit length)
@@ -239,7 +239,7 @@ static inline uint8_t get_last_window_byte(TampCompressor *compressor) {
  * @param[in,out] compressor Compressor state.
  * @param[in] count Number of repeated bytes (must be >= 2).
  */
-static void write_rle_token(TampCompressor *compressor, uint8_t count) {
+static TAMP_NOINLINE void write_rle_token(TampCompressor *compressor, uint8_t count) {
     const uint16_t window_mask = (1 << compressor->conf_window) - 1;
     uint8_t symbol = get_last_window_byte(compressor);
 
@@ -270,8 +270,8 @@ static void write_rle_token(TampCompressor *compressor, uint8_t count) {
  * @param[out] output_written_size Bytes written to output.
  * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small.
  */
-static tamp_res write_extended_match_token(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                                           size_t *output_written_size) {
+static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compressor, unsigned char *output,
+                                                         size_t output_size, size_t *output_written_size) {
     const uint16_t window_mask = (1 << compressor->conf_window) - 1;
     const uint8_t count = compressor->extended_match_count;
     const uint16_t position = compressor->extended_match_position;

From 02c596e4d8cf9d6681d3159afd287e106c2c639d Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 19:34:51 -0500
Subject: [PATCH 047/109] Add .clangd to the gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 34568b58..2bac59d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -437,6 +437,9 @@ wasm/build/
 *.swo
 *~
 
+# clangd (C/C++ language server)
+.clangd
+
 # Emacs
 *~
 \#*\#

From fe16ce9ccb55174794ff3b16a3790b6c2f3ce0eb Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 19:35:29 -0500
Subject: [PATCH 048/109] Add v2 option to wasm

---
 wasm/src/tamp.d.ts | 3 +++
 wasm/src/tamp.js   | 7 +++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/wasm/src/tamp.d.ts b/wasm/src/tamp.d.ts
index 584e2c43..b2c11fac 100644
--- a/wasm/src/tamp.d.ts
+++ b/wasm/src/tamp.d.ts
@@ -43,6 +43,8 @@ export interface TampOptions {
   literal?: number;
   /** Custom dictionary data. If null, no custom dictionary is used. If Uint8Array, uses the provided dictionary. Default: null */
   dictionary?: Uint8Array | null;
+  /** Enable v2 format (RLE, extended match) for better compression ratios. Default: true */
+  v2?: boolean;
   /** Enable lazy matching for better compression ratios. Default: false */
   lazy_matching?: boolean;
 }
@@ -68,6 +70,7 @@ export interface TampDefaults {
   readonly window: 10;
   readonly literal: 8;
   readonly dictionary: null;
+  readonly v2: true;
   readonly lazy_matching: false;
 }
 
diff --git a/wasm/src/tamp.js b/wasm/src/tamp.js
index f52788db..3de7c235 100644
--- a/wasm/src/tamp.js
+++ b/wasm/src/tamp.js
@@ -122,6 +122,7 @@ export class TampCompressor {
       window: 10,
       literal: 8,
       dictionary: null,
+      v2: true,
       lazy_matching: false,
       ...options,
     };
@@ -183,7 +184,8 @@ export class TampCompressor {
         (this.options.window & 0xf) |
         ((this.options.literal & 0xf) << 4) |
         ((this.options.dictionary ? 1 : 0) << 8) |
-        ((this.options.lazy_matching ? 1 : 0) << 9);
+        ((this.options.v2 ? 1 : 0) << 9) |
+        ((this.options.lazy_matching ? 1 : 0) << 10);
       this.module.setValue(confPtr, confValue, 'i32');
 
       // Initialize compressor
@@ -790,10 +792,11 @@ export async function compress(data, options = {}) {
   const callbackOptions = {};
 
   // Extract compression-specific options
-  const { window, literal, dictionary, lazy_matching, onPoll, signal, pollIntervalMs, pollIntervalBytes } = options;
+  const { window, literal, dictionary, v2, lazy_matching, onPoll, signal, pollIntervalMs, pollIntervalBytes } = options;
   if (window !== undefined) compressionOptions.window = window;
   if (literal !== undefined) compressionOptions.literal = literal;
   if (dictionary !== undefined) compressionOptions.dictionary = dictionary;
+  if (v2 !== undefined) compressionOptions.v2 = v2;
   if (lazy_matching !== undefined) compressionOptions.lazy_matching = lazy_matching;
 
   // Extract callback options

From 41b4b13419bd89a017f7e31b3f1b972b9f1e5ea8 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Mon, 2 Feb 2026 19:42:55 -0500
Subject: [PATCH 049/109] Add v2 to the website.

---
 website/index.html | 22 +++++++++++++++++++++-
 website/js/main.js | 14 +++++++++++---
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/website/index.html b/website/index.html
index 88474519..9fb60465 100644
--- a/website/index.html
+++ b/website/index.html
@@ -144,6 +144,16 @@ <h3 style="margin-bottom: 0.75rem; font-weight: 500; color: #333;">Configuration
                         <option value="15">15</option>
                     </select>
                 </div>
+                <div style="display: flex; align-items: center; gap: 0.5rem;">
+                    <input type="checkbox" class="option-checkbox" id="v2" checked>
+                    <label class="checkbox-label" for="v2" style="margin: 0;">V2 Format</label>
+                    <div class="help-tooltip">
+                        <span class="help-icon">?</span>
+                        <div class="tooltip-content">
+                            Enables v2 compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases.
+                        </div>
+                    </div>
+                </div>
                 <div style="display: flex; align-items: center; gap: 0.5rem;">
                     <input type="checkbox" class="option-checkbox" id="lazyMatching">
                     <label class="checkbox-label" for="lazyMatching" style="margin: 0;">Lazy Matching</label>
@@ -182,6 +192,16 @@ <h3 style="margin-bottom: 0.75rem; font-weight: 500; color: #333;">Configuration
                             <option value="15">15</option>
                         </select>
                     </div>
+                    <div style="display: flex; align-items: center; gap: 0.5rem;">
+                        <input type="checkbox" class="option-checkbox" id="textV2" checked>
+                        <label class="checkbox-label" for="textV2" style="margin: 0;">V2 Format</label>
+                        <div class="help-tooltip">
+                            <span class="help-icon">?</span>
+                            <div class="tooltip-content">
+                                Enables v2 compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases.
+                            </div>
+                        </div>
+                    </div>
                     <div style="display: flex; align-items: center; gap: 0.5rem;">
                         <input type="checkbox" class="option-checkbox" id="textLazyMatching">
                         <label class="checkbox-label" for="textLazyMatching" style="margin: 0;">Lazy Matching</label>
@@ -194,7 +214,7 @@ <h3 style="margin-bottom: 0.75rem; font-weight: 500; color: #333;">Configuration
                     </div>
                 </div>
             </div>
-            
+
             <div class="option-group">
                 <div style="display: flex; align-items: center; gap: 0.5rem; margin-bottom: 0.5rem;">
                     <button type="button" class="toggle-dictionary-btn" id="toggleDictionaryBtn">
diff --git a/website/js/main.js b/website/js/main.js
index 093d9bf9..15458ab2 100644
--- a/website/js/main.js
+++ b/website/js/main.js
@@ -23,8 +23,10 @@ let dropZone,
   compressionOptions,
   textMode,
   windowBitsSelect,
+  v2Checkbox,
   lazyMatchingCheckbox,
   textWindowBitsSelect,
+  textV2Checkbox,
   textLazyMatchingCheckbox,
   plainTextArea,
   compressedTextArea,
@@ -87,8 +89,10 @@ document.addEventListener('DOMContentLoaded', () => {
   compressionOptions = document.getElementById('compressionOptions');
   textMode = document.getElementById('textMode');
   windowBitsSelect = document.getElementById('windowBits');
+  v2Checkbox = document.getElementById('v2');
   lazyMatchingCheckbox = document.getElementById('lazyMatching');
   textWindowBitsSelect = document.getElementById('textWindowBits');
+  textV2Checkbox = document.getElementById('textV2');
   textLazyMatchingCheckbox = document.getElementById('textLazyMatching');
   plainTextArea = document.getElementById('plainText');
   compressedTextArea = document.getElementById('compressedText');
@@ -348,6 +352,7 @@ async function processFiles() {
         const windowBits = parseInt(windowBitsSelect.value);
         const options = {
           window: windowBits,
+          v2: v2Checkbox.checked,
           // Add progress callback for compression with overall progress calculation
           onPoll: async progressInfo => {
             const bytesProcessed = progressInfo.bytesProcessed || 0;
@@ -583,6 +588,7 @@ async function compressTextContent() {
   try {
     const options = {
       window: windowBits,
+      v2: textV2Checkbox.checked,
       // Add progress callback for text compression
       onPoll: async progressInfo => {
         const bytesProcessed = progressInfo.bytesProcessed || 0;
@@ -624,9 +630,11 @@ async function compressTextContent() {
     const ratio = data.length > 0 ? (data.length / compressed.length).toFixed(2) : '0';
     const savings = data.length > 0 ? ((1 - compressed.length / data.length) * 100).toFixed(1) : '0';
 
-    const configStr = `${windowBits}-bit window${isPureAscii ? ', 7-bit literals' : ''}${
-      textLazyMatchingCheckbox.checked ? ', lazy matching' : ''
-    }${dictionaryValidation.dictionaryBytes ? ', custom dictionary' : ''}`;
+    const configStr = `${windowBits}-bit window${textV2Checkbox.checked ? ', v2' : ', v1'}${
+      isPureAscii ? ', 7-bit literals' : ''
+    }${textLazyMatchingCheckbox.checked ? ', lazy matching' : ''}${
+      dictionaryValidation.dictionaryBytes ? ', custom dictionary' : ''
+    }`;
 
     const stats = [
       { label: 'Configuration', value: configStr },

From 865d051f319194023e19f7a49f91e3d51d65359b Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 09:16:39 -0500
Subject: [PATCH 050/109] rename v2 field to 'extended'

---
 .gitignore                                    |   4 +-
 Makefile                                      | 124 +++++++++---------
 .../RPI_PICO-20250415-v1.25.0.uf2.tamp        |   0
 .../dickens.tamp                              |   0
 .../enwik8.tamp                               |   0
 .../mozilla.tamp                              |   0
 .../mr.tamp                                   |   0
 .../nci.tamp                                  |   0
 .../ooffice.tamp                              |   0
 .../osdb.tamp                                 |   0
 .../reymont.tamp                              |   0
 .../samba.tamp                                |   0
 .../sao.tamp                                  |   0
 .../webster.tamp                              |   0
 .../x-ray.tamp                                |   0
 .../xml.tamp                                  |   0
 docs/source/specification.rst                 |   4 +-
 mpy_bindings/bindings.c                       |   2 +-
 mpy_bindings/bindings_compressor.py           |   4 +-
 tamp/_c_compressor.pyx                        |   4 +-
 tamp/_c_src/tamp/common.h                     |  18 +--
 tamp/_c_src/tamp/compressor.c                 |  70 +++++-----
 tamp/_c_src/tamp/compressor.h                 |  12 +-
 tamp/_c_src/tamp/decompressor.c               |  44 +++----
 tamp/_c_src/tamp/decompressor.h               |   6 +-
 tamp/cli/main.py                              |   8 +-
 tamp/compressor.py                            |  28 ++--
 tamp/ctamp.pxd                                |   2 +-
 tamp/decompressor.py                          |   4 +-
 tests/test_cli.py                             |   4 +-
 tests/test_compressor.py                      |  18 +--
 tests/test_dataset_regression.py              |  38 +++---
 wasm/src/tamp.d.ts                            |   6 +-
 wasm/src/tamp.js                              |   9 +-
 website/index.html                            |  12 +-
 website/js/main.js                            |  14 +-
 36 files changed, 218 insertions(+), 217 deletions(-)
 rename datasets/{v2-compressed => extended-compressed}/RPI_PICO-20250415-v1.25.0.uf2.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/dickens.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/enwik8.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/mozilla.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/mr.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/nci.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/ooffice.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/osdb.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/reymont.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/samba.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/sao.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/webster.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/x-ray.tamp (100%)
 rename datasets/{v2-compressed => extended-compressed}/xml.tamp (100%)

diff --git a/.gitignore b/.gitignore
index 2bac59d1..e69f8043 100644
--- a/.gitignore
+++ b/.gitignore
@@ -249,12 +249,12 @@ Temporary Items
 # Compression benchmark datasets
 datasets/*
 !datasets/v1-compressed/
-!datasets/v2-compressed/
+!datasets/extended-compressed/
 enwik8*
 *.pkl
 *.tamp
 !datasets/v1-compressed/**
-!datasets/v2-compressed/**
+!datasets/extended-compressed/**
 
 # Cython-generated files
 tamp/_c_compressor.c
diff --git a/Makefile b/Makefile
index ad7a44be..89a12d92 100644
--- a/Makefile
+++ b/Makefile
@@ -182,7 +182,7 @@ build/enwik8-100kb: download-enwik8
 	@head -c 100000 datasets/enwik8 > build/enwik8-100kb
 
 build/enwik8-100kb.tamp: build/enwik8-100kb
-	@# Use Python implementation until C compressor supports v2
+	@# Use Python implementation for extended format compression
 	@poetry run tamp compress --implementation=python build/enwik8-100kb -o build/enwik8-100kb.tamp
 
 download-micropython:
@@ -293,7 +293,7 @@ mpy-viper-size:
 	size_comp=$$(wc -c < /tmp/_tamp_comp.mpy | tr -d ' '); \
 	size_decomp=$$(wc -c < /tmp/_tamp_decomp.mpy | tr -d ' '); \
 	rm -f /tmp/_tamp_init.mpy /tmp/_tamp_comp.mpy /tmp/_tamp_decomp.mpy; \
-	printf 'Tamp (MicroPython Viper)   %d  %d  %d\n' \
+	printf '%-34s %10d %12d %25d\n' "Tamp (MicroPython Viper)" \
 		$$((size_init + size_comp)) $$((size_init + size_decomp)) $$((size_init + size_comp + size_decomp))
 
 mpy-native-size:
@@ -309,7 +309,7 @@ endif
 		rm -rf tamp.mpy build/tamp build/mpy_bindings build/tamp.native.mpy && \
 		$(MAKE) -s _mpy-build MPY_DIR=$(MPY_DIR) ARCH=armv6m TAMP_COMPRESSOR=1 TAMP_DECOMPRESSOR=1 >/dev/null 2>&1 && \
 		size_both=$$(wc -c < tamp.mpy | tr -d ' ') && \
-		printf 'Tamp (MicroPython Native)  %s  %s  %s\n' $$size_comp $$size_decomp $$size_both
+		printf '%-34s %10s %12s %25s\n' "Tamp (MicroPython Native)" $$size_comp $$size_decomp $$size_both
 
 mpy-compression-benchmark:
 	@time belay run micropython -X heapsize=300M tools/micropython-compression-benchmark.py
@@ -492,7 +492,7 @@ tamp-c-library: build/tamp.a
 # Binary Sizes
 ###############
 # Generate binary size information for README table (armv6m with -O3).
-.PHONY: binary-size c-size c-size-v1 c-size-v2
+.PHONY: binary-size c-size c-size-no-extended c-size-extended
 
 ARM_CC := arm-none-eabi-gcc
 ARM_AR := arm-none-eabi-ar
@@ -503,85 +503,85 @@ C_SRC_COMMON = tamp/_c_src/tamp/common.c
 C_SRC_COMP = tamp/_c_src/tamp/compressor.c
 C_SRC_DECOMP = tamp/_c_src/tamp/decompressor.c
 
-# V2 flags to disable v2 support
-V1_FLAGS = -DTAMP_V2_COMPRESS=0 -DTAMP_V2_DECOMPRESS=0
+# Flags to disable extended format support
+NO_EXTENDED_FLAGS = -DTAMP_EXTENDED_COMPRESS=0 -DTAMP_EXTENDED_DECOMPRESS=0
 
-c-size-v1:
+c-size-no-extended:
 	@rm -rf build/arm && mkdir -p build/arm
-	@# v1 without stream API
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
-	@$(ARM_AR) rcs build/arm/v1_comp.a build/arm/common.o build/arm/compressor.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
-	@$(ARM_AR) rcs build/arm/v1_decomp.a build/arm/common.o build/arm/decompressor.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
-	@$(ARM_AR) rcs build/arm/v1_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
-	@# v1 with stream API
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
-	@$(ARM_AR) rcs build/arm/v1_comp_s.a build/arm/common.o build/arm/compressor.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
-	@$(ARM_AR) rcs build/arm/v1_decomp_s.a build/arm/common.o build/arm/decompressor.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor.o
-	@$(ARM_CC) $(ARM_CFLAGS) $(V1_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
-	@$(ARM_AR) rcs build/arm/v1_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
-	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/v1_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/v1_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_full=$$($(ARM_SIZE) -B --totals build/arm/v1_full.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	printf 'Tamp v1 (C, no stream)     %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
-	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/v1_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/v1_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_full=$$($(ARM_SIZE) -B --totals build/arm/v1_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	printf 'Tamp v1 (C)                %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
-
-c-size-v2:
+	@# No-extended without stream API
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_AR) rcs build/arm/noext_comp.a build/arm/common.o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/noext_decomp.a build/arm/common.o build/arm/decompressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/noext_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
+	@# No-extended with stream API
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_AR) rcs build/arm/noext_comp_s.a build/arm/common.o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/noext_decomp_s.a build/arm/common.o build/arm/decompressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor.o
+	@$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/noext_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
+	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/noext_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/noext_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_full=$$($(ARM_SIZE) -B --totals build/arm/noext_full.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	printf '%-34s %10d %12d %25d\n' "Tamp (C, no extended, no stream)" $$size_comp $$size_decomp $$size_full
+	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/noext_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/noext_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_full=$$($(ARM_SIZE) -B --totals build/arm/noext_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	printf '%-34s %10d %12d %25d\n' "Tamp (C, no extended)" $$size_comp $$size_decomp $$size_full
+
+c-size-extended:
 	@rm -rf build/arm && mkdir -p build/arm
-	@# v2 without stream API
+	@# Extended without stream API
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
-	@$(ARM_AR) rcs build/arm/v2_comp.a build/arm/common.o build/arm/compressor.o
+	@$(ARM_AR) rcs build/arm/ext_comp.a build/arm/common.o build/arm/compressor.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
-	@$(ARM_AR) rcs build/arm/v2_decomp.a build/arm/common.o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/ext_decomp.a build/arm/common.o build/arm/decompressor.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
-	@$(ARM_AR) rcs build/arm/v2_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
-	@# v2 with stream API
+	@$(ARM_AR) rcs build/arm/ext_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
+	@# Extended with stream API
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor.o
-	@$(ARM_AR) rcs build/arm/v2_comp_s.a build/arm/common.o build/arm/compressor.o
+	@$(ARM_AR) rcs build/arm/ext_comp_s.a build/arm/common.o build/arm/compressor.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
-	@$(ARM_AR) rcs build/arm/v2_decomp_s.a build/arm/common.o build/arm/decompressor.o
+	@$(ARM_AR) rcs build/arm/ext_decomp_s.a build/arm/common.o build/arm/decompressor.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor.o
 	@$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o
-	@$(ARM_AR) rcs build/arm/v2_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
-	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/v2_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/v2_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_full=$$($(ARM_SIZE) -B --totals build/arm/v2_full.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	printf 'Tamp v2 (C, no stream)     %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
-	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/v2_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/v2_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	size_full=$$($(ARM_SIZE) -B --totals build/arm/v2_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
-	printf 'Tamp v2 (C)                %d  %d  %d\n' $$size_comp $$size_decomp $$size_full
-
-c-size: c-size-v1 c-size-v2
+	@$(ARM_AR) rcs build/arm/ext_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o
+	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/ext_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/ext_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_full=$$($(ARM_SIZE) -B --totals build/arm/ext_full.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	printf '%-34s %10d %12d %25d\n' "Tamp (C, extended, no stream)" $$size_comp $$size_decomp $$size_full
+	@size_comp=$$($(ARM_SIZE) -B --totals build/arm/ext_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_decomp=$$($(ARM_SIZE) -B --totals build/arm/ext_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	size_full=$$($(ARM_SIZE) -B --totals build/arm/ext_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \
+	printf '%-34s %10d %12d %25d\n' "Tamp (C, extended)" $$size_comp $$size_decomp $$size_full
+
+c-size: c-size-no-extended c-size-extended
 
 binary-size:
 	@echo "Binary sizes for armv6m (bytes):"
 	@echo ""
-	@printf '%-27s %-10s %-12s %s\n' "" "Compressor" "Decompressor" "Compressor + Decompressor"
-	@printf '%-27s %-10s %-12s %s\n' "---------------------------" "----------" "------------" "-------------------------"
-	@output=$$($(MAKE) -s mpy-viper-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Viper)   (requires mpy-cross)"
-	@output=$$($(MAKE) -s mpy-native-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Native)  (requires MPY_DIR)"
-	@output=$$($(MAKE) -s c-size 2>&1) && echo "$$output" || echo "Tamp (C)                   (requires arm-none-eabi-gcc)"
+	@printf '%-34s %10s %12s %25s\n' "" "Compressor" "Decompressor" "Compressor + Decompressor"
+	@printf '%-34s %10s %12s %25s\n' "----------------------------------" "----------" "------------" "-------------------------"
+	@output=$$($(MAKE) -s mpy-viper-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Viper)           (requires mpy-cross)"
+	@output=$$($(MAKE) -s mpy-native-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Native)          (requires MPY_DIR)"
+	@output=$$($(MAKE) -s c-size 2>&1) && echo "$$output" || echo "Tamp (C)                           (requires arm-none-eabi-gcc)"
 
 
 ##########
diff --git a/datasets/v2-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp b/datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp
similarity index 100%
rename from datasets/v2-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp
rename to datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp
diff --git a/datasets/v2-compressed/dickens.tamp b/datasets/extended-compressed/dickens.tamp
similarity index 100%
rename from datasets/v2-compressed/dickens.tamp
rename to datasets/extended-compressed/dickens.tamp
diff --git a/datasets/v2-compressed/enwik8.tamp b/datasets/extended-compressed/enwik8.tamp
similarity index 100%
rename from datasets/v2-compressed/enwik8.tamp
rename to datasets/extended-compressed/enwik8.tamp
diff --git a/datasets/v2-compressed/mozilla.tamp b/datasets/extended-compressed/mozilla.tamp
similarity index 100%
rename from datasets/v2-compressed/mozilla.tamp
rename to datasets/extended-compressed/mozilla.tamp
diff --git a/datasets/v2-compressed/mr.tamp b/datasets/extended-compressed/mr.tamp
similarity index 100%
rename from datasets/v2-compressed/mr.tamp
rename to datasets/extended-compressed/mr.tamp
diff --git a/datasets/v2-compressed/nci.tamp b/datasets/extended-compressed/nci.tamp
similarity index 100%
rename from datasets/v2-compressed/nci.tamp
rename to datasets/extended-compressed/nci.tamp
diff --git a/datasets/v2-compressed/ooffice.tamp b/datasets/extended-compressed/ooffice.tamp
similarity index 100%
rename from datasets/v2-compressed/ooffice.tamp
rename to datasets/extended-compressed/ooffice.tamp
diff --git a/datasets/v2-compressed/osdb.tamp b/datasets/extended-compressed/osdb.tamp
similarity index 100%
rename from datasets/v2-compressed/osdb.tamp
rename to datasets/extended-compressed/osdb.tamp
diff --git a/datasets/v2-compressed/reymont.tamp b/datasets/extended-compressed/reymont.tamp
similarity index 100%
rename from datasets/v2-compressed/reymont.tamp
rename to datasets/extended-compressed/reymont.tamp
diff --git a/datasets/v2-compressed/samba.tamp b/datasets/extended-compressed/samba.tamp
similarity index 100%
rename from datasets/v2-compressed/samba.tamp
rename to datasets/extended-compressed/samba.tamp
diff --git a/datasets/v2-compressed/sao.tamp b/datasets/extended-compressed/sao.tamp
similarity index 100%
rename from datasets/v2-compressed/sao.tamp
rename to datasets/extended-compressed/sao.tamp
diff --git a/datasets/v2-compressed/webster.tamp b/datasets/extended-compressed/webster.tamp
similarity index 100%
rename from datasets/v2-compressed/webster.tamp
rename to datasets/extended-compressed/webster.tamp
diff --git a/datasets/v2-compressed/x-ray.tamp b/datasets/extended-compressed/x-ray.tamp
similarity index 100%
rename from datasets/v2-compressed/x-ray.tamp
rename to datasets/extended-compressed/x-ray.tamp
diff --git a/datasets/v2-compressed/xml.tamp b/datasets/extended-compressed/xml.tamp
similarity index 100%
rename from datasets/v2-compressed/xml.tamp
rename to datasets/extended-compressed/xml.tamp
diff --git a/docs/source/specification.rst b/docs/source/specification.rst
index 89a880a5..b5fc97a7 100644
--- a/docs/source/specification.rst
+++ b/docs/source/specification.rst
@@ -26,8 +26,8 @@ The bit-location 0 is equivalent to typical MSb position 7 of the first byte.
 | [2]     | custom_dictionary | A custom dictionary initialization method was used                  |
 |         |                   | and must be provided at decompression.                              |
 +---------+-------------------+---------------------------------------------------------------------+
-| [1]     | v2                | Enables Tamp v2 features. Generally improves compression, but the   |
-|         |                   | decompressor needs to support it (introduced in tamp v1.11.0).      |
+| [1]     | extended          | Enables extended format features (RLE, extended match encoding).    |
+|         |                   | Generally improves compression. Introduced in tamp v2.0.0.          |
 +---------+-------------------+---------------------------------------------------------------------+
 | [0]     | more_header       | If ``True``, then the next byte in the stream is more header data.  |
 |         |                   | Currently always ``False``, but allows for future expandability.    |
diff --git a/mpy_bindings/bindings.c b/mpy_bindings/bindings.c
index 9f07ae39..125f31c5 100644
--- a/mpy_bindings/bindings.c
+++ b/mpy_bindings/bindings.c
@@ -68,7 +68,7 @@ static mp_obj_t compressor_make_new(const mp_obj_type_t *type, size_t n_args, si
         .window = mp_obj_get_int(args_in[1]),
         .literal = mp_obj_get_int(args_in[2]),
         .use_custom_dictionary = mp_obj_get_int(args_in[4]),
-        .v2 = mp_obj_get_int(args_in[5]),
+        .extended = mp_obj_get_int(args_in[5]),
     };
 
     mp_obj_compressor_t *o = mp_obj_malloc(mp_obj_compressor_t, type);
diff --git a/mpy_bindings/bindings_compressor.py b/mpy_bindings/bindings_compressor.py
index c40b2351..9fcbb818 100644
--- a/mpy_bindings/bindings_compressor.py
+++ b/mpy_bindings/bindings_compressor.py
@@ -9,7 +9,7 @@ def __init__(
         window=10,
         literal=8,
         dictionary=None,
-        v2=True,
+        extended=True,
     ):
         self._cf = False  # shorter name to save binary space
         if not hasattr(f, "write"):  # It's probably a path-like object.
@@ -19,7 +19,7 @@ def __init__(
         custom = dictionary is not None
         if not dictionary:
             dictionary = bytearray(1 << window)
-        self._c = _C(f, window, literal, dictionary, custom, v2)
+        self._c = _C(f, window, literal, dictionary, custom, extended)
 
         self.write = self._c.write
 
diff --git a/tamp/_c_compressor.pyx b/tamp/_c_compressor.pyx
index eff25276..9c19f0b3 100644
--- a/tamp/_c_compressor.pyx
+++ b/tamp/_c_compressor.pyx
@@ -35,7 +35,7 @@ cdef class Compressor:
         int literal=8,
         dictionary=None,
         bool lazy_matching=False,
-        bool v2=True,
+        bool extended=True,
     ):
         cdef ctamp.TampConf conf
 
@@ -56,7 +56,7 @@ cdef class Compressor:
         # Set lazy_matching - this field is conditionally compiled based on TAMP_LAZY_MATCHING
         # The build system defines this macro, so the field should be available
         conf.lazy_matching = lazy_matching
-        conf.v2 = v2
+        conf.extended = extended
 
         self._window_buffer = dictionary if dictionary else bytearray(1 << window)
         self._window_buffer_ptr = <unsigned char *>self._window_buffer
diff --git a/tamp/_c_src/tamp/common.h b/tamp/_c_src/tamp/common.h
index 089b7769..48acdd7f 100644
--- a/tamp/_c_src/tamp/common.h
+++ b/tamp/_c_src/tamp/common.h
@@ -68,19 +68,19 @@ extern "C" {
 #define TAMP_STREAM_WORK_BUFFER_SIZE 32
 #endif
 
-/* V2 format support (RLE, extended match).
+/* Extended format support (RLE, extended match).
  * Enabled by default. Disable to save code size on minimal builds.
- * Separate flags allow decompressor-only or compressor-only v2 support.
+ * Separate flags allow decompressor-only or compressor-only extended support.
  */
-#ifndef TAMP_V2_DECOMPRESS
-#define TAMP_V2_DECOMPRESS 1
+#ifndef TAMP_EXTENDED_DECOMPRESS
+#define TAMP_EXTENDED_DECOMPRESS 1
 #endif
-#ifndef TAMP_V2_COMPRESS
-#define TAMP_V2_COMPRESS 1
+#ifndef TAMP_EXTENDED_COMPRESS
+#define TAMP_EXTENDED_COMPRESS 1
 #endif
 
-/* V2 encoding constants */
-#if TAMP_V2_DECOMPRESS || TAMP_V2_COMPRESS
+/* Extended encoding constants */
+#if TAMP_EXTENDED_DECOMPRESS || TAMP_EXTENDED_COMPRESS
 #define TAMP_RLE_SYMBOL 12
 #define TAMP_EXTENDED_MATCH_SYMBOL 13
 #define TAMP_LEADING_EXTENDED_MATCH_BITS 3
@@ -113,7 +113,7 @@ typedef struct TampConf {
     uint16_t window : 4;                 // number of window bits
     uint16_t literal : 4;                // number of literal bits
     uint16_t use_custom_dictionary : 1;  // Use a custom initialized dictionary.
-    uint16_t v2 : 1;                     // v2 format (RLE, extended match). Read from header bit [1].
+    uint16_t extended : 1;               // Extended format (RLE, extended match). Read from header bit [1].
 #if TAMP_LAZY_MATCHING
     uint16_t lazy_matching : 1;  // use Lazy Matching (spend 50-75% more CPU for around 0.5-2.0% better compression.)
                                  // only effects compression operations.
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index ad9768aa..ecfbe4db 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -9,10 +9,10 @@
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))
 
-#if TAMP_V2_COMPRESS
-// V2 max pattern: min_pattern_size + 11 + 112 = min_pattern_size + 123
-#define MAX_PATTERN_SIZE_V2 (compressor->min_pattern_size + 123)
-#define MAX_PATTERN_SIZE (compressor->conf_v2 ? MAX_PATTERN_SIZE_V2 : (compressor->min_pattern_size + 13))
+#if TAMP_EXTENDED_COMPRESS
+// Extended max pattern: min_pattern_size + 11 + 112 = min_pattern_size + 123
+#define MAX_PATTERN_SIZE_EXTENDED (compressor->min_pattern_size + 123)
+#define MAX_PATTERN_SIZE (compressor->conf_extended ? MAX_PATTERN_SIZE_EXTENDED : (compressor->min_pattern_size + 13))
 #else
 #define MAX_PATTERN_SIZE (compressor->min_pattern_size + 13)
 #endif
@@ -29,10 +29,10 @@ static const uint8_t huffman_codes[] = {0x0, 0x3, 0x8, 0xb, 0x14, 0x24, 0x26, 0x
 // These bit lengths pre-add the 1 bit for the 0-value is_literal flag.
 static const uint8_t huffman_bits[] = {0x2, 0x3, 0x5, 0x5, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0x7};
 
-#if TAMP_V2_COMPRESS
-// V2: Maximum RLE count = (13 << 4) + 15 + 2 = 225
+#if TAMP_EXTENDED_COMPRESS
+// Extended: Maximum RLE count = (13 << 4) + 15 + 2 = 225
 #define RLE_MAX_COUNT 225
-// V2: Maximum extended match extra = (13 << 3) + 7 + 1 = 112
+// Extended: Maximum extended match extra = (13 << 3) + 7 + 1 = 112
 // Total max match = min_pattern_size + 11 + 112 = min_pattern_size + 123
 #define EXTENDED_MATCH_MAX_EXTRA 112
 
@@ -48,7 +48,7 @@ static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits
     compressor->bit_buffer |= bits << (32 - compressor->bit_buffer_pos);
 }
 
-#if TAMP_V2_COMPRESS
+#if TAMP_EXTENDED_COMPRESS
 /**
  * @brief Write extended huffman encoding (huffman + trailing bits).
  *
@@ -66,7 +66,7 @@ static TAMP_NOINLINE void write_extended_huffman(TampCompressor *compressor, uin
     write_to_bit_buffer(compressor, value & mask, trailing_bits);
 }
 
-#endif  // TAMP_V2_COMPRESS
+#endif  // TAMP_EXTENDED_COMPRESS
 
 /**
  * @brief Partially flush the internal bit buffer.
@@ -183,15 +183,15 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
 #if TAMP_LAZY_MATCHING
         .lazy_matching = false,
 #endif
-#if TAMP_V2_COMPRESS
-        .v2 = true,  // Default to v2 format
+#if TAMP_EXTENDED_COMPRESS
+        .extended = true,  // Default to extended format
 #endif
     };
     if (!conf) conf = &conf_default;
     if (conf->window < 8 || conf->window > 15) return TAMP_INVALID_CONF;
     if (conf->literal < 5 || conf->literal > 8) return TAMP_INVALID_CONF;
-#if !TAMP_V2_COMPRESS
-    if (conf->v2) return TAMP_INVALID_CONF;  // V2 requested but not compiled in
+#if !TAMP_EXTENDED_COMPRESS
+    if (conf->extended) return TAMP_INVALID_CONF;  // Extended requested but not compiled in
 #endif
 
     for (uint8_t i = 0; i < sizeof(TampCompressor); i++)  // Zero-out the struct
@@ -200,7 +200,7 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     compressor->conf_literal = conf->literal;
     compressor->conf_window = conf->window;
     compressor->conf_use_custom_dictionary = conf->use_custom_dictionary;
-    compressor->conf_v2 = conf->v2;
+    compressor->conf_extended = conf->extended;
 #if TAMP_LAZY_MATCHING
     compressor->conf_lazy_matching = conf->lazy_matching;
 #endif
@@ -218,13 +218,13 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     write_to_bit_buffer(compressor, compressor->conf_window - 8, 3);
     write_to_bit_buffer(compressor, compressor->conf_literal - 5, 2);
     write_to_bit_buffer(compressor, compressor->conf_use_custom_dictionary, 1);
-    write_to_bit_buffer(compressor, compressor->conf_v2, 1);
+    write_to_bit_buffer(compressor, compressor->conf_extended, 1);
     write_to_bit_buffer(compressor, 0, 1);  // No more header bytes
 
     return TAMP_OK;
 }
 
-#if TAMP_V2_COMPRESS
+#if TAMP_EXTENDED_COMPRESS
 /**
  * @brief Get the last byte written to the window.
  */
@@ -316,7 +316,7 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compres
 
     return TAMP_OK;
 }
-#endif  // TAMP_V2_COMPRESS
+#endif  // TAMP_EXTENDED_COMPRESS
 
 TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned char *output, size_t output_size,
                                             size_t *output_written_size) {
@@ -341,9 +341,9 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 
     if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL;
 
-#if TAMP_V2_COMPRESS
-    // V2: Handle extended match continuation
-    if (TAMP_UNLIKELY(compressor->conf_v2 && compressor->extended_match_count)) {
+#if TAMP_EXTENDED_COMPRESS
+    // Extended: Handle extended match continuation
+    if (TAMP_UNLIKELY(compressor->conf_extended && compressor->extended_match_count)) {
         // We're in extended match mode - try to extend the match at the current position
         const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
         const unsigned char *window = compressor->window;
@@ -395,15 +395,15 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
         // Ran out of input while extending - return and wait for more
         return TAMP_OK;
     }
-#endif  // TAMP_V2_COMPRESS
+#endif  // TAMP_EXTENDED_COMPRESS
 
     uint8_t match_size = 0;
     uint16_t match_index = 0;
 
-#if TAMP_V2_COMPRESS
-    // V2: Handle RLE accumulation with persistent state
+#if TAMP_EXTENDED_COMPRESS
+    // Extended: Handle RLE accumulation with persistent state
     // For simplicity in C, we commit RLE immediately when the run ends
-    if (TAMP_UNLIKELY(compressor->conf_v2)) {
+    if (TAMP_UNLIKELY(compressor->conf_extended)) {
         uint8_t last_byte = get_last_window_byte(compressor);
 
         // Count and CONSUME matching bytes
@@ -435,7 +435,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
             compressor->rle_count = 0;
         }
     }
-#endif  // TAMP_V2_COMPRESS
+#endif  // TAMP_EXTENDED_COMPRESS
 
 #if TAMP_LAZY_MATCHING
     if (compressor->conf_lazy_matching) {
@@ -497,9 +497,9 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
         }
         write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1);
     } else {
-#if TAMP_V2_COMPRESS
-        // V2: Check for extended match
-        if (compressor->conf_v2 && match_size > compressor->min_pattern_size + 11) {
+#if TAMP_EXTENDED_COMPRESS
+        // Extended: Check for extended match
+        if (compressor->conf_extended && match_size > compressor->min_pattern_size + 11) {
             compressor->extended_match_count = match_size;
             compressor->extended_match_position = match_index;
             // Consume matched bytes from input
@@ -507,7 +507,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
             compressor->input_size -= match_size;
             return TAMP_OK;
         }
-#endif
+#endif  // TAMP_EXTENDED_COMPRESS
         // Write TOKEN
         uint8_t huffman_index = match_size - compressor->min_pattern_size;
         write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]);
@@ -599,9 +599,9 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
         output += chunk_output_written_size;
     }
 
-#if TAMP_V2_COMPRESS
-    // V2: Flush any pending RLE
-    if (compressor->conf_v2 && compressor->rle_count >= 1) {
+#if TAMP_EXTENDED_COMPRESS
+    // Extended: Flush any pending RLE
+    if (compressor->conf_extended && compressor->rle_count >= 1) {
         // Partial flush first to make room
         res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
         (*output_written_size) += chunk_output_written_size;
@@ -632,8 +632,8 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
         output += chunk_output_written_size;
     }
 
-    // V2: Flush any pending extended match
-    if (compressor->conf_v2 && compressor->extended_match_count) {
+    // Extended: Flush any pending extended match
+    if (compressor->conf_extended && compressor->extended_match_count) {
         // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
         if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
         res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
@@ -642,7 +642,7 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
         output_size -= chunk_output_written_size;
         output += chunk_output_written_size;
     }
-#endif
+#endif  // TAMP_EXTENDED_COMPRESS
 
     // Perform partial flush to see if we need a FLUSH token (check if output buffer in not empty),
     // and to subsequently make room for the FLUSH token.
diff --git a/tamp/_c_src/tamp/compressor.h b/tamp/_c_src/tamp/compressor.h
index 361bf112..ff2f1663 100644
--- a/tamp/_c_src/tamp/compressor.h
+++ b/tamp/_c_src/tamp/compressor.h
@@ -26,15 +26,15 @@ typedef struct TampCompressor {
     uint8_t conf_window;                 // Window bits (4 bits used; 8-15)
     uint8_t conf_literal;                // Literal bits (4 bits used; 5-8)
     uint8_t conf_use_custom_dictionary;  // Custom dictionary (1 bit used; init only)
-    uint8_t conf_v2;                     // V2 format enabled (1 bit used)
+    uint8_t conf_extended;               // Extended format enabled (1 bit used)
 #if TAMP_LAZY_MATCHING
     uint8_t conf_lazy_matching;  // Lazy matching enabled (1 bit used)
     int16_t cached_match_index;  // Lazy matching cache
     uint8_t cached_match_size;
 #endif
 
-#if TAMP_V2_COMPRESS
-    /* V2 state (only needed when V2 compression is enabled) */
+#if TAMP_EXTENDED_COMPRESS
+    /* Extended state (only needed when extended compression is enabled) */
     uint8_t rle_count;                 // Current RLE run length (max 225)
     uint8_t extended_match_count;      // Current extended match size (max ~126)
     uint16_t extended_match_position;  // Window position for extended match
@@ -53,20 +53,20 @@ typedef struct TampCompressor {
 
     /* WARM: read frequently, often cached in locals.
      * Bitfields: min_pattern_size(2) + conf_window(4) + conf_literal(4) +
-     *            conf_use_custom_dictionary(1) + conf_v2(1) + conf_lazy_matching(1) = 13 bits
+     *            conf_use_custom_dictionary(1) + conf_extended(1) + conf_lazy_matching(1) = 13 bits
      */
     uint8_t min_pattern_size : 2;            // Minimum pattern size (2 or 3)
     uint8_t conf_window : 4;                 // Window bits (8-15)
     uint8_t conf_literal : 4;                // Literal bits (5-8)
     uint8_t conf_use_custom_dictionary : 1;  // Custom dictionary (init only)
-    uint8_t conf_v2 : 1;                     // V2 format enabled
+    uint8_t conf_extended : 1;               // Extended format enabled
 #if TAMP_LAZY_MATCHING
     uint8_t conf_lazy_matching : 1;  // Lazy matching enabled
     int16_t cached_match_index;      // Lazy matching cache
     uint8_t cached_match_size;
 #endif
 
-#if TAMP_V2_COMPRESS
+#if TAMP_EXTENDED_COMPRESS
     uint8_t rle_count;                 // Current RLE run length (max 225)
     uint8_t extended_match_count;      // Current extended match size (max ~126)
     uint16_t extended_match_position;  // Window position for extended match
diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 00169998..a17cb1d3 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -7,8 +7,8 @@
 
 #define FLUSH 15
 
-#if TAMP_V2_DECOMPRESS
-/* Token state for v2 decode suspend/resume (2 bits).
+#if TAMP_EXTENDED_DECOMPRESS
+/* Token state for extended decode suspend/resume (2 bits).
  * TOKEN_RLE and TOKEN_EXT_MATCH_FRESH are arranged so that:
  *     token_state = match_size - (TAMP_RLE_SYMBOL - 1)
  * maps TAMP_RLE_SYMBOL (12) -> 1 and TAMP_EXTENDED_MATCH_SYMBOL (13) -> 2.
@@ -127,7 +127,7 @@ static tamp_res decode_huffman(uint32_t* bit_buffer, uint8_t* bit_buffer_pos, ui
     return TAMP_OK;
 }
 
-#if TAMP_V2_DECOMPRESS
+#if TAMP_EXTENDED_DECOMPRESS
 
 /**
  * @brief Decode RLE token and write repeated bytes to output.
@@ -298,7 +298,7 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
 
     return (d->token_state == TOKEN_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL;
 }
-#endif /* TAMP_V2_DECOMPRESS */
+#endif /* TAMP_EXTENDED_DECOMPRESS */
 
 tamp_res tamp_decompressor_read_header(TampConf* conf, const unsigned char* input, size_t input_size,
                                        size_t* input_consumed_size) {
@@ -310,7 +310,7 @@ tamp_res tamp_decompressor_read_header(TampConf* conf, const unsigned char* inpu
     conf->window = ((input[0] >> 5) & 0x7) + 8;
     conf->literal = ((input[0] >> 3) & 0x3) + 5;
     conf->use_custom_dictionary = ((input[0] >> 2) & 0x1);
-    conf->v2 = ((input[0] >> 1) & 0x1);
+    conf->extended = ((input[0] >> 1) & 0x1);
 
     return TAMP_OK;
 }
@@ -322,7 +322,7 @@ tamp_res tamp_decompressor_read_header(TampConf* conf, const unsigned char* inpu
  */
 static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor* decompressor, uint8_t conf_window,
                                                      uint8_t conf_literal, uint8_t conf_use_custom_dictionary,
-                                                     uint8_t conf_v2) {
+                                                     uint8_t conf_extended) {
     if (conf_window < 8 || conf_window > 15) return TAMP_INVALID_CONF;
     if (conf_literal < 5 || conf_literal > 8) return TAMP_INVALID_CONF;
     if (conf_window > decompressor->window_bits_max) return TAMP_INVALID_CONF;
@@ -332,9 +332,9 @@ static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor* decompres
     decompressor->conf_literal = conf_literal;
     decompressor->min_pattern_size = tamp_compute_min_pattern_size(conf_window, conf_literal);
     decompressor->configured = true;
-    decompressor->conf_v2 = conf_v2;
-#if !TAMP_V2_DECOMPRESS
-    if (conf_v2) return TAMP_INVALID_CONF;  // v2 stream but v2 support not compiled in
+    decompressor->conf_extended = conf_extended;
+#if !TAMP_EXTENDED_DECOMPRESS
+    if (conf_extended) return TAMP_INVALID_CONF;  // Extended stream but extended support not compiled in
 #endif
 
     return TAMP_OK;
@@ -353,7 +353,7 @@ tamp_res tamp_decompressor_init(TampDecompressor* decompressor, const TampConf*
     decompressor->window_bits_max = window_bits;
     if (conf) {
         res = tamp_decompressor_populate_from_conf(decompressor, conf->window, conf->literal,
-                                                   conf->use_custom_dictionary, conf->v2);
+                                                   conf->use_custom_dictionary, conf->extended);
     }
 
     return res;
@@ -397,7 +397,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
         if (res != TAMP_OK) return res;
 
         res = tamp_decompressor_populate_from_conf(decompressor, conf.window, conf.literal, conf.use_custom_dictionary,
-                                                   conf.v2);
+                                                   conf.extended);
         if (res != TAMP_OK) return res;
 
         input += header_consumed_size;
@@ -410,8 +410,8 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
     const uint8_t min_pattern_size = decompressor->min_pattern_size;
 
     const uint16_t window_mask = (1 << conf_window) - 1;
-#if TAMP_V2_DECOMPRESS
-    const bool v2_enabled = decompressor->conf_v2;
+#if TAMP_EXTENDED_DECOMPRESS
+    const bool extended_enabled = decompressor->conf_extended;
 #endif
 
     while (input != input_end || decompressor->pos_and_state) {
@@ -420,10 +420,10 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
         // Populate the bit buffer
         refill_bit_buffer(decompressor, &input, input_end, input_consumed_size);
 
-#if TAMP_V2_DECOMPRESS
-        /* Handle v2 tokens - either resuming or fresh from match_size detection below. */
+#if TAMP_EXTENDED_DECOMPRESS
+        /* Handle extended tokens - either resuming or fresh from match_size detection below. */
         if (TAMP_UNLIKELY(decompressor->token_state)) {
-        v2_dispatch:
+        extended_dispatch:
             if (decompressor->token_state == TOKEN_RLE) {
                 res = decode_rle(decompressor, &output, output_end, output_written_size);
             } else {
@@ -442,7 +442,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
             if (res != TAMP_OK) return res;
             continue;
         }
-#endif
+#endif  // TAMP_EXTENDED_DECOMPRESS
 
         if (TAMP_UNLIKELY(decompressor->bit_buffer_pos == 0)) return TAMP_INPUT_EXHAUSTED;
 
@@ -489,16 +489,16 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
                 continue;
             }
 
-#if TAMP_V2_DECOMPRESS
-            /* Check for v2 symbols (RLE=12, extended match=13).
+#if TAMP_EXTENDED_DECOMPRESS
+            /* Check for extended symbols (RLE=12, extended match=13).
              * Convert match_size to token_state via subtraction (see TOKEN_* defines). */
-            if (TAMP_UNLIKELY(v2_enabled && match_size >= TAMP_RLE_SYMBOL)) {
+            if (TAMP_UNLIKELY(extended_enabled && match_size >= TAMP_RLE_SYMBOL)) {
                 decompressor->bit_buffer = bit_buffer;
                 decompressor->bit_buffer_pos = bit_buffer_pos;
                 decompressor->token_state = match_size - (TAMP_RLE_SYMBOL - 1);
-                goto v2_dispatch;
+                goto extended_dispatch;
             }
-#endif
+#endif  // TAMP_EXTENDED_DECOMPRESS
 
             if (TAMP_UNLIKELY(bit_buffer_pos < conf_window)) {
                 // There are not enough bits to decode window offset
diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h
index 7f1b5843..96598175 100644
--- a/tamp/_c_src/tamp/decompressor.h
+++ b/tamp/_c_src/tamp/decompressor.h
@@ -17,7 +17,7 @@ typedef struct {
     uint16_t window_pos;    // Current position in window (15 bits)
 
     /* Union allows single zero-check in main loop instead of two separate checks. */
-#if TAMP_V2_DECOMPRESS
+#if TAMP_EXTENDED_DECOMPRESS
     union {
         struct {
             uint8_t bit_buffer_pos;  // Bits currently in bit_buffer (6 bits needed)
@@ -31,7 +31,7 @@ typedef struct {
         uint8_t pos_and_state;   // Alias for consistent access in main loop
     };
 #endif
-#if TAMP_V2_DECOMPRESS
+#if TAMP_EXTENDED_DECOMPRESS
     uint16_t pending_window_offset;  // Saved window_offset for extended match output-full resume
     uint16_t pending_match_size;     // Saved match_size for extended match resume
 #endif
@@ -40,7 +40,7 @@ typedef struct {
     uint8_t conf_window : 4;       // Window bits from config
     uint8_t conf_literal : 4;      // Literal bits from config
     uint8_t min_pattern_size : 2;  // Minimum pattern size, 2 or 3
-    uint8_t conf_v2 : 1;           // v2 format enabled (from header)
+    uint8_t conf_extended : 1;     // Extended format enabled (from header)
 
     /* COLD: rarely accessed (init or edge cases).
      * Bitfields save space; add new cold fields here. */
diff --git a/tamp/cli/main.py b/tamp/cli/main.py
index e2054b6f..11f70622 100644
--- a/tamp/cli/main.py
+++ b/tamp/cli/main.py
@@ -119,7 +119,7 @@ def compress(
         ),
     ] = 8,
     lazy_matching: bool = False,
-    v1: bool = False,
+    extended: bool = True,
     implementation: ImplementationType = None,
 ):
     """Compress an input file or stream.
@@ -136,8 +136,8 @@ def compress(
         Number of bits used to represent a literal.
     lazy_matching: bool
         Use roughly 50% more cpu to get 0~2% better compression.
-    v1: bool
-        Use version 1 compression format.
+    extended: bool
+        Use extended compression format (RLE, extended match encoding).
     implementation: Optional[Literal["c", "python"]]
         Explicitly specify which implementation to use (c or python). Defaults to auto-detection.
     """
@@ -148,7 +148,7 @@ def compress(
         window=window,
         literal=literal,
         lazy_matching=lazy_matching,
-        v2=not v1,
+        extended=extended,
     )
     write(output, output_bytes)
 
diff --git a/tamp/compressor.py b/tamp/compressor.py
index b5e7c392..2451aa75 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -143,7 +143,7 @@ def __init__(
         literal: int = 8,
         dictionary: Optional[bytearray] = None,
         lazy_matching: bool = False,
-        v2: bool = True,
+        extended: bool = True,
     ):
         """
         Parameters
@@ -174,7 +174,7 @@ def __init__(
         self.window_bits = window
         self.literal_bits = literal
         self.min_pattern_size = compute_min_pattern_size(window, literal)
-        self.v2: bool = v2
+        self.extended: bool = extended
 
         self._rle_count = 0
 
@@ -198,7 +198,7 @@ def __init__(
         if dictionary and bit_size(len(dictionary) - 1) != window:
             raise ValueError("Dictionary-window size mismatch.")
 
-        if self.v2:
+        if self.extended:
             self.max_pattern_size = (
                 self.min_pattern_size
                 + 11
@@ -230,7 +230,7 @@ def __init__(
         self._bit_writer.write(window - 8, 3, flush=False)
         self._bit_writer.write(literal - 5, 2, flush=False)
         self._bit_writer.write(bool(dictionary), 1, flush=False)
-        self._bit_writer.write(self.v2, 1, flush=False)
+        self._bit_writer.write(self.extended, 1, flush=False)
         self._bit_writer.write(0, 1, flush=False)  # No other header bytes
 
     def _validate_no_match_overlap(self, write_pos, match_index, match_size):
@@ -275,7 +275,7 @@ def _compress_input_buffer_single(self) -> int:
         # Accumulate RLE count across compression cycles for better compression of long runs
         have_match_from_rle = False  # Track if we already did pattern matching in RLE section
 
-        if self.v2:
+        if self.extended:
             last_byte = self._window_buffer.last_written_byte
 
             # Count additional matching bytes in current buffer
@@ -326,7 +326,7 @@ def _compress_input_buffer_single(self) -> int:
 
                     # Write the pattern match immediately and return
                     # (Don't continue to normal flow which would try to consume bytes again)
-                    if self.v2 and match_size > (self.min_pattern_size + 11):
+                    if self.extended and match_size > (self.min_pattern_size + 11):
                         self._extended_match_position = search_i
                         self._extended_match_count = match_size
                         bytes_written += self._write_extended_match()
@@ -382,7 +382,7 @@ def _compress_input_buffer_single(self) -> int:
                 return bytes_written
 
         if match_size >= self.min_pattern_size:
-            if self.v2 and match_size > (self.min_pattern_size + 11):
+            if self.extended and match_size > (self.min_pattern_size + 11):
                 # Protects +12 to be RLE symbol, and +13 to be extended match symbol
                 self._extended_match_position = search_i
                 self._extended_match_count = match_size
@@ -550,9 +550,9 @@ def flush(self, write_token: bool = True) -> int:
             self.flush_cb()
         while self._input_buffer:
             bytes_written += self._compress_input_buffer_single()
-        if self.v2 and self._rle_count:
+        if self.extended and self._rle_count:
             bytes_written += self._write_rle()
-        if self.v2 and self._extended_match_count:
+        if self.extended and self._extended_match_count:
             bytes_written += self._write_extended_match()
 
         # Clear any cached lazy matching state
@@ -606,7 +606,7 @@ def compress(
     literal: int = 8,
     dictionary: Optional[bytearray] = None,
     lazy_matching: bool = False,
-    v2: bool = True,
+    extended: bool = True,
 ) -> bytes:
     """Single-call to compress data.
 
@@ -633,8 +633,8 @@ def compress(
         first be initialized with :func:`~tamp.initialize_dictionary`
     lazy_matching: bool
         Use roughly 50% more cpu to get 0~2% better compression.
-    v2: bool
-        Use v2 compression format. Defaults to True.
+    extended: bool
+        Use extended compression format. Defaults to True.
 
     Returns
     -------
@@ -649,7 +649,7 @@ def compress(
                 literal=literal,
                 dictionary=dictionary,
                 lazy_matching=lazy_matching,
-                v2=v2,
+                extended=extended,
             )
             c.write(data)
         else:
@@ -659,7 +659,7 @@ def compress(
                 literal=literal,
                 dictionary=dictionary,
                 lazy_matching=lazy_matching,
-                v2=v2,
+                extended=extended,
             )
             c.write(data)
         c.flush(write_token=False)
diff --git a/tamp/ctamp.pxd b/tamp/ctamp.pxd
index cd563ee1..3a703080 100644
--- a/tamp/ctamp.pxd
+++ b/tamp/ctamp.pxd
@@ -6,7 +6,7 @@ cdef extern from "tamp/common.h":
         int window
         int literal
         bool use_custom_dictionary
-        bool v2  # v2 format (RLE, extended match). Read from header bit [1].
+        bool extended  # Extended format (RLE, extended match). Read from header bit [1].
         # The lazy_matching field is conditionally compiled based on TAMP_LAZY_MATCHING
         # We declare it here, but accessing it when the macro is disabled will cause compile errors
         # This is handled in the Cython code by always setting it when the struct is initialized
diff --git a/tamp/decompressor.py b/tamp/decompressor.py
index a61ad5d5..6216b3e1 100644
--- a/tamp/decompressor.py
+++ b/tamp/decompressor.py
@@ -179,7 +179,7 @@ def __init__(self, f, *, dictionary: Optional[bytearray] = None):
         self.window_bits = self._bit_reader.read(3) + 8
         self.literal_bits = self._bit_reader.read(2) + 5
         uses_custom_dictionary = self._bit_reader.read(1)
-        self.v2 = self._bit_reader.read(1)
+        self.extended = self._bit_reader.read(1)
         more_header_bytes = self._bit_reader.read(1)
 
         if more_header_bytes:
@@ -246,7 +246,7 @@ def write_to_output(string):
                         if match_size is _FLUSH:
                             self._bit_reader.clear()
                             continue
-                        if self.v2 and match_size > 11:
+                        if self.extended and match_size > 11:
                             if match_size == _RLE_SYMBOL:
                                 rle_count = self._bit_reader.read_huffman()
                                 rle_count <<= _LEADING_RLE_HUFFMAN_BITS
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 428431e5..b6b79baa 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -51,7 +51,7 @@ def test_compress_file_to_stdout(self):
             test_file.write_bytes(b"foo foo foo")
 
             with patch("sys.stdout.buffer.write") as mock_stdout:
-                app(["compress", "--v1", str(test_file)], **_app_kwargs)
+                app(["compress", "--no-extended", str(test_file)], **_app_kwargs)
                 mock_stdout.assert_called_once_with(compressed_foo_foo_foo)
 
     def test_compress_stdin_to_stdout(self):
@@ -59,7 +59,7 @@ def test_compress_stdin_to_stdout(self):
             patch("sys.stdout.buffer.write") as mock_stdout,
             patch("sys.stdin.buffer.read", return_value="foo foo foo"),
         ):
-            app(["compress", "--v1"], **_app_kwargs)
+            app(["compress", "--no-extended"], **_app_kwargs)
             mock_stdout.assert_called_once_with(compressed_foo_foo_foo)
 
     def test_decompress_file_to_stdout(self):
diff --git a/tests/test_compressor.py b/tests/test_compressor.py
index 00b0ceac..eeeff9db 100644
--- a/tests/test_compressor.py
+++ b/tests/test_compressor.py
@@ -94,7 +94,7 @@ def test_compressor_default(self):
 
                 bytes_written = 0
                 with io.BytesIO() as f:
-                    compressor = Compressor(f, v2=False)
+                    compressor = Compressor(f, extended=False)
                     bytes_written += compressor.write(test_string)
                     bytes_written += compressor.flush(write_token=False)
 
@@ -106,7 +106,7 @@ def test_compressor_default(self):
 
                 # Test Context Manager
                 bytes_written = 0
-                with io.BytesIO() as f, Compressor(f, v2=False) as compressor:
+                with io.BytesIO() as f, Compressor(f, extended=False) as compressor:
                     bytes_written += compressor.write(test_string)
                     bytes_written += compressor.flush(write_token=False)
 
@@ -137,7 +137,7 @@ def test_compressor_input_buffer(self):
                 )
 
                 with io.BytesIO() as f:
-                    compressor = Compressor(f, v2=False)
+                    compressor = Compressor(f, extended=False)
                     compressor.write(b"f")
                     compressor.write(b"oo")
                     compressor.write(b" fo")
@@ -171,7 +171,7 @@ def test_compressor_7bit(self):
                     # fmt: on
                 )
                 with io.BytesIO() as f:
-                    compressor = Compressor(f, literal=7, v2=False)
+                    compressor = Compressor(f, literal=7, extended=False)
                     compressor.write(test_string)
                     compressor.flush(write_token=False)
 
@@ -200,7 +200,7 @@ def test_compressor_predefined_dictionary(self):
                 )
 
                 with io.BytesIO() as f:
-                    compressor = Compressor(f, window=8, literal=7, dictionary=dictionary, v2=False)
+                    compressor = Compressor(f, window=8, literal=7, dictionary=dictionary, extended=False)
                     compressor.write(test_string)
                     compressor.flush(write_token=False)
 
@@ -223,7 +223,7 @@ def test_oob_2_byte_pattern(self):
                 test_string = memoryview(test_string_extended)[:3]  # b"Q\x00Q"
 
                 with io.BytesIO() as f:
-                    compressor = Compressor(f, v2=False)
+                    compressor = Compressor(f, extended=False)
                     compressor.write(test_string)
                     compressor.flush(write_token=False)
 
@@ -245,7 +245,7 @@ def test_oob_2_byte_pattern(self):
     def test_excess_bits(self):
         for Compressor in Compressors:
             with self.subTest(Compressor=Compressor), io.BytesIO() as f:
-                compressor = Compressor(f, literal=7, v2=False)
+                compressor = Compressor(f, literal=7, extended=False)
 
                 with self.assertRaises((ExcessBitsError, NativeExcessBitsError)):
                     compressor.write(b"\xff")
@@ -271,7 +271,7 @@ def test_single_shot_compress_text(self):
                     ]
                     # fmt: on
                 )
-                self.assertEqual(compress("foo foo foo", v2=False), expected)
+                self.assertEqual(compress("foo foo foo", extended=False), expected)
 
     def test_single_shot_compress_binary(self):
         for compress in compresses:
@@ -293,7 +293,7 @@ def test_single_shot_compress_binary(self):
                     ]
                     # fmt: on
                 )
-                self.assertEqual(compress(b"foo foo foo", v2=False), expected)
+                self.assertEqual(compress(b"foo foo foo", extended=False), expected)
 
     def test_invalid_conf(self):
         for Compressor in Compressors:
diff --git a/tests/test_dataset_regression.py b/tests/test_dataset_regression.py
index 448619f6..a6716cfc 100644
--- a/tests/test_dataset_regression.py
+++ b/tests/test_dataset_regression.py
@@ -73,62 +73,62 @@
     ),
 ]
 
-# V2 format datasets (uses RLE and Extended Match encoding)
-V2_DATASETS = [
+# Extended format datasets (uses RLE and Extended Match encoding)
+EXTENDED_DATASETS = [
     (
-        "datasets/v2-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp",
+        "datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp",
         "e0c40eacf1afc550a6add74888c48bb981b28788a6d75a62a0e2444e997b9864",
     ),
     (
-        "datasets/v2-compressed/dickens.tamp",
+        "datasets/extended-compressed/dickens.tamp",
         "b24c37886142e11d0ee687db6ab06f936207aa7f2ea1fd1d9a36763c7a507e6a",
     ),
     (
-        "datasets/v2-compressed/mr.tamp",
+        "datasets/extended-compressed/mr.tamp",
         "68637ed52e3e4860174ed2dc0840ac77d5f1a60abbcb13770d5754e3774d53e6",
     ),
     (
-        "datasets/v2-compressed/ooffice.tamp",
+        "datasets/extended-compressed/ooffice.tamp",
         "e7ee013880d34dd5208283d0d3d91b07f442e067454276095ded14f322a656eb",
     ),
     (
-        "datasets/v2-compressed/osdb.tamp",
+        "datasets/extended-compressed/osdb.tamp",
         "60f027179302ca3ad87c58ac90b6be72ec23588aaa7a3b7fe8ecc0f11def3fa3",
     ),
     (
-        "datasets/v2-compressed/reymont.tamp",
+        "datasets/extended-compressed/reymont.tamp",
         "0eac0114a3dfe6e2ee1f345a0f79d653cb26c3bc9f0ed79238af4933422b7578",
     ),
     (
-        "datasets/v2-compressed/sao.tamp",
+        "datasets/extended-compressed/sao.tamp",
         "c2d0ea2cc59d4c21b7fe43a71499342a00cbe530a1d5548770e91ecd6214adcc",
     ),
     (
-        "datasets/v2-compressed/x-ray.tamp",
+        "datasets/extended-compressed/x-ray.tamp",
         "7de9fce1405dc44ae5e6813ed21cd5751e761bd4265655a005d39b9685d1c9ad",
     ),
     (
-        "datasets/v2-compressed/xml.tamp",
+        "datasets/extended-compressed/xml.tamp",
         "0e82e54e695c1938e4193448022543845b33020c8be6bf3bf3ead2224903e08c",
     ),
     (
-        "datasets/v2-compressed/samba.tamp",
+        "datasets/extended-compressed/samba.tamp",
         "93ba07bc44d8267789c1d911992f40b089ffa2140b4a160fac11ccae9a40e7b2",
     ),
     (
-        "datasets/v2-compressed/nci.tamp",
+        "datasets/extended-compressed/nci.tamp",
         "fc63a31770947b8c2062d3b19ca94c00485a232bb91b502021948fee983e1635",
     ),
     (
-        "datasets/v2-compressed/webster.tamp",
+        "datasets/extended-compressed/webster.tamp",
         "6a68f69b26daf09f9dd84f7470368553194a0b294fcfa80f1604efb11143a383",
     ),
     (
-        "datasets/v2-compressed/mozilla.tamp",
+        "datasets/extended-compressed/mozilla.tamp",
         "657fc3764b0c75ac9de9623125705831ebbfbe08fed248df73bc2dc66e2a963b",
     ),
     (
-        "datasets/v2-compressed/enwik8.tamp",
+        "datasets/extended-compressed/enwik8.tamp",
         "2b49720ec4d78c3c9fabaee6e4179a5e997302b3a70029f30f2d582218c024a8",
     ),
 ]
@@ -150,11 +150,11 @@ def test_v1_decompress(self):
                     self.assertEqual(actual, expected_sha256, f"SHA256 mismatch for {rel_path} using {impl_name}")
 
 
-class TestV2Decompression(unittest.TestCase):
+class TestExtendedDecompression(unittest.TestCase):
     @pytest.mark.dataset
-    def test_v2_decompress(self):
+    def test_extended_decompress(self):
         for impl_name, decompress_func in DECOMPRESSOR_IMPLEMENTATIONS:
-            for rel_path, expected_sha256 in V2_DATASETS:
+            for rel_path, expected_sha256 in EXTENDED_DATASETS:
                 with self.subTest(implementation=impl_name, dataset=rel_path):
                     path = PROJECT_DIR / rel_path
 
diff --git a/wasm/src/tamp.d.ts b/wasm/src/tamp.d.ts
index b2c11fac..8f196749 100644
--- a/wasm/src/tamp.d.ts
+++ b/wasm/src/tamp.d.ts
@@ -43,8 +43,8 @@ export interface TampOptions {
   literal?: number;
   /** Custom dictionary data. If null, no custom dictionary is used. If Uint8Array, uses the provided dictionary. Default: null */
   dictionary?: Uint8Array | null;
-  /** Enable v2 format (RLE, extended match) for better compression ratios. Default: true */
-  v2?: boolean;
+  /** Enable extended format (RLE, extended match) for better compression ratios. Default: true */
+  extended?: boolean;
   /** Enable lazy matching for better compression ratios. Default: false */
   lazy_matching?: boolean;
 }
@@ -70,7 +70,7 @@ export interface TampDefaults {
   readonly window: 10;
   readonly literal: 8;
   readonly dictionary: null;
-  readonly v2: true;
+  readonly extended: true;
   readonly lazy_matching: false;
 }
 
diff --git a/wasm/src/tamp.js b/wasm/src/tamp.js
index 3de7c235..df82ba72 100644
--- a/wasm/src/tamp.js
+++ b/wasm/src/tamp.js
@@ -122,7 +122,7 @@ export class TampCompressor {
       window: 10,
       literal: 8,
       dictionary: null,
-      v2: true,
+      extended: true,
       lazy_matching: false,
       ...options,
     };
@@ -184,7 +184,7 @@ export class TampCompressor {
         (this.options.window & 0xf) |
         ((this.options.literal & 0xf) << 4) |
         ((this.options.dictionary ? 1 : 0) << 8) |
-        ((this.options.v2 ? 1 : 0) << 9) |
+        ((this.options.extended ? 1 : 0) << 9) |
         ((this.options.lazy_matching ? 1 : 0) << 10);
       this.module.setValue(confPtr, confValue, 'i32');
 
@@ -792,11 +792,12 @@ export async function compress(data, options = {}) {
   const callbackOptions = {};
 
   // Extract compression-specific options
-  const { window, literal, dictionary, v2, lazy_matching, onPoll, signal, pollIntervalMs, pollIntervalBytes } = options;
+  const { window, literal, dictionary, extended, lazy_matching, onPoll, signal, pollIntervalMs, pollIntervalBytes } =
+    options;
   if (window !== undefined) compressionOptions.window = window;
   if (literal !== undefined) compressionOptions.literal = literal;
   if (dictionary !== undefined) compressionOptions.dictionary = dictionary;
-  if (v2 !== undefined) compressionOptions.v2 = v2;
+  if (extended !== undefined) compressionOptions.extended = extended;
   if (lazy_matching !== undefined) compressionOptions.lazy_matching = lazy_matching;
 
   // Extract callback options
diff --git a/website/index.html b/website/index.html
index 9fb60465..50ec9a68 100644
--- a/website/index.html
+++ b/website/index.html
@@ -145,12 +145,12 @@ <h3 style="margin-bottom: 0.75rem; font-weight: 500; color: #333;">Configuration
                     </select>
                 </div>
                 <div style="display: flex; align-items: center; gap: 0.5rem;">
-                    <input type="checkbox" class="option-checkbox" id="v2" checked>
-                    <label class="checkbox-label" for="v2" style="margin: 0;">V2 Format</label>
+                    <input type="checkbox" class="option-checkbox" id="extended" checked>
+                    <label class="checkbox-label" for="extended" style="margin: 0;">Extended</label>
                     <div class="help-tooltip">
                         <span class="help-icon">?</span>
                         <div class="tooltip-content">
-                            Enables v2 compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases.
+                            Enables extended compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases.
                         </div>
                     </div>
                 </div>
@@ -193,12 +193,12 @@ <h3 style="margin-bottom: 0.75rem; font-weight: 500; color: #333;">Configuration
                         </select>
                     </div>
                     <div style="display: flex; align-items: center; gap: 0.5rem;">
-                        <input type="checkbox" class="option-checkbox" id="textV2" checked>
-                        <label class="checkbox-label" for="textV2" style="margin: 0;">V2 Format</label>
+                        <input type="checkbox" class="option-checkbox" id="textExtended" checked>
+                        <label class="checkbox-label" for="textExtended" style="margin: 0;">Extended</label>
                         <div class="help-tooltip">
                             <span class="help-icon">?</span>
                             <div class="tooltip-content">
-                                Enables v2 compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases.
+                                Enables extended compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases.
                             </div>
                         </div>
                     </div>
diff --git a/website/js/main.js b/website/js/main.js
index 15458ab2..58ea6e03 100644
--- a/website/js/main.js
+++ b/website/js/main.js
@@ -23,10 +23,10 @@ let dropZone,
   compressionOptions,
   textMode,
   windowBitsSelect,
-  v2Checkbox,
+  extendedCheckbox,
   lazyMatchingCheckbox,
   textWindowBitsSelect,
-  textV2Checkbox,
+  textExtendedCheckbox,
   textLazyMatchingCheckbox,
   plainTextArea,
   compressedTextArea,
@@ -89,10 +89,10 @@ document.addEventListener('DOMContentLoaded', () => {
   compressionOptions = document.getElementById('compressionOptions');
   textMode = document.getElementById('textMode');
   windowBitsSelect = document.getElementById('windowBits');
-  v2Checkbox = document.getElementById('v2');
+  extendedCheckbox = document.getElementById('extended');
   lazyMatchingCheckbox = document.getElementById('lazyMatching');
   textWindowBitsSelect = document.getElementById('textWindowBits');
-  textV2Checkbox = document.getElementById('textV2');
+  textExtendedCheckbox = document.getElementById('textExtended');
   textLazyMatchingCheckbox = document.getElementById('textLazyMatching');
   plainTextArea = document.getElementById('plainText');
   compressedTextArea = document.getElementById('compressedText');
@@ -352,7 +352,7 @@ async function processFiles() {
         const windowBits = parseInt(windowBitsSelect.value);
         const options = {
           window: windowBits,
-          v2: v2Checkbox.checked,
+          extended: extendedCheckbox.checked,
           // Add progress callback for compression with overall progress calculation
           onPoll: async progressInfo => {
             const bytesProcessed = progressInfo.bytesProcessed || 0;
@@ -588,7 +588,7 @@ async function compressTextContent() {
   try {
     const options = {
       window: windowBits,
-      v2: textV2Checkbox.checked,
+      extended: textExtendedCheckbox.checked,
       // Add progress callback for text compression
       onPoll: async progressInfo => {
         const bytesProcessed = progressInfo.bytesProcessed || 0;
@@ -630,7 +630,7 @@ async function compressTextContent() {
     const ratio = data.length > 0 ? (data.length / compressed.length).toFixed(2) : '0';
     const savings = data.length > 0 ? ((1 - compressed.length / data.length) * 100).toFixed(1) : '0';
 
-    const configStr = `${windowBits}-bit window${textV2Checkbox.checked ? ', v2' : ', v1'}${
+    const configStr = `${windowBits}-bit window${textExtendedCheckbox.checked ? ', extended' : ', basic'}${
       isPureAscii ? ', 7-bit literals' : ''
     }${textLazyMatchingCheckbox.checked ? ', lazy matching' : ''}${
       dictionaryValidation.dictionaryBytes ? ', custom dictionary' : ''

From a0a75434a20fc1c1ccb2983f7fb8a3ea06282ddf Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 09:32:06 -0500
Subject: [PATCH 051/109] Change cppcheck to use local

---
 .pre-commit-config.yaml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1c1cbfcb..686769a3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -57,10 +57,13 @@ repos:
         args: ['-style=file', '-i']
         exclude: ^espidf/tamp/private/tamp_search\.hpp$
 
-  - repo: https://github.com/pocc/pre-commit-hooks
-    rev: v1.3.5
+  - repo: local
     hooks:
       - id: cppcheck
+        name: cppcheck
+        entry: cppcheck
+        language: system
+        files: \.(c|h|cpp|hpp)$
         exclude: ^(espidf|mpy_bindings|ctests|tools)/
         args: [
             '-Itamp/_c_src',

From 0652b98214965247cc0c13bccfa1e24c07e5c312 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 11:12:11 -0500
Subject: [PATCH 052/109] Add singular TAMP_EXTENDED flag; better document the
 compile-time flags.

---
 Makefile                  |  2 +-
 docs/source/c_library.rst | 55 +++++++++++++++++++++++++++++++++++++++
 tamp/_c_src/tamp/common.h | 12 ++++++---
 3 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 89a12d92..b9833532 100644
--- a/Makefile
+++ b/Makefile
@@ -504,7 +504,7 @@ C_SRC_COMP = tamp/_c_src/tamp/compressor.c
 C_SRC_DECOMP = tamp/_c_src/tamp/decompressor.c
 
 # Flags to disable extended format support
-NO_EXTENDED_FLAGS = -DTAMP_EXTENDED_COMPRESS=0 -DTAMP_EXTENDED_DECOMPRESS=0
+NO_EXTENDED_FLAGS = -DTAMP_EXTENDED=0
 
 c-size-no-extended:
 	@rm -rf build/arm && mkdir -p build/arm
diff --git a/docs/source/c_library.rst b/docs/source/c_library.rst
index dadf934d..42e950a6 100644
--- a/docs/source/c_library.rst
+++ b/docs/source/c_library.rst
@@ -5,6 +5,61 @@ C Library
 Tamp provides a C library optimized for low-memory-usage, fast runtime, and small binary footprint.
 This page describes how to use the provided library.
 
+Compile-Time Flags
+^^^^^^^^^^^^^^^^^^
+Tamp's C library can be customized via compile-time flags to control features, code size, and performance.
+Pass these flags to your compiler (e.g., ``-DTAMP_STREAM=0``).
+
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| Flag                             | Default           | Description                                                                  |
++==================================+===================+==============================================================================+
+| ``TAMP_EXTENDED``                | ``1``             | Default value for extended format support (RLE, extended match encoding).    |
+|                                  |                   | Set to ``0`` to disable extended support in both compressor and decompressor.|
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_EXTENDED_COMPRESS``       | ``TAMP_EXTENDED`` | Enable extended format compression. Defaults to ``TAMP_EXTENDED`` but can    |
+|                                  |                   | be individually overridden for compressor-only or decompressor-only builds.  |
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_EXTENDED_DECOMPRESS``     | ``TAMP_EXTENDED`` | Enable extended format decompression. Defaults to ``TAMP_EXTENDED`` but can  |
+|                                  |                   | be individually overridden for compressor-only or decompressor-only builds.  |
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_LAZY_MATCHING``           | ``0``             | Enable lazy matching support. When enabled, ``TampConf.lazy_matching``       |
+|                                  |                   | becomes available. Improves compression ratio by 0.5-2% at the cost of       |
+|                                  |                   | 50-75% slower compression. Most embedded systems should leave disabled.      |
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_STREAM``                  | ``1``             | Include stream API (``tamp_compress_stream``, ``tamp_decompress_stream``).   |
+|                                  |                   | Disable with ``-DTAMP_STREAM=0`` to save ~2.8KB if only using low-level API. |
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_STREAM_WORK_BUFFER_SIZE`` | ``32``            | Stack-allocated work buffer size (bytes) for stream API. Split in half       |
+|                                  |                   | for input/output. Larger values reduce I/O callback invocations,             |
+|                                  |                   | improving decompression speed. 256+ bytes recommended when stack permits.    |
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_STREAM_STDIO``            | ``0``             | Enable stdio (``FILE*``) stream handlers. Works with standard C library,     |
+|                                  |                   | ESP-IDF VFS, and POSIX-compatible systems.                                   |
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_STREAM_MEMORY``           | ``0``             | Enable memory buffer stream handlers (``TampMemReader``, ``TampMemWriter``). |
+|                                  |                   | Useful for file-to-memory or memory-to-file operations.                      |
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_STREAM_LITTLEFS``         | ``0``             | Enable LittleFS stream handlers. Requires LittleFS headers.                  |
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_STREAM_FATFS``            | ``0``             | Enable FatFs (ChaN's FAT filesystem) stream handlers. Requires FatFs headers.|
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+| ``TAMP_ESP32``                   | ``0``             | Use ESP32-optimized variant. Avoids bitfields for speed at the cost of       |
+|                                  |                   | slightly higher memory usage. Automatically enabled via Kconfig on ESP-IDF.  |
++----------------------------------+-------------------+------------------------------------------------------------------------------+
+
+**Example: Minimal decompressor-only build**
+
+.. code-block:: bash
+
+   gcc -DTAMP_EXTENDED_COMPRESS=0 -DTAMP_STREAM=0 -c decompressor.c common.c
+
+**Example: Full-featured build with LittleFS support**
+
+.. code-block:: bash
+
+   gcc -DTAMP_LAZY_MATCHING=1 -DTAMP_STREAM_LITTLEFS=1 -DTAMP_STREAM_WORK_BUFFER_SIZE=256 \
+       -c compressor.c decompressor.c common.c
+
 Overview
 ^^^^^^^^
 To use Tamp in your C project, simply copy the contents of ``tamp/_c_src`` into your project.
diff --git a/tamp/_c_src/tamp/common.h b/tamp/_c_src/tamp/common.h
index 48acdd7f..c33a8cb4 100644
--- a/tamp/_c_src/tamp/common.h
+++ b/tamp/_c_src/tamp/common.h
@@ -70,13 +70,19 @@ extern "C" {
 
 /* Extended format support (RLE, extended match).
  * Enabled by default. Disable to save code size on minimal builds.
- * Separate flags allow decompressor-only or compressor-only extended support.
+ *
+ * TAMP_EXTENDED is the master switch (default: 1).
+ * TAMP_EXTENDED_COMPRESS and TAMP_EXTENDED_DECOMPRESS default to TAMP_EXTENDED,
+ * but can be individually overridden for compressor-only or decompressor-only builds.
  */
+#ifndef TAMP_EXTENDED
+#define TAMP_EXTENDED 1
+#endif
 #ifndef TAMP_EXTENDED_DECOMPRESS
-#define TAMP_EXTENDED_DECOMPRESS 1
+#define TAMP_EXTENDED_DECOMPRESS TAMP_EXTENDED
 #endif
 #ifndef TAMP_EXTENDED_COMPRESS
-#define TAMP_EXTENDED_COMPRESS 1
+#define TAMP_EXTENDED_COMPRESS TAMP_EXTENDED
 #endif
 
 /* Extended encoding constants */

From f1a06db7ae83eab2b9d891c2357c0246fd49dda1 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 11:29:36 -0500
Subject: [PATCH 053/109] Mention extended field in javascript docs.

---
 docs/source/javascript.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/source/javascript.rst b/docs/source/javascript.rst
index 39bbe215..6972befe 100644
--- a/docs/source/javascript.rst
+++ b/docs/source/javascript.rst
@@ -51,6 +51,12 @@ Customize compression behavior with options:
      // For general use, 8 (the whole byte) is appropriate.
      literal: 7,
 
+     // Enable the extended format (RLE, extended match encoding).
+     // It provides better compression ratios for typical data at the
+     // cost of slightly more complex encoding.
+     // Default: true
+     extended: true,
+
      // Enable lazy matching to slightly improve compression (0.5-2.0%) ratios
      // at the cost of 50-75% slower compression.
      // Most embedded systems will **not** want to use this feature and disable it.
@@ -136,6 +142,7 @@ Configure compression parameters by passing in options:
    const options = {
      window: 12,           // Larger window for (usually) better compression
      literal: 7,           // ASCII text only requires 7 bits.
+     extended: true,       // Enable extended format (RLE, extended match)
      lazy_matching: true   // Better compression ratios; slower to compress
    };
 

From 8fb80424f305fb726c46412763ea04e384d8e4df Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 12:08:24 -0500
Subject: [PATCH 054/109] update README, add ablation study.

---
 README.md | 70 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 73fdf5be..811b73af 100644
--- a/README.md
+++ b/README.md
@@ -258,31 +258,30 @@ input data sourced from the
 [Enwik8](https://mattmahoney.net/dc/textdata.html). This should give a general
 idea of how these algorithms perform over a variety of input data types.
 
-| dataset         | raw         | tamp           | tamp (LazyMatching) | zlib           | heatshrink |
-| --------------- | ----------- | -------------- | ------------------- | -------------- | ---------- |
-| enwik8          | 100,000,000 | **51,635,633** | 51,252,113          | 56,205,166     | 56,110,394 |
-| silesia/dickens | 10,192,446  | **5,546,761**  | 5,511,604           | 6,049,169      | 6,155,768  |
-| silesia/mozilla | 51,220,480  | 25,121,385     | 24,936,067          | **25,104,966** | 25,435,908 |
-| silesia/mr      | 9,970,564   | 5,027,032      | 4,886,272           | **4,864,734**  | 5,442,180  |
-| silesia/nci     | 33,553,445  | 8,643,610      | 8,645,299           | **5,765,521**  | 8,247,487  |
-| silesia/ooffice | 6,152,192   | **3,814,938**  | 3,798,261           | 4,077,277      | 3,994,589  |
-| silesia/osdb    | 10,085,684  | **8,520,835**  | 8,506,443           | 8,625,159      | 8,747,527  |
-| silesia/reymont | 6,627,202   | **2,847,981**  | 2,820,870           | 2,897,661      | 2,910,251  |
-| silesia/samba   | 21,606,400  | 9,102,594      | 9,060,692           | **8,862,423**  | 9,223,827  |
-| silesia/sao     | 7,251,944   | **6,137,755**  | 6,101,744           | 6,506,417      | 6,400,926  |
-| silesia/webster | 41,458,703  | **18,694,172** | 18,567,288          | 20,212,235     | 19,942,817 |
-| silesia/x-ray   | 8,474,240   | 7,510,606      | 7,405,814           | **7,351,750**  | 8,059,723  |
-| silesia/xml     | 5,345,280   | 1,681,687      | 1,672,660           | **1,586,985**  | 1,665,179  |
-
-Tamp usually out-performs heatshrink, and is generally very competitive with
-zlib. While trying to be an apples-to-apples comparison, zlib still uses
-significantly more memory during both compression and decompression (see next
-section). Tamp accomplishes competitive performance while using around 10x less
-memory.
+| dataset         | raw         | tamp        | tamp (LazyMatching) | zlib          | heatshrink |
+| --------------- | ----------- | ----------- | ------------------- | ------------- | ---------- |
+| enwik8          | 100,000,000 | 51,116,968  | **50,725,098**      | 56,205,166    | 56,110,394 |
+| RPI_PICO (.uf2) | 667,648     | **288,704** | 289,735             | 303,763       | -          |
+| silesia/dickens | 10,192,446  | 5,538,712   | **5,503,021**       | 6,049,169     | 6,155,768  |
+| silesia/mozilla | 51,220,480  | 24,499,954  | **24,311,290**      | 25,104,966    | 25,435,908 |
+| silesia/mr      | 9,970,564   | 4,524,424   | **4,396,124**       | 4,864,734     | 5,442,180  |
+| silesia/nci     | 33,553,445  | 7,093,354   | 7,003,632           | **5,765,521** | 8,247,487  |
+| silesia/ooffice | 6,152,192   | 3,779,238   | **3,763,795**       | 4,077,277     | 3,994,589  |
+| silesia/osdb    | 10,085,684  | 8,467,407   | **8,452,497**       | 8,625,159     | 8,747,527  |
+| silesia/reymont | 6,627,202   | 2,825,458   | **2,793,473**       | 2,897,661     | 2,910,251  |
+| silesia/samba   | 21,606,400  | 8,443,932   | **8,395,048**       | 8,862,423     | 9,223,827  |
+| silesia/sao     | 7,251,944   | 6,136,102   | **6,100,071**       | 6,506,417     | 6,400,926  |
+| silesia/webster | 41,458,703  | 18,259,149  | **18,118,788**      | 20,212,235    | 19,942,817 |
+| silesia/x-ray   | 8,474,240   | 7,509,652   | 7,404,794           | **7,351,750** | 8,059,723  |
+| silesia/xml     | 5,345,280   | 1,493,131   | **1,473,832**       | 1,586,985     | 1,665,179  |
+
+Tamp outperforms both heatshrink and zlib on most datasets, winning 12 out of 14
+benchmarks. It does so while using around 10x less memory than zlib during both
+compression and decompression (see next section).
 
 Lazy Matching is a simple technique to improve compression ratios at the expense
 of CPU while requiring very little code. One can expect **50-75%** more CPU
-usage for modest compression gains (around 0.5 - 2.0%). Because of this poor
+usage for modest compression gains (around 0.5 - 2.0%). Because of this
 trade-off, it is disabled by default; however, in applications where we want to
 compress once on a powerful machine (like a desktop/server) and decompress on an
 embedded device, it may be worth it to spend a bit more compute. Lazy matched
@@ -305,6 +304,33 @@ repeating data more efficiently. Given Tamp's excellent performance in most of
 the other data compression benchmark files, this is a good tradeoff for most
 real-world scenarios.
 
+### Ablation Study
+
+The following table shows the effect of the `extended` and `lazy_matching`
+compression parameters across all benchmark datasets (`window=10`, `literal=8`).
+
+| dataset         | raw         | Baseline   | +lazy              | +extended          | +lazy +extended    |
+| --------------- | ----------- | ---------- | ------------------ | ------------------ | ------------------ |
+| enwik8          | 100,000,000 | 51,635,633 | 51,252,113 (−0.7%) | 51,116,968 (−1.0%) | 50,725,098 (−1.8%) |
+| RPI_PICO (.uf2) | 667,648     | 331,310    | 329,875 (−0.4%)    | 288,704 (−12.9%)   | 289,735 (−12.5%)   |
+| silesia/dickens | 10,192,446  | 5,546,761  | 5,511,604 (−0.6%)  | 5,538,712 (−0.1%)  | 5,503,021 (−0.8%)  |
+| silesia/mozilla | 51,220,480  | 25,121,385 | 24,936,067 (−0.7%) | 24,499,954 (−2.5%) | 24,311,290 (−3.2%) |
+| silesia/mr      | 9,970,564   | 5,027,032  | 4,886,272 (−2.8%)  | 4,524,424 (−10.0%) | 4,396,124 (−12.6%) |
+| silesia/nci     | 33,553,445  | 8,643,610  | 8,645,299 (+0.0%)  | 7,093,354 (−17.9%) | 7,003,632 (−19.0%) |
+| silesia/ooffice | 6,152,192   | 3,814,938  | 3,798,261 (−0.4%)  | 3,779,238 (−0.9%)  | 3,763,795 (−1.3%)  |
+| silesia/osdb    | 10,085,684  | 8,520,835  | 8,506,443 (−0.2%)  | 8,467,407 (−0.6%)  | 8,452,497 (−0.8%)  |
+| silesia/reymont | 6,627,202   | 2,847,981  | 2,820,870 (−1.0%)  | 2,825,458 (−0.8%)  | 2,793,473 (−1.9%)  |
+| silesia/samba   | 21,606,400  | 9,102,594  | 9,060,692 (−0.5%)  | 8,443,932 (−7.2%)  | 8,395,048 (−7.8%)  |
+| silesia/sao     | 7,251,944   | 6,137,755  | 6,101,744 (−0.6%)  | 6,136,102 (−0.0%)  | 6,100,071 (−0.6%)  |
+| silesia/webster | 41,458,703  | 18,694,172 | 18,567,228 (−0.7%) | 18,259,149 (−2.3%) | 18,118,788 (−3.1%) |
+| silesia/x-ray   | 8,474,240   | 7,510,606  | 7,405,814 (−1.4%)  | 7,509,652 (−0.0%)  | 7,404,794 (−1.4%)  |
+| silesia/xml     | 5,345,280   | 1,681,687  | 1,672,660 (−0.5%)  | 1,493,131 (−11.2%) | 1,473,832 (−12.4%) |
+
+The `extended` parameter enables run-length encoding (RLE) and an extended
+match encoding for longer patterns, which significantly improves compression on
+datasets with byte runs or long repeating patterns (e.g., nci, samba, xml).
+Extended support was added in v2.0.0.
+
 ## Memory Usage
 
 The following table shows approximately how much memory each algorithm uses

From 23239dd4f9cf5e89747adce8e219c62dc619e9a1 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 15:04:26 -0500
Subject: [PATCH 055/109] Address copilot feedback

---
 .gitattributes                | 2 +-
 docs/source/specification.rst | 5 +++--
 tamp/_c_src/tamp/compressor.c | 2 ++
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.gitattributes b/.gitattributes
index 530b7adf..a1dce8fa 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,2 @@
 datasets/v1-compressed/** filter=lfs diff=lfs merge=lfs -text
-datasets/v2-compressed/** filter=lfs diff=lfs merge=lfs -text
+datasets/extended-compressed/** filter=lfs diff=lfs merge=lfs -text
diff --git a/docs/source/specification.rst b/docs/source/specification.rst
index b5fc97a7..af211760 100644
--- a/docs/source/specification.rst
+++ b/docs/source/specification.rst
@@ -61,8 +61,9 @@ Modifications are made to make the implementation simpler/faster.
       and points at the offset from the beginning of the dictionary buffer to the pattern.
       The shortest pattern-length is either going to be 2 or 3 bytes, depending on ``window``
       and ``literal`` parameters. The shortest pattern-length encoding must be shorter than
-      an equivalent stream of literals. The longest pattern-length is the minimum
-      pattern-length plus 13.
+      an equivalent stream of literals. In the basic (non-extended) format, the longest
+      pattern-length is the minimum pattern-length plus 13. When the ``extended`` flag
+      is set, longer matches are possible via extended match encoding.
 
 Classically, the ``offset`` is from the current position in the buffer. Doing so results
 in the ``offset`` distribution slightly favoring smaller numbers. Intuitively, it makes
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index ecfbe4db..ab7ce531 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -475,6 +475,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
             if (next_match_size > match_size &&
                 validate_no_match_overlap(compressor->window_pos, next_match_index, next_match_size)) {
                 // Force literal at current position, cache next match
+                compressor->cached_match_index = next_match_index;
+                compressor->cached_match_size = next_match_size;
                 match_size = 0;  // Will trigger literal write below
             } else {
                 compressor->cached_match_index = -1;

From 2d1829a4a85902492eaca9e808e0e6b1c95fd429 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 19:39:07 -0500
Subject: [PATCH 056/109] update specs for v2

---
 docs/source/specification.rst | 79 +++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/docs/source/specification.rst b/docs/source/specification.rst
index af211760..cbbf5f58 100644
--- a/docs/source/specification.rst
+++ b/docs/source/specification.rst
@@ -169,6 +169,85 @@ The maximum match-size is more likely than the second-highest match-size because
 
 For any given huffman coding schema, a equivalent coding can be obtained by inverting all the bits (reflecting the huffman tree). The single-bit, most common code ``0b0`` representing a pattern-size 2 is intentionally represented as ``0b0`` instead of ``0b1``. This makes the MSb of all other codes be 1, simplifying the decoding procedure because the number of bits read doesn't strictly have to be recorded.
 
+Extended Format (v2.0.0+)
+^^^^^^^^^^^^^^^^^^^^^^^^^
+When the ``extended`` header bit is set, two additional token types are available:
+RLE (Run-Length Encoding) and Extended Match. These use Huffman symbols 12 and 13
+respectively, which in the basic format would represent match sizes ``min_pattern_size + 12``
+and ``min_pattern_size + 13``.
+
+Extended Huffman Encoding
+-------------------------
+Both RLE and Extended Match use a secondary Huffman encoding to represent their payload values.
+This encoding combines a Huffman code (without the literal flag) with trailing bits:
+
+1. Read the Huffman symbol (12 for RLE, 13 for Extended Match) with the literal flag (``0b0``).
+2. Decode an additional Huffman code (reusing the same table, but without the leading literal flag bit).
+3. Read trailing bits (4 bits for RLE, 3 bits for Extended Match).
+4. Combine: ``value = (huffman_index << trailing_bits) + trailing_bits_value``
+
+RLE Token (Symbol 12)
+---------------------
+RLE encodes runs of repeated bytes efficiently. The repeated byte is implicitly
+the last byte written to the window buffer.
+
+Format: ``0b0 | huffman_code[12] | extended_huffman(count - 2, trailing=4)``
+
+Where:
+
+- ``huffman_code[12]`` = ``0xAA`` (9 bits including literal flag)
+- ``extended_huffman`` encodes ``count - 2`` with 4 trailing bits
+- ``count`` ranges from 2 to 225: ``(13 << 4) + 15 + 2 = 225``
+
+Window update: Only the first 8 bytes are written to the dictionary (no wrap-around).
+If fewer than 8 bytes remain before the end of the window buffer, only those bytes
+are written. This bounds the window update cost while still allowing subsequent
+pattern matches to reference the repeated bytes.
+
+.. code-block:: text
+
+   RLE Token Structure (cnt = count - 2):
+   +---+------------+-------------------+----------------+
+   | 0 | huffman[12]| huffman(cnt>>4)   | cnt & 0xF      |
+   +---+------------+-------------------+----------------+
+   |1b |   8 bits   | 1-8 bits          | 4 bits         |
+   +---+------------+-------------------+----------------+
+
+Extended Match Token (Symbol 13)
+--------------------------------
+Extended Match allows pattern matches longer than the basic format's maximum of
+``min_pattern_size + 13``. It is used when a match exceeds ``min_pattern_size + 11``.
+
+Format: ``0b0 | huffman_code[13] | extended_huffman(size - min_pattern_size - 12, trailing=3) | offset``
+
+Where:
+
+- ``huffman_code[13]`` = ``0x27`` (7 bits including literal flag)
+- ``extended_huffman`` encodes ``size - min_pattern_size - 12`` with 3 trailing bits
+- ``offset`` is ``window`` bits, pointing to the start of the pattern
+- Maximum extra size: ``(13 << 3) + 7 + 1 = 112``
+- Maximum total match size: ``min_pattern_size + 11 + 112 = min_pattern_size + 123``
+
+The ``-12`` offset ensures extended matches start at ``min_pattern_size + 12``, leaving
+symbols 0-11 for basic matches (0-11 maps to ``min_pattern_size`` through ``min_pattern_size + 11``).
+
+Window constraints: The source pattern cannot span past the window buffer boundary;
+the compressor terminates extended matches early if they would cross this boundary.
+Similarly, destination writes do not wrap-around; only bytes up to the end of the
+window buffer are written. This simplifies implementation while having minimal
+impact on compression ratio (approximately 0.02% loss).
+
+.. code-block:: text
+
+   Extended Match Token Structure:
+   +---+------------+-------------------+----------------+--------+
+   | 0 | huffman[13]| huffman(sz>>3)    | sz & 0x7       | offset |
+   +---+------------+-------------------+----------------+--------+
+   |1b |   6 bits   | 1-8 bits          | 3 bits         | window |
+   +---+------------+-------------------+----------------+--------+
+
+   Where sz = match_size - min_pattern_size - 12
+
 Flush Symbol
 ------------
 A special FLUSH symbol is encoded as the least likely Huffman code.

From eb072c9bb1dac2f825661ac05fa84e57771fd6b6 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 19:43:59 -0500
Subject: [PATCH 057/109] document RLE edgecase

---
 docs/source/specification.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/source/specification.rst b/docs/source/specification.rst
index cbbf5f58..31bb1024 100644
--- a/docs/source/specification.rst
+++ b/docs/source/specification.rst
@@ -189,7 +189,9 @@ This encoding combines a Huffman code (without the literal flag) with trailing b
 RLE Token (Symbol 12)
 ---------------------
 RLE encodes runs of repeated bytes efficiently. The repeated byte is implicitly
-the last byte written to the window buffer.
+the last byte written to the window buffer. If no bytes have been written yet
+(i.e., ``window_pos == 0``), the byte at position ``window_size - 1`` of the
+initial dictionary is used.
 
 Format: ``0b0 | huffman_code[12] | extended_huffman(count - 2, trailing=4)``
 

From 050c2d08de110453b645cfe38c0989c75b1e0087 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 19:51:17 -0500
Subject: [PATCH 058/109]  fix(wasm): add cancel handlers to streams to prevent
 memory leaks

---
 wasm/src/streams.js | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/wasm/src/streams.js b/wasm/src/streams.js
index d3420027..2a6c3509 100644
--- a/wasm/src/streams.js
+++ b/wasm/src/streams.js
@@ -41,9 +41,17 @@ export class TampCompressionStream extends TransformStream {
         } finally {
           if (compressor) {
             compressor.destroy();
+            compressor = null;
           }
         }
       },
+
+      cancel(_reason) {
+        if (compressor) {
+          compressor.destroy();
+          compressor = null;
+        }
+      },
     });
   }
 }
@@ -88,9 +96,17 @@ export class TampDecompressionStream extends TransformStream {
         } finally {
           if (decompressor) {
             decompressor.destroy();
+            decompressor = null;
           }
         }
       },
+
+      cancel(_reason) {
+        if (decompressor) {
+          decompressor.destroy();
+          decompressor = null;
+        }
+      },
     });
   }
 }

From eebc663b2399479e9f390514d8aa9f38e8c82586 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Tue, 3 Feb 2026 19:56:37 -0500
Subject: [PATCH 059/109] update README

---
 README.md | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 811b73af..636e38d6 100644
--- a/README.md
+++ b/README.md
@@ -357,7 +357,7 @@ on an M3 Macbook Air.
 |                              | Compression (s) | Decompression (s) |
 | ---------------------------- | --------------- | ----------------- |
 | Tamp (Pure Python Reference) | 136.2           | 105.0             |
-| Tamp (C bindings)            | 5.56            | 0.544             |
+| Tamp (C bindings)            | 5.45            | 0.544             |
 | ZLib                         | 3.65            | 0.578             |
 | Heatshrink (with index)      | 4.42            | 0.67              |
 | Heatshrink (without index)   | 27.40           | 0.67              |
@@ -377,7 +377,7 @@ speed Tamp can achieve. In all tests, a 1KB window (10 bit) was used.
 |                                  | Compression (bytes/s) | Decompression (bytes/s) |
 | -------------------------------- | --------------------- | ----------------------- |
 | Tamp (MicroPython Viper)         | 4,300                 | 42,000                  |
-| Tamp (Micropython Native Module) | 31,192                | 1,086,957               |
+| Tamp (Micropython Native Module) | 31,949                | 1,086,957               |
 | Tamp (C)                         | 36,127                | 1,400,600               |
 | Deflate (micropython builtin)    | 6,885                 | 294,985                 |
 
@@ -391,19 +391,21 @@ compiled for the Pi Pico (`armv6m`). All libraries were compiled with `-O3`.
 Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 15.2.1 and MicroPython v1.27, and can be regenerated with `make binary-size`.
 
-|                           | Compressor | Decompressor | Compressor + Decompressor |
-| ------------------------- | ---------- | ------------ | ------------------------- |
-| Tamp (MicroPython Viper)  | 4676       | 4372         | 7917                      |
-| Tamp (MicroPython Native) | 3896       | 3559         | 6616                      |
-| Tamp (C, -DTAMP_STREAM=0) | 2028       | 1992         | 3900                      |
-| Tamp (C)                  | 2472       | 2444         | 4796                      |
-| Heatshrink (C)            | 2956       | 3876         | 6832                      |
-| uzlib (C)                 | 2355       | 3963         | 6318                      |
-
-Tamp C includes a high-level stream API by default. Even with `-DTAMP_STREAM=0`,
-Tamp includes buffer-looping functions (like `tamp_compressor_compress`) that
-Heatshrink lacks (Heatshrink only provides poll/sink primitives). In an
-apples-to-apples comparison, Tamp would be even smaller.
+|                                  | Compressor | Decompressor | Compressor + Decompressor |
+| -------------------------------- | ---------- | ------------ | ------------------------- |
+| Tamp (MicroPython Viper)         | 4676       | 4372         | 7917                      |
+| Tamp (MicroPython Native)        | 3896       | 3559         | 6616                      |
+| Tamp (C, no extended, no stream) | 1800       | 1584         | 3264                      |
+| Tamp (C, no extended)            | 2204       | 2036         | 4120                      |
+| Tamp (C, extended, no stream)    | 2884       | 2436         | 5200                      |
+| Tamp (C, extended)               | 3288       | 2888         | 6056                      |
+| Heatshrink (C)                   | 2956       | 3876         | 6832                      |
+| uzlib (C)                        | 2355       | 3963         | 6318                      |
+
+Tamp C "extended" includes `tamp_compressor_compress_and_flush`. Tamp C includes
+a high-level stream API by default. Even with `no stream`, Tamp includes
+buffer-looping functions (like `tamp_compressor_compress`) that Heatshrink lacks
+(Heatshrink only provides poll/sink primitives).
 
 ## Acknowledgement
 

From f614a0016e0763eca430aa3ea74a0aa2a379fbbf Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 09:21:31 -0500
Subject: [PATCH 060/109] combine some TAMP_LAZY_MATCHING blocks

---
 tamp/_c_src/tamp/compressor.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index ab7ce531..d3ce0580 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -201,14 +201,11 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     compressor->conf_window = conf->window;
     compressor->conf_use_custom_dictionary = conf->use_custom_dictionary;
     compressor->conf_extended = conf->extended;
-#if TAMP_LAZY_MATCHING
-    compressor->conf_lazy_matching = conf->lazy_matching;
-#endif
-
     compressor->window = window;
     compressor->min_pattern_size = tamp_compute_min_pattern_size(conf->window, conf->literal);
 
 #if TAMP_LAZY_MATCHING
+    compressor->conf_lazy_matching = conf->lazy_matching;
     compressor->cached_match_index = -1;  // Initialize cache as invalid
 #endif
 

From a8987796d928820d9a947b66e0390a3cf71eb610 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 09:25:40 -0500
Subject: [PATCH 061/109] more macro conditional blocks combined

---
 tamp/_c_src/tamp/compressor.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index d3ce0580..e63fd26f 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -338,6 +338,9 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 
     if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL;
 
+    uint8_t match_size = 0;
+    uint16_t match_index = 0;
+
 #if TAMP_EXTENDED_COMPRESS
     // Extended: Handle extended match continuation
     if (TAMP_UNLIKELY(compressor->conf_extended && compressor->extended_match_count)) {
@@ -392,12 +395,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
         // Ran out of input while extending - return and wait for more
         return TAMP_OK;
     }
-#endif  // TAMP_EXTENDED_COMPRESS
-
-    uint8_t match_size = 0;
-    uint16_t match_index = 0;
 
-#if TAMP_EXTENDED_COMPRESS
     // Extended: Handle RLE accumulation with persistent state
     // For simplicity in C, we commit RLE immediately when the run ends
     if (TAMP_UNLIKELY(compressor->conf_extended)) {
@@ -444,15 +442,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
         } else {
             find_best_match(compressor, &match_index, &match_size);
         }
-    } else {
-        find_best_match(compressor, &match_index, &match_size);
-    }
-#else
-    find_best_match(compressor, &match_index, &match_size);
-#endif
 
-#if TAMP_LAZY_MATCHING
-    if (compressor->conf_lazy_matching) {
         // Lazy matching: if we have a good match, check if position i+1 has a better match
         if (match_size >= compressor->min_pattern_size && match_size <= 8 && compressor->input_size > match_size + 2) {
             // Temporarily advance input position to check next position
@@ -483,7 +473,11 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
         } else {
             compressor->cached_match_index = -1;  // Clear cache
         }
+    } else {
+        find_best_match(compressor, &match_index, &match_size);
     }
+#else
+    find_best_match(compressor, &match_index, &match_size);
 #endif
 
     // Shared token/literal writing logic

From 34f48be0fc82495d75e5c27670aa62241dcaddb3 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 09:38:23 -0500
Subject: [PATCH 062/109] combine some write_to_bit_buffer calls, reducing
 binary size by 20-36 bytes

---
 README.md                     |  8 ++++----
 tamp/_c_src/tamp/compressor.c | 24 +++++++++++-------------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 636e38d6..ac4306b5 100644
--- a/README.md
+++ b/README.md
@@ -395,10 +395,10 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 | -------------------------------- | ---------- | ------------ | ------------------------- |
 | Tamp (MicroPython Viper)         | 4676       | 4372         | 7917                      |
 | Tamp (MicroPython Native)        | 3896       | 3559         | 6616                      |
-| Tamp (C, no extended, no stream) | 1800       | 1584         | 3264                      |
-| Tamp (C, no extended)            | 2204       | 2036         | 4120                      |
-| Tamp (C, extended, no stream)    | 2884       | 2436         | 5200                      |
-| Tamp (C, extended)               | 3288       | 2888         | 6056                      |
+| Tamp (C, no extended, no stream) | 1764       | 1584         | 3228                      |
+| Tamp (C, no extended)            | 2168       | 2036         | 4084                      |
+| Tamp (C, extended, no stream)    | 2864       | 2436         | 5180                      |
+| Tamp (C, extended)               | 3268       | 2888         | 6036                      |
 | Heatshrink (C)                   | 2956       | 3876         | 6832                      |
 | uzlib (C)                        | 2355       | 3963         | 6318                      |
 
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index e63fd26f..685511ad 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -59,11 +59,10 @@ static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits
  * @param[in] trailing_bits Number of trailing bits (3 for extended match, 4 for RLE).
  */
 static TAMP_NOINLINE void write_extended_huffman(TampCompressor *compressor, uint8_t value, uint8_t trailing_bits) {
-    uint8_t mask = (1 << trailing_bits) - 1;
     uint8_t code_index = value >> trailing_bits;
-    // Write huffman code without literal flag (subtract 1 from bit length)
-    write_to_bit_buffer(compressor, huffman_codes[code_index], huffman_bits[code_index] - 1);
-    write_to_bit_buffer(compressor, value & mask, trailing_bits);
+    // Write huffman code (without literal flag) + trailing bits in one call
+    write_to_bit_buffer(compressor, (huffman_codes[code_index] << trailing_bits) | (value & ((1 << trailing_bits) - 1)),
+                        (huffman_bits[code_index] - 1) + trailing_bits);
 }
 
 #endif  // TAMP_EXTENDED_COMPRESS
@@ -211,12 +210,11 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
 
     if (!compressor->conf_use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window));
 
-    // Write header to bit buffer
-    write_to_bit_buffer(compressor, compressor->conf_window - 8, 3);
-    write_to_bit_buffer(compressor, compressor->conf_literal - 5, 2);
-    write_to_bit_buffer(compressor, compressor->conf_use_custom_dictionary, 1);
-    write_to_bit_buffer(compressor, compressor->conf_extended, 1);
-    write_to_bit_buffer(compressor, 0, 1);  // No more header bytes
+    // Write header to bit buffer (8 bits total)
+    // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1]
+    uint8_t header = ((compressor->conf_window - 8) << 5) | ((compressor->conf_literal - 5) << 3) |
+                     (compressor->conf_use_custom_dictionary << 2) | (compressor->conf_extended << 1);
+    write_to_bit_buffer(compressor, header, 8);
 
     return TAMP_OK;
 }
@@ -501,10 +499,10 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
             return TAMP_OK;
         }
 #endif  // TAMP_EXTENDED_COMPRESS
-        // Write TOKEN
+        // Write TOKEN (huffman code + window position)
         uint8_t huffman_index = match_size - compressor->min_pattern_size;
-        write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]);
-        write_to_bit_buffer(compressor, match_index, compressor->conf_window);
+        write_to_bit_buffer(compressor, (huffman_codes[huffman_index] << compressor->conf_window) | match_index,
+                            huffman_bits[huffman_index] + compressor->conf_window);
     }
     // Populate Window
     for (uint8_t i = 0; i < match_size; i++) {

From d8828bc3e4b63fae6bc1c4ae790e62df43db90c6 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 09:40:39 -0500
Subject: [PATCH 063/109] replace hard-coded values with their math formulas

---
 tamp/_c_src/tamp/compressor.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 685511ad..d0ff261a 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -30,11 +30,8 @@ static const uint8_t huffman_codes[] = {0x0, 0x3, 0x8, 0xb, 0x14, 0x24, 0x26, 0x
 static const uint8_t huffman_bits[] = {0x2, 0x3, 0x5, 0x5, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0x7};
 
 #if TAMP_EXTENDED_COMPRESS
-// Extended: Maximum RLE count = (13 << 4) + 15 + 2 = 225
-#define RLE_MAX_COUNT 225
-// Extended: Maximum extended match extra = (13 << 3) + 7 + 1 = 112
-// Total max match = min_pattern_size + 11 + 112 = min_pattern_size + 123
-#define EXTENDED_MATCH_MAX_EXTRA 112
+#define RLE_MAX_COUNT ((13 << 4) + 15 + 2)            // 225
+#define EXTENDED_MATCH_MAX_EXTRA ((13 << 3) + 7 + 1)  // 112
 
 // Minimum output buffer space required for extended match token.
 // Extended match: symbol (7 bits) + extended huffman (11 bits) + window pos (15 bits) = 33 bits.

From 578fcea7e92d9939a586c9fc70cdfb7dcac24bdd Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 09:43:54 -0500
Subject: [PATCH 064/109] build header from conf rather than compressor values.
 Slightly more efficient

---
 README.md                     |  8 ++++----
 tamp/_c_src/tamp/compressor.c | 11 ++++++-----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index ac4306b5..65485a9b 100644
--- a/README.md
+++ b/README.md
@@ -395,10 +395,10 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 | -------------------------------- | ---------- | ------------ | ------------------------- |
 | Tamp (MicroPython Viper)         | 4676       | 4372         | 7917                      |
 | Tamp (MicroPython Native)        | 3896       | 3559         | 6616                      |
-| Tamp (C, no extended, no stream) | 1764       | 1584         | 3228                      |
-| Tamp (C, no extended)            | 2168       | 2036         | 4084                      |
-| Tamp (C, extended, no stream)    | 2864       | 2436         | 5180                      |
-| Tamp (C, extended)               | 3268       | 2888         | 6036                      |
+| Tamp (C, no extended, no stream) | 1756       | 1584         | 3220                      |
+| Tamp (C, no extended)            | 2160       | 2036         | 4076                      |
+| Tamp (C, extended, no stream)    | 2856       | 2436         | 5172                      |
+| Tamp (C, extended)               | 3260       | 2888         | 6028                      |
 | Heatshrink (C)                   | 2956       | 3876         | 6832                      |
 | uzlib (C)                        | 2355       | 3963         | 6318                      |
 
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index d0ff261a..686bc019 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -193,6 +193,11 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     for (uint8_t i = 0; i < sizeof(TampCompressor); i++)  // Zero-out the struct
         ((unsigned char *)compressor)[i] = 0;
 
+    // Build header directly from conf (8 bits total)
+    // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1]
+    uint8_t header = ((conf->window - 8) << 5) | ((conf->literal - 5) << 3) | (conf->use_custom_dictionary << 2) |
+                     (conf->extended << 1);
+
     compressor->conf_literal = conf->literal;
     compressor->conf_window = conf->window;
     compressor->conf_use_custom_dictionary = conf->use_custom_dictionary;
@@ -205,12 +210,8 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     compressor->cached_match_index = -1;  // Initialize cache as invalid
 #endif
 
-    if (!compressor->conf_use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window));
+    if (!conf->use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window));
 
-    // Write header to bit buffer (8 bits total)
-    // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1]
-    uint8_t header = ((compressor->conf_window - 8) << 5) | ((compressor->conf_literal - 5) << 3) |
-                     (compressor->conf_use_custom_dictionary << 2) | (compressor->conf_extended << 1);
     write_to_bit_buffer(compressor, header, 8);
 
     return TAMP_OK;

From b6cdf4f00b968ab6e915b2bfc8c8f922f6079d39 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 10:34:19 -0500
Subject: [PATCH 065/109] shrink binary more by embedding conf into
 TampCompressor instead of repacking.

---
 README.md                        |  8 ++---
 espidf/tamp/compressor_esp32.cpp |  2 +-
 tamp/_c_src/tamp/compressor.c    | 46 +++++++++++++--------------
 tamp/_c_src/tamp/compressor.h    | 54 ++++++++++----------------------
 4 files changed, 43 insertions(+), 67 deletions(-)

diff --git a/README.md b/README.md
index 65485a9b..f7c21060 100644
--- a/README.md
+++ b/README.md
@@ -395,10 +395,10 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 | -------------------------------- | ---------- | ------------ | ------------------------- |
 | Tamp (MicroPython Viper)         | 4676       | 4372         | 7917                      |
 | Tamp (MicroPython Native)        | 3896       | 3559         | 6616                      |
-| Tamp (C, no extended, no stream) | 1756       | 1584         | 3220                      |
-| Tamp (C, no extended)            | 2160       | 2036         | 4076                      |
-| Tamp (C, extended, no stream)    | 2856       | 2436         | 5172                      |
-| Tamp (C, extended)               | 3260       | 2888         | 6028                      |
+| Tamp (C, no extended, no stream) | 1648       | 1584         | 3112                      |
+| Tamp (C, no extended)            | 2052       | 2036         | 3968                      |
+| Tamp (C, extended, no stream)    | 2796       | 2436         | 5112                      |
+| Tamp (C, extended)               | 3200       | 2888         | 5968                      |
 | Heatshrink (C)                   | 2956       | 3876         | 6832                      |
 | uzlib (C)                        | 2355       | 3963         | 6318                      |
 
diff --git a/espidf/tamp/compressor_esp32.cpp b/espidf/tamp/compressor_esp32.cpp
index afa0c9be..a17570cd 100644
--- a/espidf/tamp/compressor_esp32.cpp
+++ b/espidf/tamp/compressor_esp32.cpp
@@ -27,7 +27,7 @@ typedef uint32_t u16;
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 
 #define MAX_PATTERN_SIZE (compressor->min_pattern_size + 13)
-#define WINDOW_SIZE (1 << compressor->conf_window)
+#define WINDOW_SIZE (1 << compressor->conf.window)
 
 static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, u8 n_bits) {
     compressor->bit_buffer_pos += n_bits;
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 686bc019..991267b7 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -12,15 +12,15 @@
 #if TAMP_EXTENDED_COMPRESS
 // Extended max pattern: min_pattern_size + 11 + 112 = min_pattern_size + 123
 #define MAX_PATTERN_SIZE_EXTENDED (compressor->min_pattern_size + 123)
-#define MAX_PATTERN_SIZE (compressor->conf_extended ? MAX_PATTERN_SIZE_EXTENDED : (compressor->min_pattern_size + 13))
+#define MAX_PATTERN_SIZE (compressor->conf.extended ? MAX_PATTERN_SIZE_EXTENDED : (compressor->min_pattern_size + 13))
 #else
 #define MAX_PATTERN_SIZE (compressor->min_pattern_size + 13)
 #endif
-#define WINDOW_SIZE (1 << compressor->conf_window)
+#define WINDOW_SIZE (1 << compressor->conf.window)
 // 0xF because sizeof(TampCompressor.input) == 16;
 #define input_add(offset) ((compressor->input_pos + offset) & 0xF)
 #define read_input(offset) (compressor->input[input_add(offset)])
-#define IS_LITERAL_FLAG (1 << compressor->conf_literal)
+#define IS_LITERAL_FLAG (1 << compressor->conf.literal)
 
 #define FLUSH_CODE (0xAB)
 
@@ -198,15 +198,11 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
     uint8_t header = ((conf->window - 8) << 5) | ((conf->literal - 5) << 3) | (conf->use_custom_dictionary << 2) |
                      (conf->extended << 1);
 
-    compressor->conf_literal = conf->literal;
-    compressor->conf_window = conf->window;
-    compressor->conf_use_custom_dictionary = conf->use_custom_dictionary;
-    compressor->conf_extended = conf->extended;
+    compressor->conf = *conf;  // Single struct copy
     compressor->window = window;
     compressor->min_pattern_size = tamp_compute_min_pattern_size(conf->window, conf->literal);
 
 #if TAMP_LAZY_MATCHING
-    compressor->conf_lazy_matching = conf->lazy_matching;
     compressor->cached_match_index = -1;  // Initialize cache as invalid
 #endif
 
@@ -222,7 +218,7 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
  * @brief Get the last byte written to the window.
  */
 static inline uint8_t get_last_window_byte(TampCompressor *compressor) {
-    uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf_window) - 1);
+    uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf.window) - 1);
     return compressor->window[prev_pos];
 }
 
@@ -233,7 +229,7 @@ static inline uint8_t get_last_window_byte(TampCompressor *compressor) {
  * @param[in] count Number of repeated bytes (must be >= 2).
  */
 static TAMP_NOINLINE void write_rle_token(TampCompressor *compressor, uint8_t count) {
-    const uint16_t window_mask = (1 << compressor->conf_window) - 1;
+    const uint16_t window_mask = (1 << compressor->conf.window) - 1;
     uint8_t symbol = get_last_window_byte(compressor);
 
     // Write RLE symbol (12) with literal flag
@@ -265,7 +261,7 @@ static TAMP_NOINLINE void write_rle_token(TampCompressor *compressor, uint8_t co
  */
 static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compressor, unsigned char *output,
                                                          size_t output_size, size_t *output_written_size) {
-    const uint16_t window_mask = (1 << compressor->conf_window) - 1;
+    const uint16_t window_mask = (1 << compressor->conf.window) - 1;
     const uint8_t count = compressor->extended_match_count;
     const uint16_t position = compressor->extended_match_position;
     tamp_res res;
@@ -287,7 +283,7 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compres
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
     // Write window position - with ≤7 bits remaining, up to 22 bits total - fits
-    write_to_bit_buffer(compressor, position, compressor->conf_window);
+    write_to_bit_buffer(compressor, position, compressor->conf.window);
 
     // Final flush
     res = partial_flush(compressor, output, output_size, &flush_bytes);
@@ -314,7 +310,7 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compres
 TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned char *output, size_t output_size,
                                             size_t *output_written_size) {
     tamp_res res;
-    const uint16_t window_mask = (1 << compressor->conf_window) - 1;
+    const uint16_t window_mask = (1 << compressor->conf.window) - 1;
     size_t output_written_size_proxy;
 
     if (!output_written_size) output_written_size = &output_written_size_proxy;
@@ -339,7 +335,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 
 #if TAMP_EXTENDED_COMPRESS
     // Extended: Handle extended match continuation
-    if (TAMP_UNLIKELY(compressor->conf_extended && compressor->extended_match_count)) {
+    if (TAMP_UNLIKELY(compressor->conf.extended && compressor->extended_match_count)) {
         // We're in extended match mode - try to extend the match at the current position
         const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
         const unsigned char *window = compressor->window;
@@ -394,7 +390,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 
     // Extended: Handle RLE accumulation with persistent state
     // For simplicity in C, we commit RLE immediately when the run ends
-    if (TAMP_UNLIKELY(compressor->conf_extended)) {
+    if (TAMP_UNLIKELY(compressor->conf.extended)) {
         uint8_t last_byte = get_last_window_byte(compressor);
 
         // Count and CONSUME matching bytes
@@ -429,7 +425,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 #endif  // TAMP_EXTENDED_COMPRESS
 
 #if TAMP_LAZY_MATCHING
-    if (compressor->conf_lazy_matching) {
+    if (compressor->conf.lazy_matching) {
         // Check if we have a cached match from lazy matching
         if (TAMP_UNLIKELY(compressor->cached_match_index >= 0)) {
             match_index = compressor->cached_match_index;
@@ -481,14 +477,14 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
         // Write LITERAL
         match_size = 1;
         unsigned char c = read_input(0);
-        if (TAMP_UNLIKELY(c >> compressor->conf_literal)) {
+        if (TAMP_UNLIKELY(c >> compressor->conf.literal)) {
             return TAMP_EXCESS_BITS;
         }
-        write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1);
+        write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf.literal + 1);
     } else {
 #if TAMP_EXTENDED_COMPRESS
         // Extended: Check for extended match
-        if (compressor->conf_extended && match_size > compressor->min_pattern_size + 11) {
+        if (compressor->conf.extended && match_size > compressor->min_pattern_size + 11) {
             compressor->extended_match_count = match_size;
             compressor->extended_match_position = match_index;
             // Consume matched bytes from input
@@ -499,8 +495,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 #endif  // TAMP_EXTENDED_COMPRESS
         // Write TOKEN (huffman code + window position)
         uint8_t huffman_index = match_size - compressor->min_pattern_size;
-        write_to_bit_buffer(compressor, (huffman_codes[huffman_index] << compressor->conf_window) | match_index,
-                            huffman_bits[huffman_index] + compressor->conf_window);
+        write_to_bit_buffer(compressor, (huffman_codes[huffman_index] << compressor->conf.window) | match_index,
+                            huffman_bits[huffman_index] + compressor->conf.window);
     }
     // Populate Window
     for (uint8_t i = 0; i < match_size; i++) {
@@ -590,7 +586,7 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
 
 #if TAMP_EXTENDED_COMPRESS
     // Extended: Flush any pending RLE
-    if (compressor->conf_extended && compressor->rle_count >= 1) {
+    if (compressor->conf.extended && compressor->rle_count >= 1) {
         // Partial flush first to make room
         res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
         (*output_written_size) += chunk_output_written_size;
@@ -601,10 +597,10 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
         if (compressor->rle_count == 1) {
             // Single byte - write as literal (can't use RLE token for count < 2)
             uint8_t literal = get_last_window_byte(compressor);
-            write_to_bit_buffer(compressor, IS_LITERAL_FLAG | literal, compressor->conf_literal + 1);
+            write_to_bit_buffer(compressor, IS_LITERAL_FLAG | literal, compressor->conf.literal + 1);
 
             // Write to window
-            const uint16_t window_mask = (1 << compressor->conf_window) - 1;
+            const uint16_t window_mask = (1 << compressor->conf.window) - 1;
             compressor->window[compressor->window_pos] = literal;
             compressor->window_pos = (compressor->window_pos + 1) & window_mask;
         } else {
@@ -622,7 +618,7 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
     }
 
     // Extended: Flush any pending extended match
-    if (compressor->conf_extended && compressor->extended_match_count) {
+    if (compressor->conf.extended && compressor->extended_match_count) {
         // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
         if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
         res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
diff --git a/tamp/_c_src/tamp/compressor.h b/tamp/_c_src/tamp/compressor.h
index ff2f1663..577eac58 100644
--- a/tamp/_c_src/tamp/compressor.h
+++ b/tamp/_c_src/tamp/compressor.h
@@ -22,26 +22,9 @@ typedef struct TampCompressor {
     unsigned char input[16];  // Input ring buffer
 
     /* WARM: read frequently, often cached in locals */
-    uint8_t min_pattern_size;            // Minimum pattern size (2 bits used; 2 or 3)
-    uint8_t conf_window;                 // Window bits (4 bits used; 8-15)
-    uint8_t conf_literal;                // Literal bits (4 bits used; 5-8)
-    uint8_t conf_use_custom_dictionary;  // Custom dictionary (1 bit used; init only)
-    uint8_t conf_extended;               // Extended format enabled (1 bit used)
-#if TAMP_LAZY_MATCHING
-    uint8_t conf_lazy_matching;  // Lazy matching enabled (1 bit used)
-    int16_t cached_match_index;  // Lazy matching cache
-    uint8_t cached_match_size;
-#endif
-
-#if TAMP_EXTENDED_COMPRESS
-    /* Extended state (only needed when extended compression is enabled) */
-    uint8_t rle_count;                 // Current RLE run length (max 225)
-    uint8_t extended_match_count;      // Current extended match size (max ~126)
-    uint16_t extended_match_position;  // Window position for extended match
-#endif
-
-#else  // Use bitfields for reduced memory-usage
-
+    uint8_t min_pattern_size;  // Minimum pattern size (2 bits used; 2 or 3)
+    TampConf conf;
+#else   // Use bitfields for reduced memory-usage
     /* HOT: accessed every iteration of the compression loop */
     unsigned char *window;    // Pointer to window buffer
     uint32_t bit_buffer;      // Bit buffer for output (32 bits)
@@ -51,28 +34,25 @@ typedef struct TampCompressor {
     uint8_t input_pos;        // Current position in input buffer (4 bits used; 0-15)
     unsigned char input[16];  // Input ring buffer
 
-    /* WARM: read frequently, often cached in locals.
-     * Bitfields: min_pattern_size(2) + conf_window(4) + conf_literal(4) +
-     *            conf_use_custom_dictionary(1) + conf_extended(1) + conf_lazy_matching(1) = 13 bits
-     */
-    uint8_t min_pattern_size : 2;            // Minimum pattern size (2 or 3)
-    uint8_t conf_window : 4;                 // Window bits (8-15)
-    uint8_t conf_literal : 4;                // Literal bits (5-8)
-    uint8_t conf_use_custom_dictionary : 1;  // Custom dictionary (init only)
-    uint8_t conf_extended : 1;               // Extended format enabled
+    /* WARM: read frequently, often cached in locals */
+    uint8_t min_pattern_size;  // Minimum pattern size (2 or 3)
+    TampConf conf;
+#endif  // TAMP_ESP32
+
+    /* Fields interleaved to avoid internal padding when both LAZY_MATCHING and EXTENDED_COMPRESS enabled */
 #if TAMP_LAZY_MATCHING
-    uint8_t conf_lazy_matching : 1;  // Lazy matching enabled
-    int16_t cached_match_index;      // Lazy matching cache
-    uint8_t cached_match_size;
+    int16_t cached_match_index;  // Lazy matching cache
 #endif
-
 #if TAMP_EXTENDED_COMPRESS
-    uint8_t rle_count;                 // Current RLE run length (max 225)
-    uint8_t extended_match_count;      // Current extended match size (max ~126)
     uint16_t extended_match_position;  // Window position for extended match
 #endif
-
-#endif  // TAMP_ESP32
+#if TAMP_LAZY_MATCHING
+    uint8_t cached_match_size;
+#endif
+#if TAMP_EXTENDED_COMPRESS
+    uint8_t rle_count;             // Current RLE run length (max 225)
+    uint8_t extended_match_count;  // Current extended match size (max ~126)
+#endif
 } TampCompressor;
 
 /**

From bbbdfe4b9d325f4720e811282e077b7c899293e4 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 10:47:30 -0500
Subject: [PATCH 066/109] Remove micropython viper references from the README

---
 README.md | 49 +++----------------------------------------------
 1 file changed, 3 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index f7c21060..78268418 100644
--- a/README.md
+++ b/README.md
@@ -34,11 +34,8 @@ of RAM and firmware storage.
     - `pip install tamp` will use a python-bound C implementation optimized for
       speed.
   - Micropython:
-    - Native Module (suggested micropython implementation).
+    - Native Module.
       - `mpy_bindings/`
-    - Viper.
-      - `tamp/__init__.py`, `tamp/compressor_viper.py`,
-        `tamp/decompressor_viper.py`
   - C library:
     - `tamp/_c_src/`
   - Javascript/Typescript via Emscripten WASM.
@@ -56,14 +53,12 @@ of RAM and firmware storage.
 
 # Installation
 
-Tamp contains 4 implementations:
+Tamp contains 3 implementations:
 
 1. A reference desktop CPython implementation that is optimized for readability
    (and **not** speed).
 2. A Micropython Native Module implementation (fast).
-3. A Micropython Viper implementation (not recommended, please use Native
-   Module).
-4. A C implementation (with python bindings) for accelerated desktop use and to
+3. A C implementation (with python bindings) for accelerated desktop use and to
    be used in C projects (very fast).
 
 This section instructs how to install each implementation.
@@ -98,42 +93,6 @@ following to `pyproject.toml`.
 tamp = "https://github.com/BrianPugh/tamp/releases/download/v1.7.0/tamp-1.7.0-mpy1.23-armv6m.mpy"
 ```
 
-### MicroPython Viper
-
-**NOT RECOMMENDED, PLEASE USE NATIVE MODULE**
-
-For micropython use, there are 3 main files:
-
-1. `tamp/__init__.py` - Always required.
-2. `tamp/decompressor_viper.py` - Required for on-device decompression.
-3. `tamp/compressor_viper.py` - Required for on-device compression.
-
-For example, if on-device decompression isn't used, then do not include
-`decompressor_viper.py`. If manually installing, just copy these files to your
-microcontroller's `/lib/tamp` folder.
-
-If using
-[mip](https://docs.micropython.org/en/latest/reference/packages.html#installing-packages-with-mip),
-tamp can be installed by specifying the appropriate `package-*.json` file.
-
-```bash
-mip install github:brianpugh/tamp  # Defaults to package.json: Compressor & Decompressor
-mip install github:brianpugh/tamp/package-compressor.json  # Compressor only
-mip install github:brianpugh/tamp/package-decompressor.json  # Decompressor only
-```
-
-If using [Belay](https://github.com/BrianPugh/belay), tamp can be installed by
-adding the following to `pyproject.toml`.
-
-```toml
-[tool.belay.dependencies]
-tamp = [
-   "https://github.com/BrianPugh/tamp/blob/main/tamp/__init__.py",
-   "https://github.com/BrianPugh/tamp/blob/main/tamp/compressor_viper.py",
-   "https://github.com/BrianPugh/tamp/blob/main/tamp/decompressor_viper.py",
-]
-```
-
 ## C
 
 Copy the `tamp/_c_src/tamp` folder into your project. For more information, see
@@ -376,7 +335,6 @@ speed Tamp can achieve. In all tests, a 1KB window (10 bit) was used.
 
 |                                  | Compression (bytes/s) | Decompression (bytes/s) |
 | -------------------------------- | --------------------- | ----------------------- |
-| Tamp (MicroPython Viper)         | 4,300                 | 42,000                  |
 | Tamp (Micropython Native Module) | 31,949                | 1,086,957               |
 | Tamp (C)                         | 36,127                | 1,400,600               |
 | Deflate (micropython builtin)    | 6,885                 | 294,985                 |
@@ -393,7 +351,6 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 
 |                                  | Compressor | Decompressor | Compressor + Decompressor |
 | -------------------------------- | ---------- | ------------ | ------------------------- |
-| Tamp (MicroPython Viper)         | 4676       | 4372         | 7917                      |
 | Tamp (MicroPython Native)        | 3896       | 3559         | 6616                      |
 | Tamp (C, no extended, no stream) | 1648       | 1584         | 3112                      |
 | Tamp (C, no extended)            | 2052       | 2036         | 3968                      |

From 484131a6bb0fc8343f46192c709b342b32302294 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 10:57:21 -0500
Subject: [PATCH 067/109] don't include the stream API in micropython native
 module.

---
 Makefile  | 2 +-
 README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index b9833532..9fd03d22 100644
--- a/Makefile
+++ b/Makefile
@@ -75,7 +75,7 @@ MOD = tamp
 # Override -Os with -O2 for better performance (last flag wins)
 CFLAGS_EXTRA = -O2
 
-CFLAGS += -Itamp/_c_src -DTAMP_COMPRESSOR=$(TAMP_COMPRESSOR) -DTAMP_DECOMPRESSOR=$(TAMP_DECOMPRESSOR)
+CFLAGS += -Itamp/_c_src -DTAMP_COMPRESSOR=$(TAMP_COMPRESSOR) -DTAMP_DECOMPRESSOR=$(TAMP_DECOMPRESSOR) -DTAMP_STREAM=0
 # Compiler-specific flags based on target architecture
 ifeq ($(filter $(ARCH),x86 x64),)
 # Cross-compiling for embedded (ARM, xtensa) - use GCC flags
diff --git a/README.md b/README.md
index 78268418..f191aa95 100644
--- a/README.md
+++ b/README.md
@@ -351,7 +351,7 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 
 |                                  | Compressor | Decompressor | Compressor + Decompressor |
 | -------------------------------- | ---------- | ------------ | ------------------------- |
-| Tamp (MicroPython Native)        | 3896       | 3559         | 6616                      |
+| Tamp (MicroPython Native)        | 4736       | 4339         | 8236                      |
 | Tamp (C, no extended, no stream) | 1648       | 1584         | 3112                      |
 | Tamp (C, no extended)            | 2052       | 2036         | 3968                      |
 | Tamp (C, extended, no stream)    | 2796       | 2436         | 5112                      |

From 6d3d26f4d59e6cb6ac2f9c2defc4f853f9c8601f Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 11:07:05 -0500
Subject: [PATCH 068/109] Add likely/unlikely hints to micropython bindings

---
 mpy_bindings/bindings.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mpy_bindings/bindings.c b/mpy_bindings/bindings.c
index 125f31c5..0cdf9d0a 100644
--- a/mpy_bindings/bindings.c
+++ b/mpy_bindings/bindings.c
@@ -14,9 +14,9 @@ _Static_assert(CHUNK_SIZE >= 32, "CHUNK_SIZE must be >= 32 to hold flush output"
 #define mp_type_bytearray (*(mp_obj_type_t *)(mp_load_global(MP_QSTR_bytearray)))
 
 static void TAMP_CHECK(tamp_res res) {
-    if (res == TAMP_EXCESS_BITS) {
+    if (TAMP_UNLIKELY(res == TAMP_EXCESS_BITS)) {
         nlr_raise(mp_obj_new_exception(mp_load_global(MP_QSTR_ExcessBitsError)));
-    } else if (res < TAMP_OK) {
+    } else if (TAMP_UNLIKELY(res < TAMP_OK)) {
         mp_raise_ValueError("");
     }
 }
@@ -77,7 +77,7 @@ static mp_obj_t compressor_make_new(const mp_obj_type_t *type, size_t n_args, si
 
     mp_buffer_info_t dictionary_buffer_info;
     mp_get_buffer_raise(o->dictionary, &dictionary_buffer_info, MP_BUFFER_RW);
-    if (dictionary_buffer_info.len < (1 << conf.window)) {
+    if (TAMP_UNLIKELY(dictionary_buffer_info.len < (1 << conf.window))) {
         mp_raise_ValueError("");
     }
 
@@ -177,7 +177,7 @@ static mp_obj_t decompressor_make_new(const mp_obj_type_t *type, size_t n_args,
 
     const uint16_t window_size = 1 << conf.window;
     if (o->dictionary == mp_const_none) {
-        if (conf.use_custom_dictionary) {
+        if (TAMP_UNLIKELY(conf.use_custom_dictionary)) {
             mp_raise_ValueError("");
         }
         o->dictionary = mp_obj_new_bytearray_by_ref(window_size, m_malloc(window_size));
@@ -186,7 +186,7 @@ static mp_obj_t decompressor_make_new(const mp_obj_type_t *type, size_t n_args,
     {
         mp_buffer_info_t dictionary_buffer_info;
         mp_get_buffer_raise(o->dictionary, &dictionary_buffer_info, MP_BUFFER_RW);
-        if (dictionary_buffer_info.len < window_size) {
+        if (TAMP_UNLIKELY(dictionary_buffer_info.len < window_size)) {
             mp_raise_ValueError("");
         }
 

From a5d7691a2af82849d2425504041670204b4b604b Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 13:04:58 -0500
Subject: [PATCH 069/109] don't inline write_to_bit_buffer to save around 60
 bytes.

---
 README.md                     | 10 +++++-----
 tamp/_c_src/tamp/compressor.c |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index f191aa95..ef66b64f 100644
--- a/README.md
+++ b/README.md
@@ -351,11 +351,11 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 
 |                                  | Compressor | Decompressor | Compressor + Decompressor |
 | -------------------------------- | ---------- | ------------ | ------------------------- |
-| Tamp (MicroPython Native)        | 4736       | 4339         | 8236                      |
-| Tamp (C, no extended, no stream) | 1648       | 1584         | 3112                      |
-| Tamp (C, no extended)            | 2052       | 2036         | 3968                      |
-| Tamp (C, extended, no stream)    | 2796       | 2436         | 5112                      |
-| Tamp (C, extended)               | 3200       | 2888         | 5968                      |
+| Tamp (MicroPython Native)        | 4708       | 4339         | 8208                      |
+| Tamp (C, no extended, no stream) | 1596       | 1584         | 3060                      |
+| Tamp (C, no extended)            | 2000       | 2036         | 3916                      |
+| Tamp (C, extended, no stream)    | 2736       | 2436         | 5052                      |
+| Tamp (C, extended)               | 3140       | 2888         | 5908                      |
 | Heatshrink (C)                   | 2956       | 3876         | 6832                      |
 | uzlib (C)                        | 2355       | 3963         | 6318                      |
 
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 991267b7..2f25aeba 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -40,7 +40,7 @@ static const uint8_t huffman_bits[] = {0x2, 0x3, 0x5, 0x5, 0x6, 0x7, 0x7, 0x7, 0
 #define EXTENDED_MATCH_MIN_OUTPUT_BYTES 6
 #endif
 
-static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, uint8_t n_bits) {
+static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, uint8_t n_bits) {
     compressor->bit_buffer_pos += n_bits;
     compressor->bit_buffer |= bits << (32 - compressor->bit_buffer_pos);
 }

From d6325dd1980b09de45223665d5b0b00946c4bcaf Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 13:24:29 -0500
Subject: [PATCH 070/109] cache LFS files.

---
 .github/workflows/tests.yaml | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 47a77b95..35c8ca5a 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -110,7 +110,18 @@ jobs:
         uses: actions/checkout@v4
         with:
           submodules: recursive
-          lfs: true
+          lfs: false
+
+      - name: Cache LFS files
+        uses: actions/cache@v4
+        id: lfs-cache
+        with:
+          path: .git/lfs
+          key: lfs-${{ hashFiles('datasets/v1-compressed/**', 'datasets/extended-compressed/**') }}
+          restore-keys: lfs-
+
+      - name: Pull LFS files
+        run: git lfs pull
 
       - name: Set up python 3.13 (for Poetry)
         id: setup-python-system

From 47184212d2c2c9235dddcccd9fa3ff2dacbe5c29 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 13:32:31 -0500
Subject: [PATCH 071/109] immediately update lfs cache

---
 .github/workflows/tests.yaml | 49 ++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 35c8ca5a..f7d50656 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -94,8 +94,48 @@ jobs:
           source .venv/bin/activate
           SKIP=wasm-eslint,wasm-npm-test,wasm-file-validation,typescript-check,package-json-lint pre-commit run --show-diff-on-failure --color=always --all-files
 
+  cache-lfs:
+    name: 'Cache LFS files'
+    runs-on: ubuntu-latest
+    outputs:
+      cache-key: ${{ steps.lfs-key.outputs.key }}
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          lfs: false
+
+      - name: Compute LFS cache key
+        id: lfs-key
+        run: |
+          # Hash pointer files before they get replaced by git lfs pull
+          # Use find for reliable recursive globbing, sort for deterministic order
+          hash=$(find datasets/v1-compressed datasets/extended-compressed -name '*.tamp' -type f | sort | xargs cat | sha256sum | cut -d' ' -f1)
+          echo "key=lfs-${hash}" >> $GITHUB_OUTPUT
+          echo "Cache key: lfs-${hash}"
+
+      - name: Restore LFS cache
+        uses: actions/cache/restore@v4
+        id: lfs-cache
+        with:
+          path: .git/lfs
+          key: ${{ steps.lfs-key.outputs.key }}
+
+      - name: Pull LFS files
+        if: steps.lfs-cache.outputs.cache-hit != 'true'
+        run: git lfs pull
+
+      - name: Save LFS cache
+        if: steps.lfs-cache.outputs.cache-hit != 'true'
+        uses: actions/cache/save@v4
+        with:
+          path: .git/lfs
+          key: ${{ steps.lfs-key.outputs.key }}
+
   test:
     name: 'Test Python ${{ matrix.python-version }}'
+    needs: cache-lfs
     timeout-minutes: 15
     runs-on: ubuntu-latest
     strategy:
@@ -112,13 +152,12 @@ jobs:
           submodules: recursive
           lfs: false
 
-      - name: Cache LFS files
-        uses: actions/cache@v4
-        id: lfs-cache
+      - name: Restore LFS cache
+        uses: actions/cache/restore@v4
         with:
           path: .git/lfs
-          key: lfs-${{ hashFiles('datasets/v1-compressed/**', 'datasets/extended-compressed/**') }}
-          restore-keys: lfs-
+          key: ${{ needs.cache-lfs.outputs.cache-key }}
+          fail-on-cache-miss: true
 
       - name: Pull LFS files
         run: git lfs pull

From ae9e8ffe1f3f3ca07b41c75cf58d2497bedde12b Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 14:09:35 -0500
Subject: [PATCH 072/109] stop testing the viper implementation.

---
 tests/test_compressor.py              |  6 ------
 tests/test_compressor_decompressor.py | 11 +++--------
 tests/test_decompressor.py            | 11 +++++++----
 tests/test_pseudorandom.py            |  6 ------
 4 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/tests/test_compressor.py b/tests/test_compressor.py
index eeeff9db..309dcd78 100644
--- a/tests/test_compressor.py
+++ b/tests/test_compressor.py
@@ -36,12 +36,6 @@
 NativeExcessBitsError = ExcessBitsError
 
 if micropython:
-    from tamp.compressor_viper import Compressor as ViperCompressor
-    from tamp.compressor_viper import compress as viper_compress
-
-    Compressors.append(ViperCompressor)
-    compresses.append(viper_compress)
-
     try:
         from tamp_native import Compressor as NativeCompressor
         from tamp_native import ExcessBitsError as NativeExcessBitsError
diff --git a/tests/test_compressor_decompressor.py b/tests/test_compressor_decompressor.py
index 22b964ad..aa0a8e1e 100644
--- a/tests/test_compressor_decompressor.py
+++ b/tests/test_compressor_decompressor.py
@@ -19,21 +19,16 @@
         CCompressor = None
         CDecompressor = None
 
-    ViperCompressor = None
-    ViperDecompressor = None
     NativeCompressor = None
     NativeDecompressor = None
 else:
-    # MicroPython: only test Viper and Native implementations
+    # MicroPython: only test Native implementation
     # Pure Python and Cython implementations use CPython-specific features
     PyCompressor = None
     PyDecompressor = None
     CCompressor = None
     CDecompressor = None
 
-    from tamp.compressor_viper import Compressor as ViperCompressor
-    from tamp.decompressor_viper import Decompressor as ViperDecompressor
-
     try:
         from tamp_native import Compressor as NativeCompressor
         from tamp_native import Decompressor as NativeDecompressor
@@ -43,8 +38,8 @@
         NativeDecompressor = None
 
 
-Compressors = (PyCompressor, CCompressor, ViperCompressor, NativeCompressor)
-Decompressors = (PyDecompressor, CDecompressor, ViperDecompressor, NativeDecompressor)
+Compressors = (PyCompressor, CCompressor, NativeCompressor)
+Decompressors = (PyDecompressor, CDecompressor, NativeDecompressor)
 
 
 def walk_compressors_decompressors():
diff --git a/tests/test_decompressor.py b/tests/test_decompressor.py
index 26e9d2fd..df4cc0b0 100644
--- a/tests/test_decompressor.py
+++ b/tests/test_decompressor.py
@@ -26,11 +26,14 @@
         pass
 
 else:
-    from tamp.decompressor_viper import Decompressor as ViperDecompressor
-    from tamp.decompressor_viper import decompress as viper_decompress
+    try:
+        from tamp_native import Decompressor as NativeDecompressor
+        from tamp_native import decompress as native_decompress
 
-    Decompressors.append(ViperDecompressor)
-    decompresses.append(viper_decompress)
+        Decompressors.append(NativeDecompressor)
+        decompresses.append(native_decompress)
+    except ImportError:
+        pass
 
 
 class TestDecompressor(unittest.TestCase):
diff --git a/tests/test_pseudorandom.py b/tests/test_pseudorandom.py
index f2ca6f54..62d0e756 100644
--- a/tests/test_pseudorandom.py
+++ b/tests/test_pseudorandom.py
@@ -12,12 +12,6 @@
     micropython = None
 
 if micropython:
-    import tamp.compressor_viper
-    import tamp.decompressor_viper
-
-    modules.append(tamp.compressor_viper)
-    modules.append(tamp.decompressor_viper)
-
     try:
         import tamp_native
 

From 4392f2b805f38c271982f3afeff5abbc04269069 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 14:43:14 -0500
Subject: [PATCH 073/109] skip the dataset tests when building wheels

---
 .github/workflows/build_wheels.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml
index 2f9bb482..17e29e85 100644
--- a/.github/workflows/build_wheels.yaml
+++ b/.github/workflows/build_wheels.yaml
@@ -117,7 +117,7 @@ jobs:
           CIBW_ARCHS: ${{ matrix.cibw_archs }}
           CIBW_BUILD: ${{ matrix.cibw_build }}
           CIBW_TEST_REQUIRES: pytest
-          CIBW_TEST_COMMAND: pytest {package}/tests
+          CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset
 
       - uses: actions/upload-artifact@v4
         with:
@@ -164,7 +164,7 @@ jobs:
           CIBW_ARCHS: ${{ matrix.cibw_archs }}
           CIBW_BUILD: ${{ matrix.cibw_build }}
           CIBW_TEST_REQUIRES: pytest
-          CIBW_TEST_COMMAND: pytest {package}/tests
+          CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset
 
       - uses: actions/upload-artifact@v4
         with:
@@ -212,7 +212,7 @@ jobs:
           CIBW_ARCHS: ${{ matrix.cibw_archs }}
           CIBW_BUILD: ${{ matrix.cibw_build }}
           CIBW_TEST_REQUIRES: pytest
-          CIBW_TEST_COMMAND: pytest {package}/tests
+          CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset
 
       - uses: actions/upload-artifact@v4
         with:
@@ -254,7 +254,7 @@ jobs:
           CIBW_ARCHS: ${{ matrix.cibw_archs }}
           CIBW_BUILD: ${{ matrix.cibw_build }}
           CIBW_TEST_REQUIRES: pytest
-          CIBW_TEST_COMMAND: pytest {package}/tests
+          CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset
 
       - uses: actions/upload-artifact@v4
         with:
@@ -302,7 +302,7 @@ jobs:
           CIBW_ARCHS: ${{ matrix.cibw_archs }}
           CIBW_BUILD: ${{ matrix.cibw_build }}
           CIBW_TEST_REQUIRES: pytest
-          CIBW_TEST_COMMAND: pytest {package}/tests
+          CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset
 
       - uses: actions/upload-artifact@v4
         with:
@@ -344,7 +344,7 @@ jobs:
           CIBW_ARCHS: ${{ matrix.cibw_archs }}
           CIBW_BUILD: ${{ matrix.cibw_build }}
           CIBW_TEST_REQUIRES: pytest
-          CIBW_TEST_COMMAND: pytest {package}/tests
+          CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset
 
       - uses: actions/upload-artifact@v4
         with:
@@ -385,7 +385,7 @@ jobs:
           CIBW_ARCHS: ${{ matrix.cibw_archs }}
           CIBW_BUILD: ${{ matrix.cibw_build }}
           CIBW_TEST_REQUIRES: pytest
-          CIBW_TEST_COMMAND: pytest {package}/tests
+          CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset
 
       - uses: actions/upload-artifact@v4
         with:

From 528077044d3f53e7a98d512d9da33b0d97e7acf7 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 14:58:47 -0500
Subject: [PATCH 074/109] update expected enwik8 hash

---
 .github/workflows/tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index f7d50656..79071a9b 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -267,7 +267,7 @@ jobs:
         implementation: [desktop, embedded]
     env:
       POETRY_HOME: '~/poetry'
-      EXPECTED_COMPRESSED_HASH: '02e05af059a0040d641988075cf1dfc479a084f9a34b5c8a348354211c5fa038'
+      EXPECTED_COMPRESSED_HASH: '5996293c04a89014580334da4d69374ee414f1ae5087b813bab69209573b6966'
 
     steps:
       - name: Check out repository

From f8ad5d3b672176685d0897147e5c27971964d085 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 21:13:15 -0500
Subject: [PATCH 075/109] Fix window corruption.

---
 tamp/_c_src/tamp/compressor.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 2f25aeba..3a34a33c 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -291,13 +291,28 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compres
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
     // Write to window (up to end of buffer, no wrap)
+    // Handle overlap: when destination is ahead of source and they overlap,
+    // we must copy in reverse order to avoid reading corrupted data.
     uint16_t remaining = WINDOW_SIZE - compressor->window_pos;
     uint8_t window_write = MIN(count, remaining);
-    for (uint8_t i = 0; i < window_write; i++) {
-        compressor->window[compressor->window_pos] = compressor->window[position + i];
-        compressor->window_pos++;
+
+    // Calculate distance from source to destination in circular buffer
+    const uint16_t src_to_dst = (compressor->window_pos - position) & window_mask;
+
+    if (TAMP_UNLIKELY(src_to_dst < window_write && src_to_dst > 0)) {
+        // Overlap case: copy in reverse order
+        for (uint8_t i = window_write; i-- > 0;) {
+            compressor->window[(compressor->window_pos + i) & window_mask] = compressor->window[position + i];
+        }
+        compressor->window_pos = (compressor->window_pos + window_write) & window_mask;
+    } else {
+        // Normal case: forward copy
+        for (uint8_t i = 0; i < window_write; i++) {
+            compressor->window[compressor->window_pos] = compressor->window[position + i];
+            compressor->window_pos++;
+        }
+        compressor->window_pos &= window_mask;
     }
-    compressor->window_pos &= window_mask;
 
     // Reset extended match state
     compressor->extended_match_count = 0;
@@ -389,7 +404,6 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
     }
 
     // Extended: Handle RLE accumulation with persistent state
-    // For simplicity in C, we commit RLE immediately when the run ends
     if (TAMP_UNLIKELY(compressor->conf.extended)) {
         uint8_t last_byte = get_last_window_byte(compressor);
 
@@ -483,13 +497,14 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
         write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf.literal + 1);
     } else {
 #if TAMP_EXTENDED_COMPRESS
-        // Extended: Check for extended match
+        // Extended: Start extended match continuation
         if (compressor->conf.extended && match_size > compressor->min_pattern_size + 11) {
             compressor->extended_match_count = match_size;
             compressor->extended_match_position = match_index;
             // Consume matched bytes from input
             compressor->input_pos = input_add(match_size);
             compressor->input_size -= match_size;
+            // Return - continuation code at start of poll will try to extend or emit
             return TAMP_OK;
         }
 #endif  // TAMP_EXTENDED_COMPRESS

From ad98ad3e124b9721d7dd611dc309eba6a398c439 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 21:20:09 -0500
Subject: [PATCH 076/109] share window_copy between compressor/decompressor.

---
 tamp/_c_src/tamp/common.c       | 30 +++++++++++++++++++
 tamp/_c_src/tamp/common.h       | 20 +++++++++++++
 tamp/_c_src/tamp/compressor.c   | 21 +-------------
 tamp/_c_src/tamp/decompressor.c | 51 ++-------------------------------
 4 files changed, 53 insertions(+), 69 deletions(-)

diff --git a/tamp/_c_src/tamp/common.c b/tamp/_c_src/tamp/common.c
index f88dd344..b1ef1b9b 100644
--- a/tamp/_c_src/tamp/common.c
+++ b/tamp/_c_src/tamp/common.c
@@ -38,6 +38,36 @@ int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal) {
     return 2 + (window > (10 + ((literal - 5) << 1)));
 }
 
+void tamp_window_copy(unsigned char *window, uint16_t *window_pos, uint16_t window_offset, uint8_t match_size,
+                      uint16_t window_mask) {
+    /* Calculate distance from source to destination in circular buffer.
+     * src_to_dst = (dst - src) & mask gives the forward distance. */
+    const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask;
+
+    /* Critical overlap case: destination is AHEAD of source and they overlap.
+     * When dst > src by less than match_size, a forward copy corrupts data because
+     * we write to positions before reading from them.
+     *
+     * Example: src=100, dst=105, match_size=8
+     *   - Forward copy at i=5 would read window[105], but we already overwrote it at i=0!
+     *   - Must copy in REVERSE order (end to start) to read source bytes before overwriting.
+     */
+    if (TAMP_UNLIKELY(src_to_dst < match_size && src_to_dst > 0)) {
+        /* Copy in reverse order: start from last byte, work backwards to first byte.
+         * This ensures we read all overlapping source bytes before they're overwritten.
+         * Destination wraps via mask; source doesn't need wrapping (pre-validated bounds). */
+        for (uint8_t i = match_size; i-- > 0;) {
+            window[(*window_pos + i) & window_mask] = window[window_offset + i];
+        }
+        *window_pos = (*window_pos + match_size) & window_mask;
+    } else {
+        for (uint8_t i = 0; i < match_size; i++) {
+            window[*window_pos] = window[window_offset + i];
+            *window_pos = (*window_pos + 1) & window_mask;
+        }
+    }
+}
+
 /*******************************************************************************
  * Built-in I/O handler implementations
  ******************************************************************************/
diff --git a/tamp/_c_src/tamp/common.h b/tamp/_c_src/tamp/common.h
index c33a8cb4..706ce0fa 100644
--- a/tamp/_c_src/tamp/common.h
+++ b/tamp/_c_src/tamp/common.h
@@ -324,6 +324,26 @@ void tamp_initialize_dictionary(unsigned char *buffer, size_t size);
  */
 int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal);
 
+/**
+ * @brief Copy pattern from window to window, updating window_pos.
+ *
+ * Handles potential overlap between source and destination regions by
+ * copying backwards when the destination would "catch up" to the source.
+ *
+ * IMPORTANT: Caller must validate that (window_offset + match_size) does not
+ * exceed window bounds before calling this function. This function assumes
+ * window_offset and match_size are pre-validated and does not perform
+ * bounds checking on source reads.
+ *
+ * @param window Circular buffer (size must be power of 2)
+ * @param window_pos Current write position (updated by this function)
+ * @param window_offset Source position to copy from
+ * @param match_size Number of bytes to copy
+ * @param window_mask Bitmask for wrapping (window_size - 1)
+ */
+void tamp_window_copy(unsigned char *window, uint16_t *window_pos, uint16_t window_offset, uint8_t match_size,
+                      uint16_t window_mask);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 3a34a33c..6d98086b 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -291,28 +291,9 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compres
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
     // Write to window (up to end of buffer, no wrap)
-    // Handle overlap: when destination is ahead of source and they overlap,
-    // we must copy in reverse order to avoid reading corrupted data.
     uint16_t remaining = WINDOW_SIZE - compressor->window_pos;
     uint8_t window_write = MIN(count, remaining);
-
-    // Calculate distance from source to destination in circular buffer
-    const uint16_t src_to_dst = (compressor->window_pos - position) & window_mask;
-
-    if (TAMP_UNLIKELY(src_to_dst < window_write && src_to_dst > 0)) {
-        // Overlap case: copy in reverse order
-        for (uint8_t i = window_write; i-- > 0;) {
-            compressor->window[(compressor->window_pos + i) & window_mask] = compressor->window[position + i];
-        }
-        compressor->window_pos = (compressor->window_pos + window_write) & window_mask;
-    } else {
-        // Normal case: forward copy
-        for (uint8_t i = 0; i < window_write; i++) {
-            compressor->window[compressor->window_pos] = compressor->window[position + i];
-            compressor->window_pos++;
-        }
-        compressor->window_pos &= window_mask;
-    }
+    tamp_window_copy(compressor->window, &compressor->window_pos, position, window_write, window_mask);
 
     // Reset extended match state
     compressor->extended_match_count = 0;
diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index a17cb1d3..23ab4ed1 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -33,53 +33,6 @@ static const uint8_t HUFFMAN_TABLE[128] = {
     17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  17,  17,  17,  17, 17, 17,  17,  17,  17,  17, 17,
     17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  17,  17,  17,  17, 17, 17,  17,  17,  17};
 
-/**
- * @brief Copy pattern from window to window, updating window_pos.
- *
- * Handles potential overlap between source and destination regions by
- * copying backwards when the destination would "catch up" to the source.
- *
- * IMPORTANT: Caller must validate that (window_offset + match_size) does not
- * exceed window bounds before calling this function. This function assumes
- * window_offset and match_size are pre-validated and does not perform
- * bounds checking on source reads.
- *
- * @param window Circular buffer (size must be power of 2)
- * @param window_pos Current write position (updated by this function)
- * @param window_offset Source position to copy from
- * @param match_size Number of bytes to copy
- * @param window_mask Bitmask for wrapping (window_size - 1)
- */
-TAMP_NOINLINE static void window_copy(unsigned char* window, uint16_t* window_pos, uint16_t window_offset,
-                                      uint8_t match_size, uint16_t window_mask) {
-    /* Calculate distance from source to destination in circular buffer.
-     * src_to_dst = (dst - src) & mask gives the forward distance. */
-    const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask;
-
-    /* Critical overlap case: destination is AHEAD of source and they overlap.
-     * When dst > src by less than match_size, a forward copy corrupts data because
-     * we write to positions before reading from them.
-     *
-     * Example: src=100, dst=105, match_size=8
-     *   - Forward copy at i=5 would read window[105], but we already overwrote it at i=0!
-     *   - Must copy in REVERSE order (end to start) to read source bytes before overwriting.
-     */
-    if (TAMP_UNLIKELY(src_to_dst < match_size && src_to_dst > 0)) {
-        /* Copy in reverse order: start from last byte, work backwards to first byte.
-         * This ensures we read all overlapping source bytes before they're overwritten.
-         * Destination wraps via mask; source doesn't need wrapping (pre-validated bounds). */
-        for (uint8_t i = match_size; i-- > 0;) {
-            window[(*window_pos + i) & window_mask] = window[window_offset + i];
-        }
-        *window_pos = (*window_pos + match_size) & window_mask;
-    } else {
-        for (uint8_t i = 0; i < match_size; i++) {
-            window[*window_pos] = window[window_offset + i];
-            *window_pos = (*window_pos + 1) & window_mask;
-        }
-    }
-}
-
 /**
  * @brief Decode huffman symbol + optional trailing bits from bit buffer.
  *
@@ -292,7 +245,7 @@ static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** outpu
         uint16_t wp = d->window_pos;
         uint16_t remaining = window_size - wp;
         uint8_t window_write = (match_size < remaining) ? match_size : remaining;
-        window_copy(d->window, &wp, window_offset, window_write, window_size - 1);
+        tamp_window_copy(d->window, &wp, window_offset, window_write, window_size - 1);
         d->window_pos = wp;
     }
 
@@ -543,7 +496,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
 
             if (TAMP_LIKELY(decompressor->skip_bytes == 0)) {
                 uint16_t wp = decompressor->window_pos;
-                window_copy(decompressor->window, &wp, window_offset, match_size, window_mask);
+                tamp_window_copy(decompressor->window, &wp, window_offset, match_size, window_mask);
                 decompressor->window_pos = wp;
             }
         }

From 4bd7b10a83553ba14d9d7a76a8291290229e63ef Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Wed, 4 Feb 2026 22:21:58 -0500
Subject: [PATCH 077/109] cache-on-the-stack unpacked bitfields.

---
 README.md                     | 10 +++++-----
 tamp/_c_src/tamp/compressor.c | 22 ++++++++++++++--------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index ef66b64f..971a760d 100644
--- a/README.md
+++ b/README.md
@@ -351,11 +351,11 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 
 |                                  | Compressor | Decompressor | Compressor + Decompressor |
 | -------------------------------- | ---------- | ------------ | ------------------------- |
-| Tamp (MicroPython Native)        | 4708       | 4339         | 8208                      |
-| Tamp (C, no extended, no stream) | 1596       | 1584         | 3060                      |
-| Tamp (C, no extended)            | 2000       | 2036         | 3916                      |
-| Tamp (C, extended, no stream)    | 2736       | 2436         | 5052                      |
-| Tamp (C, extended)               | 3140       | 2888         | 5908                      |
+| Tamp (MicroPython Native)        | 4708       | 4339         | 8124                      |
+| Tamp (C, no extended, no stream) | 1708       | 1584         | 3088                      |
+| Tamp (C, no extended)            | 2112       | 2036         | 3944                      |
+| Tamp (C, extended, no stream)    | 2768       | 2436         | 5000                      |
+| Tamp (C, extended)               | 3172       | 2888         | 5856                      |
 | Heatshrink (C)                   | 2956       | 3876         | 6832                      |
 | uzlib (C)                        | 2355       | 3963         | 6318                      |
 
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 6d98086b..6d889a8b 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -306,7 +306,13 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compres
 TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned char *output, size_t output_size,
                                             size_t *output_written_size) {
     tamp_res res;
-    const uint16_t window_mask = (1 << compressor->conf.window) - 1;
+    // Cache bitfield values for faster access in hot path
+    const uint8_t conf_window = compressor->conf.window;
+    const uint8_t conf_literal = compressor->conf.literal;
+    const uint16_t window_mask = (1 << conf_window) - 1;
+#if TAMP_EXTENDED_COMPRESS
+    const bool conf_extended = compressor->conf.extended;
+#endif
     size_t output_written_size_proxy;
 
     if (!output_written_size) output_written_size = &output_written_size_proxy;
@@ -331,7 +337,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 
 #if TAMP_EXTENDED_COMPRESS
     // Extended: Handle extended match continuation
-    if (TAMP_UNLIKELY(compressor->conf.extended && compressor->extended_match_count)) {
+    if (TAMP_UNLIKELY(conf_extended && compressor->extended_match_count)) {
         // We're in extended match mode - try to extend the match at the current position
         const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
         const unsigned char *window = compressor->window;
@@ -385,7 +391,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
     }
 
     // Extended: Handle RLE accumulation with persistent state
-    if (TAMP_UNLIKELY(compressor->conf.extended)) {
+    if (TAMP_UNLIKELY(conf_extended)) {
         uint8_t last_byte = get_last_window_byte(compressor);
 
         // Count and CONSUME matching bytes
@@ -472,14 +478,14 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
         // Write LITERAL
         match_size = 1;
         unsigned char c = read_input(0);
-        if (TAMP_UNLIKELY(c >> compressor->conf.literal)) {
+        if (TAMP_UNLIKELY(c >> conf_literal)) {
             return TAMP_EXCESS_BITS;
         }
-        write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf.literal + 1);
+        write_to_bit_buffer(compressor, (1 << conf_literal) | c, conf_literal + 1);
     } else {
 #if TAMP_EXTENDED_COMPRESS
         // Extended: Start extended match continuation
-        if (compressor->conf.extended && match_size > compressor->min_pattern_size + 11) {
+        if (conf_extended && match_size > compressor->min_pattern_size + 11) {
             compressor->extended_match_count = match_size;
             compressor->extended_match_position = match_index;
             // Consume matched bytes from input
@@ -491,8 +497,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 #endif  // TAMP_EXTENDED_COMPRESS
         // Write TOKEN (huffman code + window position)
         uint8_t huffman_index = match_size - compressor->min_pattern_size;
-        write_to_bit_buffer(compressor, (huffman_codes[huffman_index] << compressor->conf.window) | match_index,
-                            huffman_bits[huffman_index] + compressor->conf.window);
+        write_to_bit_buffer(compressor, (huffman_codes[huffman_index] << conf_window) | match_index,
+                            huffman_bits[huffman_index] + conf_window);
     }
     // Populate Window
     for (uint8_t i = 0; i < match_size; i++) {

From c582fb757418f145e490338ef83995fa5c13d08e Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 10:49:00 -0500
Subject: [PATCH 078/109] thorough extended-match search.

---
 tamp/_c_src/tamp/compressor.c | 67 ++++++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 6d889a8b..1cc7cae9 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -156,6 +156,56 @@ static inline void find_best_match(TampCompressor *compressor, uint16_t *match_i
 
 #endif
 
+#if TAMP_EXTENDED_COMPRESS
+/**
+ * @brief Search for extended match continuation using implicit pattern comparison.
+ *
+ * Searches for pattern: window[current_pos:current_pos+current_count] + input[0...]
+ * starting from current_pos. Uses implicit comparison - no buffer allocation.
+ *
+ * @param[in] compressor TampCompressor object
+ * @param[in] current_pos Current match position in window (also search start)
+ * @param[in] current_count Current match length
+ * @param[out] new_pos Position of found longer match
+ * @param[out] new_count Length of found match
+ */
+static inline void find_extended_match(TampCompressor *compressor, uint16_t current_pos, uint8_t current_count,
+                                       uint16_t *new_pos, uint8_t *new_count) {
+    *new_count = 0;
+    const unsigned char *window = compressor->window;
+    const uint16_t window_size = WINDOW_SIZE;
+    const uint8_t max_pattern = MIN(current_count + compressor->input_size, MAX_PATTERN_SIZE);
+
+    // Need at least 2 bytes in target to search
+    if (max_pattern < 2) return;
+
+    // First two bytes of pattern (from window at current_pos)
+    const uint8_t first_byte = window[current_pos];
+    const uint8_t second_byte = window[current_pos + 1];
+
+    for (uint16_t cand = current_pos; cand + max_pattern <= window_size; cand++) {
+        // Quick 2-byte check
+        if (TAMP_LIKELY(window[cand] != first_byte)) continue;
+        if (TAMP_LIKELY(window[cand + 1] != second_byte)) continue;
+
+        // Extend match using implicit comparison
+        uint8_t match_len = 2;
+        for (uint8_t i = 2; i < max_pattern; i++) {
+            // Get target byte: from window if i < current_count, else from input
+            uint8_t target = (i < current_count) ? window[current_pos + i] : read_input(i - current_count);
+            if (window[cand + i] != target) break;
+            match_len = i + 1;
+        }
+
+        if (match_len > *new_count) {
+            *new_count = match_len;
+            *new_pos = cand;
+            if (match_len == max_pattern) return;
+        }
+    }
+}
+#endif  // TAMP_EXTENDED_COMPRESS
+
 #if TAMP_LAZY_MATCHING
 /**
  * @brief Check if writing a single byte will overlap with a future match section.
@@ -376,7 +426,22 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
                 compressor->input_size--;
                 // Continue to next iteration to try extending further
             } else {
-                // Match ended - emit current match
+                // O(1) extension failed - search for longer match from current position
+                uint16_t new_pos;
+                uint8_t new_count;
+                find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count);
+
+                if (new_count > current_count) {
+                    // Found longer match - update and continue
+                    uint8_t extra_bytes = new_count - current_count;
+                    compressor->extended_match_position = new_pos;
+                    compressor->extended_match_count = new_count;
+                    compressor->input_pos = input_add(extra_bytes);
+                    compressor->input_size -= extra_bytes;
+                    continue;
+                }
+
+                // No better match - emit current match
                 // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
                 if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
                 size_t token_bytes;

From f016879d7898492582b8576ccdb5f03b70f3f7e6 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 12:50:40 -0500
Subject: [PATCH 079/109] better RLE vs match tradeoff in C compressor

---
 tamp/_c_src/tamp/compressor.c | 156 ++++++++++++++++++++--------------
 1 file changed, 93 insertions(+), 63 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 1cc7cae9..d9c4cdb9 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -161,12 +161,13 @@ static inline void find_best_match(TampCompressor *compressor, uint16_t *match_i
  * @brief Search for extended match continuation using implicit pattern comparison.
  *
  * Searches for pattern: window[current_pos:current_pos+current_count] + input[0...]
- * starting from current_pos. Uses implicit comparison - no buffer allocation.
+ * starting from current_pos. Returns the longest match found (which may be at
+ * current_pos itself if O(1) extension works, or at a different position).
  *
  * @param[in] compressor TampCompressor object
  * @param[in] current_pos Current match position in window (also search start)
  * @param[in] current_count Current match length
- * @param[out] new_pos Position of found longer match
+ * @param[out] new_pos Position of found match (only valid if new_count > current_count)
  * @param[out] new_count Length of found match
  */
 static inline void find_extended_match(TampCompressor *compressor, uint16_t current_pos, uint8_t current_count,
@@ -176,27 +177,46 @@ static inline void find_extended_match(TampCompressor *compressor, uint16_t curr
     const uint16_t window_size = WINDOW_SIZE;
     const uint8_t max_pattern = MIN(current_count + compressor->input_size, MAX_PATTERN_SIZE);
 
-    // Need at least 2 bytes in target to search
-    if (max_pattern < 2) return;
+    // Need at least current_count + 1 to find a longer match, and room in window
+    if (max_pattern <= current_count) return;
+    if (current_pos + current_count + 1 > window_size) return;
 
     // First two bytes of pattern (from window at current_pos)
     const uint8_t first_byte = window[current_pos];
     const uint8_t second_byte = window[current_pos + 1];
 
-    for (uint16_t cand = current_pos; cand + max_pattern <= window_size; cand++) {
+    // The target byte to extend by (input[0], like Python does)
+    const uint8_t extend_byte = read_input(0);
+
+    // Search candidates that can fit at least current_count + 1 bytes
+    for (uint16_t cand = current_pos; cand + current_count + 1 <= window_size; cand++) {
         // Quick 2-byte check
         if (TAMP_LIKELY(window[cand] != first_byte)) continue;
         if (TAMP_LIKELY(window[cand + 1] != second_byte)) continue;
 
-        // Extend match using implicit comparison
-        uint8_t match_len = 2;
-        for (uint8_t i = 2; i < max_pattern; i++) {
-            // Get target byte: from window if i < current_count, else from input
-            uint8_t target = (i < current_count) ? window[current_pos + i] : read_input(i - current_count);
+        // Check if all current_count bytes match
+        bool full_match = true;
+        for (uint8_t i = 2; i < current_count; i++) {
+            if (window[cand + i] != window[current_pos + i]) {
+                full_match = false;
+                break;
+            }
+        }
+        if (!full_match) continue;
+
+        // Check if the extension byte matches
+        if (window[cand + current_count] != extend_byte) continue;
+
+        // Found a match of current_count + 1 bytes - now extend as far as possible
+        const uint8_t cand_max = MIN(max_pattern, window_size - cand);
+        uint8_t match_len = current_count + 1;
+        for (uint8_t i = current_count + 1; i < cand_max; i++) {
+            uint8_t target = read_input(i - current_count);
             if (window[cand + i] != target) break;
             match_len = i + 1;
         }
 
+        // Track this match (guaranteed > current_count)
         if (match_len > *new_count) {
             *new_count = match_len;
             *new_pos = cand;
@@ -388,9 +408,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
 #if TAMP_EXTENDED_COMPRESS
     // Extended: Handle extended match continuation
     if (TAMP_UNLIKELY(conf_extended && compressor->extended_match_count)) {
-        // We're in extended match mode - try to extend the match at the current position
+        // We're in extended match mode - try to extend the match
         const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
-        const unsigned char *window = compressor->window;
 
         while (compressor->input_size > 0) {
             const uint16_t current_pos = compressor->extended_match_position;
@@ -418,38 +437,29 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
                 return TAMP_OK;
             }
 
-            // O(1) extension check: does the next byte at current position match input?
-            if (window[current_pos + current_count] == read_input(0)) {
-                // Extension successful - consume input byte and increment count
-                compressor->extended_match_count++;
-                compressor->input_pos = input_add(1);
-                compressor->input_size--;
-                // Continue to next iteration to try extending further
-            } else {
-                // O(1) extension failed - search for longer match from current position
-                uint16_t new_pos;
-                uint8_t new_count;
-                find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count);
-
-                if (new_count > current_count) {
-                    // Found longer match - update and continue
-                    uint8_t extra_bytes = new_count - current_count;
-                    compressor->extended_match_position = new_pos;
-                    compressor->extended_match_count = new_count;
-                    compressor->input_pos = input_add(extra_bytes);
-                    compressor->input_size -= extra_bytes;
-                    continue;
-                }
-
-                // No better match - emit current match
-                // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-                if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
-                size_t token_bytes;
-                res = write_extended_match_token(compressor, output, output_size, &token_bytes);
-                (*output_written_size) += token_bytes;
-                if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-                return TAMP_OK;
+            // Search for longer match (includes O(1) extension at same position)
+            uint16_t new_pos;
+            uint8_t new_count;
+            find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count);
+
+            if (new_count > current_count) {
+                // Found longer match - update and continue
+                uint8_t extra_bytes = new_count - current_count;
+                compressor->extended_match_position = new_pos;
+                compressor->extended_match_count = new_count;
+                compressor->input_pos = input_add(extra_bytes);
+                compressor->input_size -= extra_bytes;
+                continue;
             }
+
+            // No longer match found - emit current match
+            // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+            if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
+            size_t token_bytes;
+            res = write_extended_match_token(compressor, output, output_size, &token_bytes);
+            (*output_written_size) += token_bytes;
+            if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+            return TAMP_OK;
         }
         // Ran out of input while extending - return and wait for more
         return TAMP_OK;
@@ -459,32 +469,52 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
     if (TAMP_UNLIKELY(conf_extended)) {
         uint8_t last_byte = get_last_window_byte(compressor);
 
-        // Count and CONSUME matching bytes
-        while (compressor->input_size > 0 && compressor->rle_count < RLE_MAX_COUNT) {
-            if (read_input(0) == last_byte) {
-                compressor->rle_count++;
-                compressor->input_pos = input_add(1);
-                compressor->input_size--;
-            } else {
-                break;
-            }
+        // Count RLE bytes in current buffer WITHOUT consuming yet
+        uint8_t rle_available = 0;
+        while (rle_available < compressor->input_size && compressor->rle_count + rle_available < RLE_MAX_COUNT &&
+               compressor->input[input_add(rle_available)] == last_byte) {
+            rle_available++;
         }
 
-        // If we consumed whole buffer and haven't hit max, return (accumulate more)
-        if (compressor->input_size == 0 && compressor->rle_count < RLE_MAX_COUNT && compressor->rle_count > 0) {
+        uint8_t total_rle = compressor->rle_count + rle_available;
+        bool rle_ended = (rle_available < compressor->input_size) || (total_rle >= RLE_MAX_COUNT);
+
+        // If RLE hasn't ended and we haven't hit max, consume and wait for more
+        if (!rle_ended && total_rle > 0) {
+            compressor->rle_count = total_rle;
+            compressor->input_pos = input_add(rle_available);
+            compressor->input_size -= rle_available;
             return TAMP_OK;
         }
 
-        // RLE run has ended
-        if (compressor->rle_count >= 2) {
-            // Commit the RLE (simplified approach for C)
-            write_rle_token(compressor, compressor->rle_count);
+        // RLE run has ended - decide between RLE and pattern match
+        if (total_rle >= 2) {
+            bool use_pattern = false;
+
+            // For short RLE runs (all from this call), check if pattern match is better
+            if (total_rle == rle_available && total_rle <= 6) {
+                uint16_t pattern_index;
+                uint8_t pattern_size;
+                find_best_match(compressor, &pattern_index, &pattern_size);
+
+                if (pattern_size > total_rle) {
+                    use_pattern = true;
+                    // Don't consume RLE bytes - fall through to pattern matching
+                }
+            }
+
+            if (!use_pattern) {
+                // Use RLE - consume bytes and write token
+                compressor->input_pos = input_add(rle_available);
+                compressor->input_size -= rle_available;
+                write_rle_token(compressor, total_rle);
+                compressor->rle_count = 0;
+                return TAMP_OK;
+            }
             compressor->rle_count = 0;
-            return TAMP_OK;
-        } else if (compressor->rle_count == 1) {
-            // Single byte - push it back to input for normal literal encoding
-            compressor->input_pos = input_add(-1);
-            compressor->input_size++;
+        } else if (total_rle == 1) {
+            // Single byte - not worth RLE. If it was counted in this call it is still in the input buffer.
+            // NOTE(review): a carried-over rle_count of 1 was consumed by an earlier call - confirm it is not dropped.
             compressor->rle_count = 0;
         }
     }

From 57f0c48b35111b7f80ea2247fbc4db6e7bae27ed Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 13:06:14 -0500
Subject: [PATCH 080/109] match python compressor to C compressor
 implementation.

---
 tamp/compressor.py | 100 +++++++++++++++++----------------------------
 1 file changed, 37 insertions(+), 63 deletions(-)

diff --git a/tamp/compressor.py b/tamp/compressor.py
index 2451aa75..f59b2e39 100644
--- a/tamp/compressor.py
+++ b/tamp/compressor.py
@@ -273,90 +273,64 @@ def _compress_input_buffer_single(self) -> int:
 
         # RLE handling with persistent state (v2 only)
         # Accumulate RLE count across compression cycles for better compression of long runs
-        have_match_from_rle = False  # Track if we already did pattern matching in RLE section
-
         if self.extended:
             last_byte = self._window_buffer.last_written_byte
 
-            # Count additional matching bytes in current buffer
-            new_rle_bytes = 0
+            # Count RLE bytes in current buffer WITHOUT consuming yet
+            rle_available = 0
             for byte in self._input_buffer:
-                if byte == last_byte and self._rle_count + new_rle_bytes < self._rle_max_size:
-                    new_rle_bytes += 1
+                if byte == last_byte and self._rle_count + rle_available < self._rle_max_size:
+                    rle_available += 1
                 else:
                     break
 
-            # If we consumed whole buffer and haven't hit max, keep accumulating
-            if new_rle_bytes == len(self._input_buffer) and self._rle_count + new_rle_bytes < self._rle_max_size:
-                # Consume these bytes and wait for more data
-                for _ in range(new_rle_bytes):
+            total_rle = self._rle_count + rle_available
+            rle_ended = (rle_available < len(self._input_buffer)) or (total_rle >= self._rle_max_size)
+
+            # If RLE hasn't ended and we haven't hit max, consume and wait for more
+            if not rle_ended and total_rle > 0:
+                self._rle_count = total_rle
+                for _ in range(rle_available):
                     self._input_buffer.popleft()
-                self._rle_count += new_rle_bytes
                 return bytes_written
 
-            # RLE run has ended or hit max - decide what to encode
-            total_rle_count = self._rle_count + new_rle_bytes
+            # RLE run has ended - decide between RLE and pattern match
+            if total_rle >= 2:
+                use_pattern = False
 
-            if total_rle_count >= 2:
-                # Build search target: accumulated RLE + new RLE + rest of buffer
-                # This allows pattern matching to find longer sequences
-                target = bytes([last_byte]) * total_rle_count + bytes(list(self._input_buffer)[new_rle_bytes:])
-
-                # Do pattern search
-                if self.lazy_matching and self._cached_match_index >= 0:
-                    search_i = self._cached_match_index
-                    match_size = self._cached_match_size
-                    match = self._window_buffer.get(search_i, match_size)
-                    self._cached_match_index = -1
-                else:
+                # For short RLE runs (all from this call), check if pattern match is better
+                if total_rle == rle_available and total_rle <= 6:
+                    target = bytes(self._input_buffer)
                     search_i, match = self._search(target, start=0)
                     match_size = len(match)
 
-                have_match_from_rle = True
+                    if match_size > total_rle:
+                        use_pattern = True
+                        # Don't consume RLE bytes - fall through to pattern matching
 
-                # Simple decision: if pattern match is longer, use it; otherwise use RLE
-                if match_size >= self.min_pattern_size and match_size > total_rle_count:
-                    # Pattern match wins
-                    # Consume bytes from input_buffer (only the bytes actually in the buffer)
-                    # Note: first _rle_count bytes were already consumed in previous calls
-                    bytes_to_consume = match_size - self._rle_count
-                    for _ in range(bytes_to_consume):
-                        self._input_buffer.popleft()
-                    self._rle_count = 0
-
-                    # Write the pattern match immediately and return
-                    # (Don't continue to normal flow which would try to consume bytes again)
-                    if self.extended and match_size > (self.min_pattern_size + 11):
-                        self._extended_match_position = search_i
-                        self._extended_match_count = match_size
-                        bytes_written += self._write_extended_match()
-                    else:
-                        bytes_written += self._write_match(search_i, match)
-                    return bytes_written
-                else:
-                    # RLE wins - commit RLE
-                    for _ in range(new_rle_bytes):
+                if not use_pattern:
+                    # Use RLE - consume bytes and write token
+                    for _ in range(rle_available):
                         self._input_buffer.popleft()
-                    self._rle_count = total_rle_count
+                    self._rle_count = total_rle
                     bytes_written += self._write_rle()
                     return bytes_written
-            elif self._rle_count == 1:
-                # Single byte isn't worth RLE encoding
                 self._rle_count = 0
-                # Fall through to normal pattern matching
+            elif total_rle == 1:
+                # Single byte - not worth RLE, will be handled as literal/pattern
+                self._rle_count = 0
 
-        # Normal pattern matching (when no RLE or RLE was abandoned for pattern)
-        if not have_match_from_rle:
-            target = bytes(self._input_buffer)
+        # Normal pattern matching
+        target = bytes(self._input_buffer)
 
-            if self.lazy_matching and self._cached_match_index >= 0:
-                search_i = self._cached_match_index
-                match_size = self._cached_match_size
-                match = self._window_buffer.get(search_i, match_size)
-                self._cached_match_index = -1
-            else:
-                search_i, match = self._search(target, start=0)
-                match_size = len(match)
+        if self.lazy_matching and self._cached_match_index >= 0:
+            search_i = self._cached_match_index
+            match_size = self._cached_match_size
+            match = self._window_buffer.get(search_i, match_size)
+            self._cached_match_index = -1
+        else:
+            search_i, match = self._search(target, start=0)
+            match_size = len(match)
 
         # Lazy matching logic
         if (

From 161e47674dcfabd26d856e1d815612ab6c872942 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 13:14:49 -0500
Subject: [PATCH 081/109] update readme and expected hash

---
 .github/workflows/tests.yaml |  2 +-
 README.md                    | 60 ++++++++++++++++++------------------
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 79071a9b..1c44aa9f 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -267,7 +267,7 @@ jobs:
         implementation: [desktop, embedded]
     env:
       POETRY_HOME: '~/poetry'
-      EXPECTED_COMPRESSED_HASH: '5996293c04a89014580334da4d69374ee414f1ae5087b813bab69209573b6966'
+      EXPECTED_COMPRESSED_HASH: 'dd5e431b0cbaa6ee001b10493c2b08e6235f42e3f4ce00958e88b1b97581872e'
 
     steps:
       - name: Check out repository
diff --git a/README.md b/README.md
index 971a760d..39af81c3 100644
--- a/README.md
+++ b/README.md
@@ -219,20 +219,20 @@ idea of how these algorithms perform over a variety of input data types.
 
 | dataset         | raw         | tamp        | tamp (LazyMatching) | zlib          | heatshrink |
 | --------------- | ----------- | ----------- | ------------------- | ------------- | ---------- |
-| enwik8          | 100,000,000 | 51,116,968  | **50,725,098**      | 56,205,166    | 56,110,394 |
-| RPI_PICO (.uf2) | 667,648     | **288,704** | 289,735             | 303,763       | -          |
-| silesia/dickens | 10,192,446  | 5,538,712   | **5,503,021**       | 6,049,169     | 6,155,768  |
-| silesia/mozilla | 51,220,480  | 24,499,954  | **24,311,290**      | 25,104,966    | 25,435,908 |
-| silesia/mr      | 9,970,564   | 4,524,424   | **4,396,124**       | 4,864,734     | 5,442,180  |
-| silesia/nci     | 33,553,445  | 7,093,354   | 7,003,632           | **5,765,521** | 8,247,487  |
-| silesia/ooffice | 6,152,192   | 3,779,238   | **3,763,795**       | 4,077,277     | 3,994,589  |
-| silesia/osdb    | 10,085,684  | 8,467,407   | **8,452,497**       | 8,625,159     | 8,747,527  |
-| silesia/reymont | 6,627,202   | 2,825,458   | **2,793,473**       | 2,897,661     | 2,910,251  |
-| silesia/samba   | 21,606,400  | 8,443,932   | **8,395,048**       | 8,862,423     | 9,223,827  |
-| silesia/sao     | 7,251,944   | 6,136,102   | **6,100,071**       | 6,506,417     | 6,400,926  |
-| silesia/webster | 41,458,703  | 18,259,149  | **18,118,788**      | 20,212,235    | 19,942,817 |
-| silesia/x-ray   | 8,474,240   | 7,509,652   | 7,404,794           | **7,351,750** | 8,059,723  |
-| silesia/xml     | 5,345,280   | 1,493,131   | **1,473,832**       | 1,586,985     | 1,665,179  |
+| enwik8          | 100,000,000 | 51,017,102  | **50,626,118**      | 56,205,166    | 56,110,394 |
+| RPI_PICO (.uf2) | 667,648     | **289,204** | 290,442             | 303,763       | -          |
+| silesia/dickens | 10,192,446  | 5,538,353   | **5,502,834**       | 6,049,169     | 6,155,768  |
+| silesia/mozilla | 51,220,480  | 24,412,662  | **24,228,654**      | 25,104,966    | 25,435,908 |
+| silesia/mr      | 9,970,564   | 4,519,402   | **4,393,009**       | 4,864,734     | 5,442,180  |
+| silesia/nci     | 33,553,445  | 6,824,403   | 6,772,379           | **5,765,521** | 8,247,487  |
+| silesia/ooffice | 6,152,192   | 3,773,089   | **3,755,153**       | 4,077,277     | 3,994,589  |
+| silesia/osdb    | 10,085,684  | 8,466,875   | **8,464,328**       | 8,625,159     | 8,747,527  |
+| silesia/reymont | 6,627,202   | 2,818,554   | **2,788,774**       | 2,897,661     | 2,910,251  |
+| silesia/samba   | 21,606,400  | 8,384,183   | **8,345,616**       | 8,862,423     | 9,223,827  |
+| silesia/sao     | 7,251,944   | 6,136,077   | **6,100,061**       | 6,506,417     | 6,400,926  |
+| silesia/webster | 41,458,703  | 18,146,649  | **18,010,980**      | 20,212,235    | 19,942,817 |
+| silesia/x-ray   | 8,474,240   | 7,509,449   | 7,404,794           | **7,351,750** | 8,059,723  |
+| silesia/xml     | 5,345,280   | 1,473,463   | **1,455,877**       | 1,586,985     | 1,665,179  |
 
 Tamp outperforms both heatshrink and zlib on most datasets, winning 12 out of 14
 benchmarks. This is while using around 10x less memory than zlib during both
@@ -270,20 +270,20 @@ compression parameters across all benchmark datasets (`window=10`, `literal=8`).
 
 | dataset         | raw         | Baseline   | +lazy              | +extended          | +lazy +extended    |
 | --------------- | ----------- | ---------- | ------------------ | ------------------ | ------------------ |
-| enwik8          | 100,000,000 | 51,635,633 | 51,252,113 (−0.7%) | 51,116,968 (−1.0%) | 50,725,098 (−1.8%) |
-| RPI_PICO (.uf2) | 667,648     | 331,310    | 329,875 (−0.4%)    | 288,704 (−12.9%)   | 289,735 (−12.5%)   |
-| silesia/dickens | 10,192,446  | 5,546,761  | 5,511,604 (−0.6%)  | 5,538,712 (−0.1%)  | 5,503,021 (−0.8%)  |
-| silesia/mozilla | 51,220,480  | 25,121,385 | 24,936,067 (−0.7%) | 24,499,954 (−2.5%) | 24,311,290 (−3.2%) |
-| silesia/mr      | 9,970,564   | 5,027,032  | 4,886,272 (−2.8%)  | 4,524,424 (−10.0%) | 4,396,124 (−12.6%) |
-| silesia/nci     | 33,553,445  | 8,643,610  | 8,645,299 (+0.0%)  | 7,093,354 (−17.9%) | 7,003,632 (−19.0%) |
-| silesia/ooffice | 6,152,192   | 3,814,938  | 3,798,261 (−0.4%)  | 3,779,238 (−0.9%)  | 3,763,795 (−1.3%)  |
-| silesia/osdb    | 10,085,684  | 8,520,835  | 8,506,443 (−0.2%)  | 8,467,407 (−0.6%)  | 8,452,497 (−0.8%)  |
-| silesia/reymont | 6,627,202   | 2,847,981  | 2,820,870 (−1.0%)  | 2,825,458 (−0.8%)  | 2,793,473 (−1.9%)  |
-| silesia/samba   | 21,606,400  | 9,102,594  | 9,060,692 (−0.5%)  | 8,443,932 (−7.2%)  | 8,395,048 (−7.8%)  |
-| silesia/sao     | 7,251,944   | 6,137,755  | 6,101,744 (−0.6%)  | 6,136,102 (−0.0%)  | 6,100,071 (−0.6%)  |
-| silesia/webster | 41,458,703  | 18,694,172 | 18,567,228 (−0.7%) | 18,259,149 (−2.3%) | 18,118,788 (−3.1%) |
-| silesia/x-ray   | 8,474,240   | 7,510,606  | 7,405,814 (−1.4%)  | 7,509,652 (−0.0%)  | 7,404,794 (−1.4%)  |
-| silesia/xml     | 5,345,280   | 1,681,687  | 1,672,660 (−0.5%)  | 1,493,131 (−11.2%) | 1,473,832 (−12.4%) |
+| enwik8          | 100,000,000 | 51,635,633 | 51,252,694 (−0.7%) | 51,017,102 (−1.2%) | 50,626,118 (−2.0%) |
+| RPI_PICO (.uf2) | 667,648     | 331,310    | 329,893 (−0.4%)    | 289,204 (−12.7%)   | 290,442 (−12.3%)   |
+| silesia/dickens | 10,192,446  | 5,546,761  | 5,511,681 (−0.6%)  | 5,538,353 (−0.2%)  | 5,502,834 (−0.8%)  |
+| silesia/mozilla | 51,220,480  | 25,121,385 | 24,937,036 (−0.7%) | 24,412,662 (−2.8%) | 24,228,654 (−3.6%) |
+| silesia/mr      | 9,970,564   | 5,027,032  | 4,888,930 (−2.7%)  | 4,519,402 (−10.1%) | 4,393,009 (−12.6%) |
+| silesia/nci     | 33,553,445  | 8,643,610  | 8,645,399 (+0.0%)  | 6,824,403 (−21.0%) | 6,772,379 (−21.6%) |
+| silesia/ooffice | 6,152,192   | 3,814,938  | 3,798,393 (−0.4%)  | 3,773,089 (−1.1%)  | 3,755,153 (−1.6%)  |
+| silesia/osdb    | 10,085,684  | 8,520,835  | 8,518,502 (−0.0%)  | 8,466,875 (−0.6%)  | 8,464,328 (−0.7%)  |
+| silesia/reymont | 6,627,202   | 2,847,981  | 2,820,948 (−0.9%)  | 2,818,554 (−1.0%)  | 2,788,774 (−2.1%)  |
+| silesia/samba   | 21,606,400  | 9,102,594  | 9,061,143 (−0.5%)  | 8,384,183 (−7.9%)  | 8,345,616 (−8.3%)  |
+| silesia/sao     | 7,251,944   | 6,137,755  | 6,101,747 (−0.6%)  | 6,136,077 (−0.0%)  | 6,100,061 (−0.6%)  |
+| silesia/webster | 41,458,703  | 18,694,172 | 18,567,618 (−0.7%) | 18,146,649 (−2.9%) | 18,010,980 (−3.7%) |
+| silesia/x-ray   | 8,474,240   | 7,510,606  | 7,406,001 (−1.4%)  | 7,509,449 (−0.0%)  | 7,404,794 (−1.4%)  |
+| silesia/xml     | 5,345,280   | 1,681,687  | 1,672,827 (−0.5%)  | 1,473,463 (−12.4%) | 1,455,877 (−13.4%) |
 
 The `extended` parameter enables additional Huffman codes for longer pattern
 matches, which significantly improves compression on datasets with many long
@@ -354,8 +354,8 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 | Tamp (MicroPython Native)        | 4708       | 4339         | 8124                      |
 | Tamp (C, no extended, no stream) | 1708       | 1584         | 3088                      |
 | Tamp (C, no extended)            | 2112       | 2036         | 3944                      |
-| Tamp (C, extended, no stream)    | 2768       | 2436         | 5000                      |
-| Tamp (C, extended)               | 3172       | 2888         | 5856                      |
+| Tamp (C, extended, no stream)    | 3516       | 2436         | 5748                      |
+| Tamp (C, extended)               | 3920       | 2888         | 6604                      |
 | Heatshrink (C)                   | 2956       | 3876         | 6832                      |
 | uzlib (C)                        | 2355       | 3963         | 6318                      |
 

From c6f23f48019f6fdb35b34ede9f2ffb1f73080ea8 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 13:24:49 -0500
Subject: [PATCH 082/109] don't inline find_best_match

---
 README.md                     | 8 ++++----
 tamp/_c_src/tamp/compressor.c | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 39af81c3..2a722f01 100644
--- a/README.md
+++ b/README.md
@@ -352,10 +352,10 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 |                                  | Compressor | Decompressor | Compressor + Decompressor |
 | -------------------------------- | ---------- | ------------ | ------------------------- |
 | Tamp (MicroPython Native)        | 4708       | 4339         | 8124                      |
-| Tamp (C, no extended, no stream) | 1708       | 1584         | 3088                      |
-| Tamp (C, no extended)            | 2112       | 2036         | 3944                      |
-| Tamp (C, extended, no stream)    | 3516       | 2436         | 5748                      |
-| Tamp (C, extended)               | 3920       | 2888         | 6604                      |
+| Tamp (C, no extended, no stream) | 1620       | 1584         | 3000                      |
+| Tamp (C, no extended)            | 2024       | 2036         | 3856                      |
+| Tamp (C, extended, no stream)    | 3072       | 2436         | 5304                      |
+| Tamp (C, extended)               | 3476       | 2888         | 6160                      |
 | Heatshrink (C)                   | 2956       | 3876         | 6832                      |
 | uzlib (C)                        | 2355       | 3963         | 6318                      |
 
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index d9c4cdb9..e08731ef 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -114,7 +114,7 @@ extern void find_best_match(TampCompressor *compressor, uint16_t *match_index, u
  * @param[out] match_index  If match_size is 0, this value is undefined.
  * @param[out] match_size Size of best found match.
  */
-static inline void find_best_match(TampCompressor *compressor, uint16_t *match_index, uint8_t *match_size) {
+static TAMP_NOINLINE void find_best_match(TampCompressor *compressor, uint16_t *match_index, uint8_t *match_size) {
     *match_size = 0;
 
     if (TAMP_UNLIKELY(compressor->input_size < compressor->min_pattern_size)) return;

From aa59de5b8ba1b39ac318dadc50dd3c828d09170d Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 13:27:09 -0500
Subject: [PATCH 083/109] fix initialization warning

---
 tamp/_c_src/tamp/compressor.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index e08731ef..4120681e 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -438,7 +438,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
             }
 
             // Search for longer match (includes O(1) extension at same position)
-            uint16_t new_pos;
+            uint16_t new_pos = 0;
             uint8_t new_count;
             find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count);
 

From 538b8c0c4347556d679adddc34d66dc8a37b7d23 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 13:40:13 -0500
Subject: [PATCH 084/109] combine if-statement; saving 8 bytes.

---
 tamp/_c_src/tamp/compressor.c | 76 ++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 37 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 4120681e..cbc117a1 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -682,47 +682,49 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
     }
 
 #if TAMP_EXTENDED_COMPRESS
-    // Extended: Flush any pending RLE
-    if (compressor->conf.extended && compressor->rle_count >= 1) {
-        // Partial flush first to make room
-        res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
-        (*output_written_size) += chunk_output_written_size;
-        if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-        output_size -= chunk_output_written_size;
-        output += chunk_output_written_size;
+    if (compressor->conf.extended) {
+        // Flush any pending RLE
+        if (compressor->rle_count >= 1) {
+            // Partial flush first to make room
+            res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
+            (*output_written_size) += chunk_output_written_size;
+            if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+            output_size -= chunk_output_written_size;
+            output += chunk_output_written_size;
 
-        if (compressor->rle_count == 1) {
-            // Single byte - write as literal (can't use RLE token for count < 2)
-            uint8_t literal = get_last_window_byte(compressor);
-            write_to_bit_buffer(compressor, IS_LITERAL_FLAG | literal, compressor->conf.literal + 1);
+            if (compressor->rle_count == 1) {
+                // Single byte - write as literal (can't use RLE token for count < 2)
+                uint8_t literal = get_last_window_byte(compressor);
+                write_to_bit_buffer(compressor, IS_LITERAL_FLAG | literal, compressor->conf.literal + 1);
 
-            // Write to window
-            const uint16_t window_mask = (1 << compressor->conf.window) - 1;
-            compressor->window[compressor->window_pos] = literal;
-            compressor->window_pos = (compressor->window_pos + 1) & window_mask;
-        } else {
-            // count >= 2: write as RLE token
-            write_rle_token(compressor, compressor->rle_count);
-        }
-        compressor->rle_count = 0;
+                // Write to window
+                const uint16_t window_mask = (1 << compressor->conf.window) - 1;
+                compressor->window[compressor->window_pos] = literal;
+                compressor->window_pos = (compressor->window_pos + 1) & window_mask;
+            } else {
+                // count >= 2: write as RLE token
+                write_rle_token(compressor, compressor->rle_count);
+            }
+            compressor->rle_count = 0;
 
-        // Partial flush again after writing token
-        res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
-        (*output_written_size) += chunk_output_written_size;
-        if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-        output_size -= chunk_output_written_size;
-        output += chunk_output_written_size;
-    }
+            // Partial flush again after writing token
+            res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
+            (*output_written_size) += chunk_output_written_size;
+            if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+            output_size -= chunk_output_written_size;
+            output += chunk_output_written_size;
+        }
 
-    // Extended: Flush any pending extended match
-    if (compressor->conf.extended && compressor->extended_match_count) {
-        // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-        if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
-        res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
-        (*output_written_size) += chunk_output_written_size;
-        if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-        output_size -= chunk_output_written_size;
-        output += chunk_output_written_size;
+        // Flush any pending extended match
+        else if (compressor->extended_match_count) {
+            // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+            if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
+            res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
+            (*output_written_size) += chunk_output_written_size;
+            if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+            output_size -= chunk_output_written_size;
+            output += chunk_output_written_size;
+        }
     }
 #endif  // TAMP_EXTENDED_COMPRESS
 

From 66401acb279ca49cb9a2ae3b67719e7a33e671f1 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 14:05:35 -0500
Subject: [PATCH 085/109] save 48 bytes in tamp_compressor_flush using some
 gotos

---
 tamp/_c_src/tamp/compressor.c | 147 ++++++++++++++++------------------
 1 file changed, 69 insertions(+), 78 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index cbc117a1..800d6f54 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -40,7 +40,7 @@ static const uint8_t huffman_bits[] = {0x2, 0x3, 0x5, 0x5, 0x6, 0x7, 0x7, 0x7, 0
 #define EXTENDED_MATCH_MIN_OUTPUT_BYTES 6
 #endif
 
-static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, uint8_t n_bits) {
+static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor* compressor, uint32_t bits, uint8_t n_bits) {
     compressor->bit_buffer_pos += n_bits;
     compressor->bit_buffer |= bits << (32 - compressor->bit_buffer_pos);
 }
@@ -55,7 +55,7 @@ static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor *compressor, uint32
  * @param[in] value The value to encode.
  * @param[in] trailing_bits Number of trailing bits (3 for extended match, 4 for RLE).
  */
-static TAMP_NOINLINE void write_extended_huffman(TampCompressor *compressor, uint8_t value, uint8_t trailing_bits) {
+static TAMP_NOINLINE void write_extended_huffman(TampCompressor* compressor, uint8_t value, uint8_t trailing_bits) {
     uint8_t code_index = value >> trailing_bits;
     // Write huffman code (without literal flag) + trailing bits in one call
     write_to_bit_buffer(compressor, (huffman_codes[code_index] << trailing_bits) | (value & ((1 << trailing_bits) - 1)),
@@ -69,8 +69,8 @@ static TAMP_NOINLINE void write_extended_huffman(TampCompressor *compressor, uin
  *
  * Flushes complete bytes from the bit buffer. Up to 7 bits may remain.
  */
-static TAMP_NOINLINE tamp_res partial_flush(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                                            size_t *output_written_size) {
+static TAMP_NOINLINE tamp_res partial_flush(TampCompressor* compressor, unsigned char* output, size_t output_size,
+                                            size_t* output_written_size) {
     for (*output_written_size = output_size; compressor->bit_buffer_pos >= 8 && output_size;
          output_size--, compressor->bit_buffer_pos -= 8, compressor->bit_buffer <<= 8)
         *output++ = compressor->bit_buffer >> 24;
@@ -78,7 +78,7 @@ static TAMP_NOINLINE tamp_res partial_flush(TampCompressor *compressor, unsigned
     return (compressor->bit_buffer_pos >= 8) ? TAMP_OUTPUT_FULL : TAMP_OK;
 }
 
-inline bool tamp_compressor_full(const TampCompressor *compressor) {
+inline bool tamp_compressor_full(const TampCompressor* compressor) {
     return compressor->input_size == sizeof(compressor->input);
 }
 
@@ -99,7 +99,7 @@ inline bool tamp_compressor_full(const TampCompressor *compressor) {
  */
 
 #if TAMP_ESP32
-extern void find_best_match(TampCompressor *compressor, uint16_t *match_index, uint8_t *match_size);
+extern void find_best_match(TampCompressor* compressor, uint16_t* match_index, uint8_t* match_size);
 
 #elif (defined(__x86_64__) || defined(__aarch64__) || defined(_M_X64) || defined(_M_ARM64)) && !TAMP_USE_EMBEDDED_MATCH
 #include "compressor_find_match_desktop.c"
@@ -114,7 +114,7 @@ extern void find_best_match(TampCompressor *compressor, uint16_t *match_index, u
  * @param[out] match_index  If match_size is 0, this value is undefined.
  * @param[out] match_size Size of best found match.
  */
-static TAMP_NOINLINE void find_best_match(TampCompressor *compressor, uint16_t *match_index, uint8_t *match_size) {
+static TAMP_NOINLINE void find_best_match(TampCompressor* compressor, uint16_t* match_index, uint8_t* match_size) {
     *match_size = 0;
 
     if (TAMP_UNLIKELY(compressor->input_size < compressor->min_pattern_size)) return;
@@ -123,7 +123,7 @@ static TAMP_NOINLINE void find_best_match(TampCompressor *compressor, uint16_t *
     const uint8_t second_byte = read_input(1);
     const uint32_t window_size_minus_1 = WINDOW_SIZE - 1;
     const uint8_t max_pattern_size = MIN(compressor->input_size, MAX_PATTERN_SIZE);
-    const unsigned char *window = compressor->window;
+    const unsigned char* window = compressor->window;
 
     for (uint32_t window_index = 0; window_index < window_size_minus_1; window_index++) {
         if (TAMP_LIKELY(window[window_index] != first_byte)) {
@@ -170,10 +170,10 @@ static TAMP_NOINLINE void find_best_match(TampCompressor *compressor, uint16_t *
  * @param[out] new_pos Position of found match (only valid if new_count > current_count)
  * @param[out] new_count Length of found match
  */
-static inline void find_extended_match(TampCompressor *compressor, uint16_t current_pos, uint8_t current_count,
-                                       uint16_t *new_pos, uint8_t *new_count) {
+static inline void find_extended_match(TampCompressor* compressor, uint16_t current_pos, uint8_t current_count,
+                                       uint16_t* new_pos, uint8_t* new_count) {
     *new_count = 0;
-    const unsigned char *window = compressor->window;
+    const unsigned char* window = compressor->window;
     const uint16_t window_size = WINDOW_SIZE;
     const uint8_t max_pattern = MIN(current_count + compressor->input_size, MAX_PATTERN_SIZE);
 
@@ -241,7 +241,7 @@ static inline bool validate_no_match_overlap(uint16_t write_pos, uint16_t match_
 }
 #endif
 
-tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf, unsigned char *window) {
+tamp_res tamp_compressor_init(TampCompressor* compressor, const TampConf* conf, unsigned char* window) {
     const TampConf conf_default = {
         .window = 10,
         .literal = 8,
@@ -261,7 +261,7 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
 #endif
 
     for (uint8_t i = 0; i < sizeof(TampCompressor); i++)  // Zero-out the struct
-        ((unsigned char *)compressor)[i] = 0;
+        ((unsigned char*)compressor)[i] = 0;
 
     // Build header directly from conf (8 bits total)
     // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1]
@@ -287,7 +287,7 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf,
 /**
  * @brief Get the last byte written to the window.
  */
-static inline uint8_t get_last_window_byte(TampCompressor *compressor) {
+static inline uint8_t get_last_window_byte(TampCompressor* compressor) {
     uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf.window) - 1);
     return compressor->window[prev_pos];
 }
@@ -298,7 +298,7 @@ static inline uint8_t get_last_window_byte(TampCompressor *compressor) {
  * @param[in,out] compressor Compressor state.
  * @param[in] count Number of repeated bytes (must be >= 2).
  */
-static TAMP_NOINLINE void write_rle_token(TampCompressor *compressor, uint8_t count) {
+static TAMP_NOINLINE void write_rle_token(TampCompressor* compressor, uint8_t count) {
     const uint16_t window_mask = (1 << compressor->conf.window) - 1;
     uint8_t symbol = get_last_window_byte(compressor);
 
@@ -329,8 +329,8 @@ static TAMP_NOINLINE void write_rle_token(TampCompressor *compressor, uint8_t co
  * @param[out] output_written_size Bytes written to output.
  * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small.
  */
-static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compressor, unsigned char *output,
-                                                         size_t output_size, size_t *output_written_size) {
+static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compressor, unsigned char* output,
+                                                         size_t output_size, size_t* output_written_size) {
     const uint16_t window_mask = (1 << compressor->conf.window) - 1;
     const uint8_t count = compressor->extended_match_count;
     const uint16_t position = compressor->extended_match_position;
@@ -373,8 +373,8 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor *compres
 }
 #endif  // TAMP_EXTENDED_COMPRESS
 
-TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                                            size_t *output_written_size) {
+TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned char* output, size_t output_size,
+                                            size_t* output_written_size) {
     tamp_res res;
     // Cache bitfield values for faster access in hot path
     const uint8_t conf_window = compressor->conf.window;
@@ -606,8 +606,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned
     return TAMP_OK;
 }
 
-void tamp_compressor_sink(TampCompressor *compressor, const unsigned char *input, size_t input_size,
-                          size_t *consumed_size) {
+void tamp_compressor_sink(TampCompressor* compressor, const unsigned char* input, size_t input_size,
+                          size_t* consumed_size) {
     size_t consumed_size_proxy;
     if (TAMP_LIKELY(consumed_size))
         *consumed_size = 0;
@@ -622,9 +622,9 @@ void tamp_compressor_sink(TampCompressor *compressor, const unsigned char *input
     }
 }
 
-tamp_res tamp_compressor_compress_cb(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                                     size_t *output_written_size, const unsigned char *input, size_t input_size,
-                                     size_t *input_consumed_size, tamp_callback_t callback, void *user_data) {
+tamp_res tamp_compressor_compress_cb(TampCompressor* compressor, unsigned char* output, size_t output_size,
+                                     size_t* output_written_size, const unsigned char* input, size_t input_size,
+                                     size_t* input_consumed_size, tamp_callback_t callback, void* user_data) {
     tamp_res res;
     size_t input_consumed_size_proxy = 0, output_written_size_proxy = 0;
     size_t total_input_size = input_size;
@@ -663,8 +663,8 @@ tamp_res tamp_compressor_compress_cb(TampCompressor *compressor, unsigned char *
     return TAMP_OK;
 }
 
-tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                               size_t *output_written_size, bool write_token) {
+tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output, size_t output_size,
+                               size_t* output_written_size, bool write_token) {
     tamp_res res;
     size_t chunk_output_written_size;
     size_t output_written_size_proxy;
@@ -672,61 +672,52 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
     if (!output_written_size) output_written_size = &output_written_size_proxy;
     *output_written_size = 0;
 
-    while (compressor->input_size) {
-        // Compress the remainder of the input buffer.
+flush_check:
+    if (TAMP_LIKELY(compressor->input_size)) {
         res = tamp_compressor_poll(compressor, output, output_size, &chunk_output_written_size);
+    }
+#if TAMP_EXTENDED_COMPRESS
+    else if (compressor->conf.extended && compressor->rle_count >= 1) {
+        // Partial flush first to make room
+        res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
         (*output_written_size) += chunk_output_written_size;
         if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
         output_size -= chunk_output_written_size;
         output += chunk_output_written_size;
-    }
-
-#if TAMP_EXTENDED_COMPRESS
-    if (compressor->conf.extended) {
-        // Flush any pending RLE
-        if (compressor->rle_count >= 1) {
-            // Partial flush first to make room
-            res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
-            (*output_written_size) += chunk_output_written_size;
-            if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-            output_size -= chunk_output_written_size;
-            output += chunk_output_written_size;
-
-            if (compressor->rle_count == 1) {
-                // Single byte - write as literal (can't use RLE token for count < 2)
-                uint8_t literal = get_last_window_byte(compressor);
-                write_to_bit_buffer(compressor, IS_LITERAL_FLAG | literal, compressor->conf.literal + 1);
-
-                // Write to window
-                const uint16_t window_mask = (1 << compressor->conf.window) - 1;
-                compressor->window[compressor->window_pos] = literal;
-                compressor->window_pos = (compressor->window_pos + 1) & window_mask;
-            } else {
-                // count >= 2: write as RLE token
-                write_rle_token(compressor, compressor->rle_count);
-            }
-            compressor->rle_count = 0;
 
-            // Partial flush again after writing token
-            res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
-            (*output_written_size) += chunk_output_written_size;
-            if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-            output_size -= chunk_output_written_size;
-            output += chunk_output_written_size;
-        }
+        if (compressor->rle_count == 1) {
+            // Single byte - write as literal (can't use RLE token for count < 2)
+            uint8_t literal = get_last_window_byte(compressor);
+            write_to_bit_buffer(compressor, IS_LITERAL_FLAG | literal, compressor->conf.literal + 1);
 
-        // Flush any pending extended match
-        else if (compressor->extended_match_count) {
-            // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-            if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
-            res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
-            (*output_written_size) += chunk_output_written_size;
-            if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-            output_size -= chunk_output_written_size;
-            output += chunk_output_written_size;
+            // Write to window
+            const uint16_t window_mask = (1 << compressor->conf.window) - 1;
+            compressor->window[compressor->window_pos] = literal;
+            compressor->window_pos = (compressor->window_pos + 1) & window_mask;
+        } else {
+            // count >= 2: write as RLE token
+            write_rle_token(compressor, compressor->rle_count);
         }
+        compressor->rle_count = 0;
+
+        // Partial flush again after writing token
+        res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
+    } else if (compressor->conf.extended && compressor->extended_match_count) {
+        // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+        if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
+        res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
     }
 #endif  // TAMP_EXTENDED_COMPRESS
+    else {
+        goto flush_done;
+    }
+    (*output_written_size) += chunk_output_written_size;
+    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+    output_size -= chunk_output_written_size;
+    output += chunk_output_written_size;
+    goto flush_check;
+
+flush_done:
 
     // Perform partial flush to see if we need a FLUSH token (check if output buffer in not empty),
     // and to subsequently make room for the FLUSH token.
@@ -760,10 +751,10 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output
     return TAMP_OK;
 }
 
-tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor *compressor, unsigned char *output, size_t output_size,
-                                               size_t *output_written_size, const unsigned char *input,
-                                               size_t input_size, size_t *input_consumed_size, bool write_token,
-                                               tamp_callback_t callback, void *user_data) {
+tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor* compressor, unsigned char* output, size_t output_size,
+                                               size_t* output_written_size, const unsigned char* input,
+                                               size_t input_size, size_t* input_consumed_size, bool write_token,
+                                               tamp_callback_t callback, void* user_data) {
     tamp_res res;
     size_t flush_size;
     size_t output_written_size_proxy;
@@ -786,9 +777,9 @@ tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor *compressor, unsig
 
 #if TAMP_STREAM
 
-tamp_res tamp_compress_stream(TampCompressor *compressor, tamp_read_t read_cb, void *read_handle, tamp_write_t write_cb,
-                              void *write_handle, size_t *input_consumed_size, size_t *output_written_size,
-                              tamp_callback_t callback, void *user_data) {
+tamp_res tamp_compress_stream(TampCompressor* compressor, tamp_read_t read_cb, void* read_handle, tamp_write_t write_cb,
+                              void* write_handle, size_t* input_consumed_size, size_t* output_written_size,
+                              tamp_callback_t callback, void* user_data) {
     size_t input_consumed_size_proxy, output_written_size_proxy;
     if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy;
     if (!output_written_size) output_written_size = &output_written_size_proxy;

From 8e710df60a11598dcd156356f828dbbd848f1226 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 14:10:16 -0500
Subject: [PATCH 086/109] save 12 more bytes

---
 tamp/_c_src/tamp/compressor.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 800d6f54..613a253c 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -699,9 +699,7 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
             write_rle_token(compressor, compressor->rle_count);
         }
         compressor->rle_count = 0;
-
-        // Partial flush again after writing token
-        res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
+        chunk_output_written_size = 0;  // Already handled above; flush_done will flush token bits
     } else if (compressor->conf.extended && compressor->extended_match_count) {
         // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
         if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;

From e55710d15675fb34ef8fd198f44c0df51772442d Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 14:14:35 -0500
Subject: [PATCH 087/109] save 44 more bytes

---
 tamp/_c_src/tamp/compressor.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 613a253c..0de353d5 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -673,18 +673,18 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
     *output_written_size = 0;
 
 flush_check:
+    // Flush pending bits before checking for more work
+    res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
+    (*output_written_size) += chunk_output_written_size;
+    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+    output_size -= chunk_output_written_size;
+    output += chunk_output_written_size;
+
     if (TAMP_LIKELY(compressor->input_size)) {
         res = tamp_compressor_poll(compressor, output, output_size, &chunk_output_written_size);
     }
 #if TAMP_EXTENDED_COMPRESS
     else if (compressor->conf.extended && compressor->rle_count >= 1) {
-        // Partial flush first to make room
-        res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
-        (*output_written_size) += chunk_output_written_size;
-        if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-        output_size -= chunk_output_written_size;
-        output += chunk_output_written_size;
-
         if (compressor->rle_count == 1) {
             // Single byte - write as literal (can't use RLE token for count < 2)
             uint8_t literal = get_last_window_byte(compressor);
@@ -699,7 +699,7 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
             write_rle_token(compressor, compressor->rle_count);
         }
         compressor->rle_count = 0;
-        chunk_output_written_size = 0;  // Already handled above; flush_done will flush token bits
+        chunk_output_written_size = 0;
     } else if (compressor->conf.extended && compressor->extended_match_count) {
         // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
         if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
@@ -717,14 +717,6 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
 
 flush_done:
 
-    // Perform partial flush to see if we need a FLUSH token (check if output buffer in not empty),
-    // and to subsequently make room for the FLUSH token.
-    res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
-    output_size -= chunk_output_written_size;
-    (*output_written_size) += chunk_output_written_size;
-    output += chunk_output_written_size;
-    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-
     // Check if there's enough output buffer space
     if (compressor->bit_buffer_pos) {
         if (output_size == 0) {

From 65f9812077bcac0173367b3155ee47d3746ca04b Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 14:33:13 -0500
Subject: [PATCH 088/109] cleanup flush_done

---
 tamp/_c_src/tamp/compressor.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 0de353d5..d86a30a7 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -716,22 +716,25 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
     goto flush_check;
 
 flush_done:
-
-    // Check if there's enough output buffer space
-    if (compressor->bit_buffer_pos) {
-        if (output_size == 0) {
-            return TAMP_OUTPUT_FULL;
-        }
-        if (write_token) {
-            if (output_size < 2) return TAMP_OUTPUT_FULL;
-            write_to_bit_buffer(compressor, FLUSH_CODE, 9);
-        }
+    // At this point, up to 7 bits may remain in the compressor->bit_buffer
+    // The output buffer may have 0 bytes remaining.
+    if (write_token && compressor->bit_buffer_pos) {
+        // We don't want to write the FLUSH token to the bit_buffer unless
+        // we are confident that it'll wind up in the output buffer
+        // in THIS function call.
+        // Otherwise, if we wind up with a TAMP_OUTPUT_FULL result, we could
+        // end up accidentally writing multiple FLUSH tokens.
+        if (TAMP_UNLIKELY(output_size < 2)) return TAMP_OUTPUT_FULL;
+        write_to_bit_buffer(compressor, FLUSH_CODE, 9);
     }
 
+    // At this point, up to 16 bits may remain in the compressor->bit_buffer
+    // The output buffer may have 0 bytes remaining.
+
     // Flush the remainder of the output bit-buffer
     while (compressor->bit_buffer_pos) {
-        *output = compressor->bit_buffer >> 24;
-        output++;
+        if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL;
+        *output++ = compressor->bit_buffer >> 24;
         compressor->bit_buffer <<= 8;
         compressor->bit_buffer_pos -= MIN(compressor->bit_buffer_pos, 8);
         output_size--;

From 9aefc6e1df2dcb7ab8ae5e2b61f0925e11ee0149 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 14:58:56 -0500
Subject: [PATCH 089/109] update expected javascript hash

---
 .github/workflows/javascript.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/javascript.yaml b/.github/workflows/javascript.yaml
index 00fdd5be..4dc0fa73 100644
--- a/.github/workflows/javascript.yaml
+++ b/.github/workflows/javascript.yaml
@@ -99,9 +99,9 @@ jobs:
           cd build
           HASH=$(sha256sum enwik8-js.tamp | cut -d' ' -f1)
           echo "Compression hash: $HASH"
-          if [ "$HASH" != "02e05af059a0040d641988075cf1dfc479a084f9a34b5c8a348354211c5fa038" ]; then
+          if [ "$HASH" != "dd5e431b0cbaa6ee001b10493c2b08e6235f42e3f4ce00958e88b1b97581872e" ]; then
             echo "❌ Hash mismatch!"
-            echo "Expected: 02e05af059a0040d641988075cf1dfc479a084f9a34b5c8a348354211c5fa038"
+            echo "Expected: dd5e431b0cbaa6ee001b10493c2b08e6235f42e3f4ce00958e88b1b97581872e"
             echo "Got:      $HASH"
             exit 1
           fi

From f47b54740d44a4c0bd38ac175bb6d8f8cd6f3dd6 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 15:09:16 -0500
Subject: [PATCH 090/109] consolidate extended compression functions

---
 tamp/_c_src/tamp/compressor.c | 167 +++++++++++++++++-----------------
 1 file changed, 81 insertions(+), 86 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index d86a30a7..d3e3f40f 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -45,25 +45,6 @@ static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor* compressor, uint32
     compressor->bit_buffer |= bits << (32 - compressor->bit_buffer_pos);
 }
 
-#if TAMP_EXTENDED_COMPRESS
-/**
- * @brief Write extended huffman encoding (huffman + trailing bits).
- *
- * Used for both RLE count and extended match size encoding.
- *
- * @param[in,out] compressor Compressor with bit buffer.
- * @param[in] value The value to encode.
- * @param[in] trailing_bits Number of trailing bits (3 for extended match, 4 for RLE).
- */
-static TAMP_NOINLINE void write_extended_huffman(TampCompressor* compressor, uint8_t value, uint8_t trailing_bits) {
-    uint8_t code_index = value >> trailing_bits;
-    // Write huffman code (without literal flag) + trailing bits in one call
-    write_to_bit_buffer(compressor, (huffman_codes[code_index] << trailing_bits) | (value & ((1 << trailing_bits) - 1)),
-                        (huffman_bits[code_index] - 1) + trailing_bits);
-}
-
-#endif  // TAMP_EXTENDED_COMPRESS
-
 /**
  * @brief Partially flush the internal bit buffer.
  *
@@ -156,7 +137,88 @@ static TAMP_NOINLINE void find_best_match(TampCompressor* compressor, uint16_t*
 
 #endif
 
+#if TAMP_LAZY_MATCHING
+/**
+ * @brief Check if writing a single byte will overlap with a future match section.
+ *
+ * @param[in] write_pos Position where the single byte will be written.
+ * @param[in] match_index Index in window where the match starts.
+ * @param[in] match_size Size of the match to validate.
+ * @return true if no overlap (match is safe), false if there's overlap.
+ */
+static inline bool validate_no_match_overlap(uint16_t write_pos, uint16_t match_index, uint8_t match_size) {
+    // Check if write position falls within the match range [match_index, match_index + match_size - 1]
+    return write_pos < match_index || write_pos >= match_index + match_size;
+}
+#endif
+
+tamp_res tamp_compressor_init(TampCompressor* compressor, const TampConf* conf, unsigned char* window) {
+    const TampConf conf_default = {
+        .window = 10,
+        .literal = 8,
+        .use_custom_dictionary = false,
+#if TAMP_LAZY_MATCHING
+        .lazy_matching = false,
+#endif
+#if TAMP_EXTENDED_COMPRESS
+        .extended = true,  // Default to extended format
+#endif
+    };
+    if (!conf) conf = &conf_default;
+    if (conf->window < 8 || conf->window > 15) return TAMP_INVALID_CONF;
+    if (conf->literal < 5 || conf->literal > 8) return TAMP_INVALID_CONF;
+#if !TAMP_EXTENDED_COMPRESS
+    if (conf->extended) return TAMP_INVALID_CONF;  // Extended requested but not compiled in
+#endif
+
+    for (uint8_t i = 0; i < sizeof(TampCompressor); i++)  // Zero-out the struct
+        ((unsigned char*)compressor)[i] = 0;
+
+    // Build header directly from conf (8 bits total)
+    // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1]
+    uint8_t header = ((conf->window - 8) << 5) | ((conf->literal - 5) << 3) | (conf->use_custom_dictionary << 2) |
+                     (conf->extended << 1);
+
+    compressor->conf = *conf;  // Single struct copy
+    compressor->window = window;
+    compressor->min_pattern_size = tamp_compute_min_pattern_size(conf->window, conf->literal);
+
+#if TAMP_LAZY_MATCHING
+    compressor->cached_match_index = -1;  // Initialize cache as invalid
+#endif
+
+    if (!conf->use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window));
+
+    write_to_bit_buffer(compressor, header, 8);
+
+    return TAMP_OK;
+}
+
 #if TAMP_EXTENDED_COMPRESS
+/**
+ * @brief Write extended huffman encoding (huffman + trailing bits).
+ *
+ * Used for both RLE count and extended match size encoding.
+ *
+ * @param[in,out] compressor Compressor with bit buffer.
+ * @param[in] value The value to encode.
+ * @param[in] trailing_bits Number of trailing bits (3 for extended match, 4 for RLE).
+ */
+static TAMP_NOINLINE void write_extended_huffman(TampCompressor* compressor, uint8_t value, uint8_t trailing_bits) {
+    uint8_t code_index = value >> trailing_bits;
+    // Write huffman code (without literal flag) + trailing bits in one call
+    write_to_bit_buffer(compressor, (huffman_codes[code_index] << trailing_bits) | (value & ((1 << trailing_bits) - 1)),
+                        (huffman_bits[code_index] - 1) + trailing_bits);
+}
+
+/**
+ * @brief Get the last byte written to the window.
+ */
+static inline uint8_t get_last_window_byte(TampCompressor* compressor) {
+    uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf.window) - 1);
+    return compressor->window[prev_pos];
+}
+
 /**
  * @brief Search for extended match continuation using implicit pattern comparison.
  *
@@ -224,73 +286,6 @@ static inline void find_extended_match(TampCompressor* compressor, uint16_t curr
         }
     }
 }
-#endif  // TAMP_EXTENDED_COMPRESS
-
-#if TAMP_LAZY_MATCHING
-/**
- * @brief Check if writing a single byte will overlap with a future match section.
- *
- * @param[in] write_pos Position where the single byte will be written.
- * @param[in] match_index Index in window where the match starts.
- * @param[in] match_size Size of the match to validate.
- * @return true if no overlap (match is safe), false if there's overlap.
- */
-static inline bool validate_no_match_overlap(uint16_t write_pos, uint16_t match_index, uint8_t match_size) {
-    // Check if write position falls within the match range [match_index, match_index + match_size - 1]
-    return write_pos < match_index || write_pos >= match_index + match_size;
-}
-#endif
-
-tamp_res tamp_compressor_init(TampCompressor* compressor, const TampConf* conf, unsigned char* window) {
-    const TampConf conf_default = {
-        .window = 10,
-        .literal = 8,
-        .use_custom_dictionary = false,
-#if TAMP_LAZY_MATCHING
-        .lazy_matching = false,
-#endif
-#if TAMP_EXTENDED_COMPRESS
-        .extended = true,  // Default to extended format
-#endif
-    };
-    if (!conf) conf = &conf_default;
-    if (conf->window < 8 || conf->window > 15) return TAMP_INVALID_CONF;
-    if (conf->literal < 5 || conf->literal > 8) return TAMP_INVALID_CONF;
-#if !TAMP_EXTENDED_COMPRESS
-    if (conf->extended) return TAMP_INVALID_CONF;  // Extended requested but not compiled in
-#endif
-
-    for (uint8_t i = 0; i < sizeof(TampCompressor); i++)  // Zero-out the struct
-        ((unsigned char*)compressor)[i] = 0;
-
-    // Build header directly from conf (8 bits total)
-    // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1]
-    uint8_t header = ((conf->window - 8) << 5) | ((conf->literal - 5) << 3) | (conf->use_custom_dictionary << 2) |
-                     (conf->extended << 1);
-
-    compressor->conf = *conf;  // Single struct copy
-    compressor->window = window;
-    compressor->min_pattern_size = tamp_compute_min_pattern_size(conf->window, conf->literal);
-
-#if TAMP_LAZY_MATCHING
-    compressor->cached_match_index = -1;  // Initialize cache as invalid
-#endif
-
-    if (!conf->use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window));
-
-    write_to_bit_buffer(compressor, header, 8);
-
-    return TAMP_OK;
-}
-
-#if TAMP_EXTENDED_COMPRESS
-/**
- * @brief Get the last byte written to the window.
- */
-static inline uint8_t get_last_window_byte(TampCompressor* compressor) {
-    uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf.window) - 1);
-    return compressor->window[prev_pos];
-}
 
 /**
  * @brief Write RLE token to bit buffer and update window.

From ea38c62dbdfde8146d957b0b0b8bb7f1373fa3ba Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 15:23:17 -0500
Subject: [PATCH 091/109] simplify find_extended_match, deduplicate checks from
 caller

---
 tamp/_c_src/tamp/compressor.c | 44 +++++++++++------------------------
 1 file changed, 13 insertions(+), 31 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index d3e3f40f..5511c215 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -234,51 +234,33 @@ static inline uint8_t get_last_window_byte(TampCompressor* compressor) {
  */
 static inline void find_extended_match(TampCompressor* compressor, uint16_t current_pos, uint8_t current_count,
                                        uint16_t* new_pos, uint8_t* new_count) {
+    // Preconditions (guaranteed by caller):
+    // - input_size > 0
+    // - current_pos + current_count < WINDOW_SIZE
+    // - current_count < MAX_PATTERN_SIZE
     *new_count = 0;
     const unsigned char* window = compressor->window;
     const uint16_t window_size = WINDOW_SIZE;
     const uint8_t max_pattern = MIN(current_count + compressor->input_size, MAX_PATTERN_SIZE);
-
-    // Need at least current_count + 1 to find a longer match, and room in window
-    if (max_pattern <= current_count) return;
-    if (current_pos + current_count + 1 > window_size) return;
-
-    // First two bytes of pattern (from window at current_pos)
-    const uint8_t first_byte = window[current_pos];
-    const uint8_t second_byte = window[current_pos + 1];
-
-    // The target byte to extend by (input[0], like Python does)
     const uint8_t extend_byte = read_input(0);
 
-    // Search candidates that can fit at least current_count + 1 bytes
     for (uint16_t cand = current_pos; cand + current_count + 1 <= window_size; cand++) {
-        // Quick 2-byte check
-        if (TAMP_LIKELY(window[cand] != first_byte)) continue;
-        if (TAMP_LIKELY(window[cand + 1] != second_byte)) continue;
-
-        // Check if all current_count bytes match
-        bool full_match = true;
-        for (uint8_t i = 2; i < current_count; i++) {
-            if (window[cand + i] != window[current_pos + i]) {
-                full_match = false;
-                break;
-            }
-        }
-        if (!full_match) continue;
-
-        // Check if the extension byte matches
+        // Check extension byte first (most discriminating)
         if (window[cand + current_count] != extend_byte) continue;
 
-        // Found a match of current_count + 1 bytes - now extend as far as possible
+        // Check if current_count bytes match (at cand==current_pos, compares with self)
+        uint8_t i = 0;
+        while (i < current_count && window[cand + i] == window[current_pos + i]) i++;
+        if (i < current_count) continue;
+
+        // Found a match - extend as far as possible
         const uint8_t cand_max = MIN(max_pattern, window_size - cand);
         uint8_t match_len = current_count + 1;
-        for (uint8_t i = current_count + 1; i < cand_max; i++) {
-            uint8_t target = read_input(i - current_count);
-            if (window[cand + i] != target) break;
+        for (i = current_count + 1; i < cand_max; i++) {
+            if (window[cand + i] != read_input(i - current_count)) break;
             match_len = i + 1;
         }
 
-        // Track this match (guaranteed > current_count)
         if (match_len > *new_count) {
             *new_count = match_len;
             *new_pos = cand;

From 178bf19f0199c53a1dacacd3dcc33ca49bcb5bf2 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 15:28:59 -0500
Subject: [PATCH 092/109] don't need to reset extended_match_position.

---
 tamp/_c_src/tamp/compressor.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 5511c215..5dc4fd86 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -342,9 +342,7 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compres
     uint8_t window_write = MIN(count, remaining);
     tamp_window_copy(compressor->window, &compressor->window_pos, position, window_write, window_mask);
 
-    // Reset extended match state
-    compressor->extended_match_count = 0;
-    compressor->extended_match_position = 0;
+    compressor->extended_match_count = 0;  // Position reset not needed - only read when count > 0
 
     return TAMP_OK;
 }

From 330e5f4c85139964746630a17b9d31b883c79b53 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 15:52:01 -0500
Subject: [PATCH 093/109] combine if-statements

---
 tamp/_c_src/tamp/compressor.c | 94 +++++++++++++++++------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 5dc4fd86..f5787d68 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -381,28 +381,54 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
     uint16_t match_index = 0;
 
 #if TAMP_EXTENDED_COMPRESS
-    // Extended: Handle extended match continuation
-    if (TAMP_UNLIKELY(conf_extended && compressor->extended_match_count)) {
-        // We're in extended match mode - try to extend the match
-        const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
+    if (TAMP_UNLIKELY(conf_extended)) {
+        // Handle extended match continuation
+        if (compressor->extended_match_count) {
+            // We're in extended match mode - try to extend the match
+            const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
+
+            while (compressor->input_size > 0) {
+                const uint16_t current_pos = compressor->extended_match_position;
+                const uint8_t current_count = compressor->extended_match_count;
+
+                // Check if extending would go beyond window buffer boundary (no wrap-around)
+                if (current_pos + current_count >= WINDOW_SIZE) {
+                    // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+                    if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
+                    size_t token_bytes;
+                    res = write_extended_match_token(compressor, output, output_size, &token_bytes);
+                    (*output_written_size) += token_bytes;
+                    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+                    return TAMP_OK;
+                }
 
-        while (compressor->input_size > 0) {
-            const uint16_t current_pos = compressor->extended_match_position;
-            const uint8_t current_count = compressor->extended_match_count;
+                // Check if we've reached max extended match size
+                if (current_count >= max_ext_match) {
+                    // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+                    if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
+                    size_t token_bytes;
+                    res = write_extended_match_token(compressor, output, output_size, &token_bytes);
+                    (*output_written_size) += token_bytes;
+                    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+                    return TAMP_OK;
+                }
 
-            // Check if extending would go beyond window buffer boundary (no wrap-around)
-            if (current_pos + current_count >= WINDOW_SIZE) {
-                // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-                if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
-                size_t token_bytes;
-                res = write_extended_match_token(compressor, output, output_size, &token_bytes);
-                (*output_written_size) += token_bytes;
-                if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-                return TAMP_OK;
-            }
+                // Search for longer match (includes O(1) extension at same position)
+                uint16_t new_pos = 0;
+                uint8_t new_count;
+                find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count);
+
+                if (new_count > current_count) {
+                    // Found longer match - update and continue
+                    uint8_t extra_bytes = new_count - current_count;
+                    compressor->extended_match_position = new_pos;
+                    compressor->extended_match_count = new_count;
+                    compressor->input_pos = input_add(extra_bytes);
+                    compressor->input_size -= extra_bytes;
+                    continue;
+                }
 
-            // Check if we've reached max extended match size
-            if (current_count >= max_ext_match) {
+                // No longer match found - emit current match
                 // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
                 if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
                 size_t token_bytes;
@@ -411,37 +437,11 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
                 if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
                 return TAMP_OK;
             }
-
-            // Search for longer match (includes O(1) extension at same position)
-            uint16_t new_pos = 0;
-            uint8_t new_count;
-            find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count);
-
-            if (new_count > current_count) {
-                // Found longer match - update and continue
-                uint8_t extra_bytes = new_count - current_count;
-                compressor->extended_match_position = new_pos;
-                compressor->extended_match_count = new_count;
-                compressor->input_pos = input_add(extra_bytes);
-                compressor->input_size -= extra_bytes;
-                continue;
-            }
-
-            // No longer match found - emit current match
-            // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-            if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
-            size_t token_bytes;
-            res = write_extended_match_token(compressor, output, output_size, &token_bytes);
-            (*output_written_size) += token_bytes;
-            if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
+            // Ran out of input while extending - return and wait for more
             return TAMP_OK;
         }
-        // Ran out of input while extending - return and wait for more
-        return TAMP_OK;
-    }
 
-    // Extended: Handle RLE accumulation with persistent state
-    if (TAMP_UNLIKELY(conf_extended)) {
+        // Handle RLE accumulation with persistent state
         uint8_t last_byte = get_last_window_byte(compressor);
 
         // Count RLE bytes in current buffer WITHOUT consuming yet

From 83d3d8ed09c7f08f64c8262c9afc5ebb9e0adafd Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 16:13:00 -0500
Subject: [PATCH 094/109] consolidate write_extended_match_token output size
 checks.

---
 tamp/_c_src/tamp/compressor.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index f5787d68..c74da71c 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -308,14 +308,17 @@ static TAMP_NOINLINE void write_rle_token(TampCompressor* compressor, uint8_t co
  */
 static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compressor, unsigned char* output,
                                                          size_t output_size, size_t* output_written_size) {
+    *output_written_size = 0;
+
+    // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
+    if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
+
     const uint16_t window_mask = (1 << compressor->conf.window) - 1;
     const uint8_t count = compressor->extended_match_count;
     const uint16_t position = compressor->extended_match_position;
     tamp_res res;
     size_t flush_bytes;
 
-    *output_written_size = 0;
-
     // Write symbol (7 bits) + extended huffman (up to 11 bits) = 18 bits max
     // With ≤7 bits already in buffer, total ≤25 bits - fits in 32-bit buffer
     write_to_bit_buffer(compressor, huffman_codes[TAMP_EXTENDED_MATCH_SYMBOL],
@@ -393,8 +396,6 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
 
                 // Check if extending would go beyond window buffer boundary (no wrap-around)
                 if (current_pos + current_count >= WINDOW_SIZE) {
-                    // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-                    if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
                     size_t token_bytes;
                     res = write_extended_match_token(compressor, output, output_size, &token_bytes);
                     (*output_written_size) += token_bytes;
@@ -404,8 +405,6 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
 
                 // Check if we've reached max extended match size
                 if (current_count >= max_ext_match) {
-                    // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-                    if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
                     size_t token_bytes;
                     res = write_extended_match_token(compressor, output, output_size, &token_bytes);
                     (*output_written_size) += token_bytes;
@@ -429,8 +428,6 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
                 }
 
                 // No longer match found - emit current match
-                // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-                if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
                 size_t token_bytes;
                 res = write_extended_match_token(compressor, output, output_size, &token_bytes);
                 (*output_written_size) += token_bytes;
@@ -676,8 +673,6 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
         compressor->rle_count = 0;
         chunk_output_written_size = 0;
     } else if (compressor->conf.extended && compressor->extended_match_count) {
-        // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-        if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
         res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
     }
 #endif  // TAMP_EXTENDED_COMPRESS

From d0c2b36001fd27ad984defa13997802a3e2d67e6 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 16:35:57 -0500
Subject: [PATCH 095/109] consolidate output arithmetic to partial_flush.

---
 tamp/_c_src/tamp/compressor.c | 35 +++++++++++++----------------------
 1 file changed, 13 insertions(+), 22 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index c74da71c..5e471b0f 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -50,12 +50,15 @@ static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor* compressor, uint32
  *
  * Flushes complete bytes from the bit buffer. Up to 7 bits may remain.
  */
-static TAMP_NOINLINE tamp_res partial_flush(TampCompressor* compressor, unsigned char* output, size_t output_size,
+static TAMP_NOINLINE tamp_res partial_flush(TampCompressor* compressor, unsigned char** output, size_t* output_size,
                                             size_t* output_written_size) {
-    for (*output_written_size = output_size; compressor->bit_buffer_pos >= 8 && output_size;
-         output_size--, compressor->bit_buffer_pos -= 8, compressor->bit_buffer <<= 8)
-        *output++ = compressor->bit_buffer >> 24;
-    *output_written_size -= output_size;
+    while (compressor->bit_buffer_pos >= 8 && *output_size) {
+        *(*output)++ = compressor->bit_buffer >> 24;
+        (*output_size)--;
+        (*output_written_size)++;
+        compressor->bit_buffer_pos -= 8;
+        compressor->bit_buffer <<= 8;
+    }
     return (compressor->bit_buffer_pos >= 8) ? TAMP_OUTPUT_FULL : TAMP_OK;
 }
 
@@ -317,7 +320,6 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compres
     const uint8_t count = compressor->extended_match_count;
     const uint16_t position = compressor->extended_match_position;
     tamp_res res;
-    size_t flush_bytes;
 
     // Write symbol (7 bits) + extended huffman (up to 11 bits) = 18 bits max
     // With ≤7 bits already in buffer, total ≤25 bits - fits in 32-bit buffer
@@ -326,18 +328,14 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compres
     write_extended_huffman(compressor, count - compressor->min_pattern_size - 11 - 1, TAMP_LEADING_EXTENDED_MATCH_BITS);
 
     // Flush to make room for window position (up to 15 bits)
-    res = partial_flush(compressor, output, output_size, &flush_bytes);
-    *output_written_size += flush_bytes;
-    output += flush_bytes;
-    output_size -= flush_bytes;
+    res = partial_flush(compressor, &output, &output_size, output_written_size);
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
     // Write window position - with ≤7 bits remaining, up to 22 bits total - fits
     write_to_bit_buffer(compressor, position, compressor->conf.window);
 
     // Final flush
-    res = partial_flush(compressor, output, output_size, &flush_bytes);
-    *output_written_size += flush_bytes;
+    res = partial_flush(compressor, &output, &output_size, output_written_size);
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
     // Write to window (up to end of buffer, no wrap)
@@ -370,12 +368,8 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
 
     {
         // Make sure there's enough room in the bit buffer.
-        size_t flush_bytes_written;
-        res = partial_flush(compressor, output, output_size, &flush_bytes_written);
-        (*output_written_size) += flush_bytes_written;
+        res = partial_flush(compressor, &output, &output_size, output_written_size);
         if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-        output_size -= flush_bytes_written;
-        output += flush_bytes_written;  // cppcheck-suppress unreadVariable
     }
 
     if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL;
@@ -413,7 +407,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
                 }
 
                 // Search for longer match (includes O(1) extension at same position)
-                uint16_t new_pos = 0;
+                uint16_t new_pos;
                 uint8_t new_count;
                 find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count);
 
@@ -646,11 +640,8 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
 
 flush_check:
     // Flush pending bits before checking for more work
-    res = partial_flush(compressor, output, output_size, &chunk_output_written_size);
-    (*output_written_size) += chunk_output_written_size;
+    res = partial_flush(compressor, &output, &output_size, output_written_size);
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-    output_size -= chunk_output_written_size;
-    output += chunk_output_written_size;
 
     if (TAMP_LIKELY(compressor->input_size)) {
         res = tamp_compressor_poll(compressor, output, output_size, &chunk_output_written_size);

From 7abf720fd4da81b066755f8011f3ee25f25dd441 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 16:43:57 -0500
Subject: [PATCH 096/109] consolidate output arithmetic to
 write_extended_match_token.

---
 tamp/_c_src/tamp/compressor.c | 34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 5e471b0f..658a7157 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -304,17 +304,15 @@ static TAMP_NOINLINE void write_rle_token(TampCompressor* compressor, uint8_t co
  * Total: up to 33 bits. We flush after symbol+huffman (18 bits max) to ensure window_pos fits.
  *
  * @param[in,out] compressor Compressor state.
- * @param[out] output Output buffer for flushed bytes.
- * @param[in] output_size Available space in output buffer.
- * @param[out] output_written_size Bytes written to output.
+ * @param[in,out] output Output buffer pointer (updated on return).
+ * @param[in,out] output_size Available space (updated on return).
+ * @param[in,out] output_written_size Bytes written (accumulated).
  * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small.
  */
-static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compressor, unsigned char* output,
-                                                         size_t output_size, size_t* output_written_size) {
-    *output_written_size = 0;
-
+static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compressor, unsigned char** output,
+                                                         size_t* output_size, size_t* output_written_size) {
     // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
-    if (TAMP_UNLIKELY(output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
+    if (TAMP_UNLIKELY(*output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL;
 
     const uint16_t window_mask = (1 << compressor->conf.window) - 1;
     const uint8_t count = compressor->extended_match_count;
@@ -328,14 +326,14 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compres
     write_extended_huffman(compressor, count - compressor->min_pattern_size - 11 - 1, TAMP_LEADING_EXTENDED_MATCH_BITS);
 
     // Flush to make room for window position (up to 15 bits)
-    res = partial_flush(compressor, &output, &output_size, output_written_size);
+    res = partial_flush(compressor, output, output_size, output_written_size);
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
     // Write window position - with ≤7 bits remaining, up to 22 bits total - fits
     write_to_bit_buffer(compressor, position, compressor->conf.window);
 
     // Final flush
-    res = partial_flush(compressor, &output, &output_size, output_written_size);
+    res = partial_flush(compressor, output, output_size, output_written_size);
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
     // Write to window (up to end of buffer, no wrap)
@@ -390,18 +388,14 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
 
                 // Check if extending would go beyond window buffer boundary (no wrap-around)
                 if (current_pos + current_count >= WINDOW_SIZE) {
-                    size_t token_bytes;
-                    res = write_extended_match_token(compressor, output, output_size, &token_bytes);
-                    (*output_written_size) += token_bytes;
+                    res = write_extended_match_token(compressor, &output, &output_size, output_written_size);
                     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
                     return TAMP_OK;
                 }
 
                 // Check if we've reached max extended match size
                 if (current_count >= max_ext_match) {
-                    size_t token_bytes;
-                    res = write_extended_match_token(compressor, output, output_size, &token_bytes);
-                    (*output_written_size) += token_bytes;
+                    res = write_extended_match_token(compressor, &output, &output_size, output_written_size);
                     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
                     return TAMP_OK;
                 }
@@ -422,9 +416,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
                 }
 
                 // No longer match found - emit current match
-                size_t token_bytes;
-                res = write_extended_match_token(compressor, output, output_size, &token_bytes);
-                (*output_written_size) += token_bytes;
+                res = write_extended_match_token(compressor, &output, &output_size, output_written_size);
                 if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
                 return TAMP_OK;
             }
@@ -640,6 +632,7 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
 
 flush_check:
     // Flush pending bits before checking for more work
+    chunk_output_written_size = 0;
     res = partial_flush(compressor, &output, &output_size, output_written_size);
     if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
@@ -662,9 +655,8 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
             write_rle_token(compressor, compressor->rle_count);
         }
         compressor->rle_count = 0;
-        chunk_output_written_size = 0;
     } else if (compressor->conf.extended && compressor->extended_match_count) {
-        res = write_extended_match_token(compressor, output, output_size, &chunk_output_written_size);
+        res = write_extended_match_token(compressor, &output, &output_size, output_written_size);
     }
 #endif  // TAMP_EXTENDED_COMPRESS
     else {

From 750b00287d16c20b29fba93fec400527010b4d16 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 16:44:52 -0500
Subject: [PATCH 097/109] update docstrings

---
 tamp/_c_src/tamp/compressor.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 658a7157..6779a67f 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -49,6 +49,12 @@ static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor* compressor, uint32
  * @brief Partially flush the internal bit buffer.
  *
  * Flushes complete bytes from the bit buffer. Up to 7 bits may remain.
+ *
+ * @param[in,out] compressor Compressor state.
+ * @param[in,out] output Output buffer pointer (updated on return).
+ * @param[in,out] output_size Available space (updated on return).
+ * @param[in,out] output_written_size Bytes written (accumulated).
+ * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small.
  */
 static TAMP_NOINLINE tamp_res partial_flush(TampCompressor* compressor, unsigned char** output, size_t* output_size,
                                             size_t* output_written_size) {

From f68aa6ff4210372fd678952d0e75dd57ccaf1104 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 17:00:52 -0500
Subject: [PATCH 098/109] get rid of useless brackets

---
 tamp/_c_src/tamp/compressor.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 6779a67f..6181c5bd 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -370,11 +370,9 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
 
     if (TAMP_UNLIKELY(compressor->input_size == 0)) return TAMP_OK;
 
-    {
-        // Make sure there's enough room in the bit buffer.
-        res = partial_flush(compressor, &output, &output_size, output_written_size);
-        if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-    }
+    // Make sure there's enough room in the bit buffer.
+    res = partial_flush(compressor, &output, &output_size, output_written_size);
+    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
 
     if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL;
 

From f8b1bfc990c2d1eef2be89b7cb2208587bff2a39 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 19:49:14 -0500
Subject: [PATCH 099/109] update CLAUDE.md

---
 CLAUDE.md | 111 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 93 insertions(+), 18 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 1dd02c31..5ba7e8cd 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -26,9 +26,13 @@ different platforms:
 **Shared C Source:** All implementations use the same C source code in
 `tamp/_c_src/tamp/`:
 
-- `common.h/c` - Shared utilities and data structures
-- `compressor.h/c` - Compression implementation
+- `common.h/c` - Shared utilities, data structures, stream I/O callbacks, and
+  dictionary initialization
+- `compressor.h/c` - Compression implementation (sink/poll low-level API and
+  higher-level compress/flush API)
 - `decompressor.h/c` - Decompression implementation
+- `compressor_find_match_desktop.c` - Desktop-optimized match finding (included
+  by `compressor.c` on non-embedded targets)
 
 ## Development Commands
 
@@ -162,38 +166,81 @@ make website-clean         # Clean website build artifacts
 **WebAssembly Build Process:**
 
 1. `wasm/Makefile` compiles C source to WebAssembly using Emscripten
-2. `wasm/scripts/build.js` generates multiple JS/TS distribution formats
+2. `tsup` (via `npm run build:js`) bundles into multiple JS/TS distribution
+   formats (CJS, ESM, `.d.ts`)
 3. Exports specific C functions and runtime methods for JS interop
 
-**Configuration Flags:**
+**Configuration Flags (compile-time `-D` defines):**
 
-- `TAMP_LAZY_MATCHING=1` - Enable lazy matching optimization (default)
-- `TAMP_ESP32=1` - ESP32-specific optimizations
+- `TAMP_LAZY_MATCHING=1` - Enable lazy matching optimization (default in
+  build.py)
+- `TAMP_ESP32=1` - ESP32-specific optimizations (avoids bitfields for speed)
 - `TAMP_COMPRESSOR`/`TAMP_DECOMPRESSOR` - Include/exclude components
+- `TAMP_EXTENDED=1` - Master switch for extended format: RLE and extended match
+  (default: 1). `TAMP_EXTENDED_COMPRESS` and `TAMP_EXTENDED_DECOMPRESS` can
+  individually override.
+- `TAMP_STREAM=1` - Include stream API (default: 1). Disable with
+  `-DTAMP_STREAM=0` to save ~2.8KB.
+- `TAMP_STREAM_WORK_BUFFER_SIZE=32` - Stack-allocated work buffer for stream API
+  (default: 32 bytes, 256+ recommended for performance)
+- `TAMP_STREAM_MEMORY` / `TAMP_STREAM_STDIO` / `TAMP_STREAM_LITTLEFS` /
+  `TAMP_STREAM_FATFS` - Enable built-in I/O handlers for specific backends
+- `TAMP_USE_EMBEDDED_MATCH=1` - Force embedded `find_best_match` implementation
+  on desktop (for testing)
+
+**Build Environment Variables (Python):**
+
+- `TAMP_SANITIZE=1` - Enable AddressSanitizer + UBSan
+- `TAMP_PROFILE=1` - Enable profiling (line trace, debug info)
+- `TAMP_USE_EMBEDDED_MATCH=1` - Force embedded match finding
+- `TAMP_BUILD_C_EXTENSIONS=0` - Skip building C extensions entirely
+- `CIBUILDWHEEL=1` - CI wheel building mode (disables allowed_to_fail)
 
 ### Testing Strategy
 
 **Multi-layered Testing:**
 
-- **Python tests** (`tests/`) - Core algorithm testing using pytest
+- **Python tests** (`tests/`) - Core algorithm testing using pytest. Includes
+  bit reader/writer, compressor, decompressor, round-trip, CLI, dataset
+  regression, and file interface tests.
 - **WebAssembly tests** (`wasm/test/`) - JS/TS API testing with Node.js test
-  runner
+  runner (`node --test`)
 - **C tests** (`ctests/`) - Low-level C API testing using Unity framework
+  (submodule at `ctests/Unity/`). Includes stream API tests and filesystem
+  integration tests with LittleFS and FatFS RAM backends.
 - **Integration tests** - Cross-platform compatibility and performance
   benchmarks
 
 **Test Data Sources:**
 
-- Enwik8 dataset (100MB) for performance benchmarking
-- Silesia corpus for compression ratio evaluation
+- Enwik8 dataset (100MB) for performance benchmarking (`make download-enwik8`)
+- Silesia corpus for compression ratio evaluation (`make download-silesia`)
 - Custom test cases for edge conditions
 
+### Compressor Architecture
+
+The C compressor uses a two-phase low-level API:
+
+1. `tamp_compressor_sink()` - Copies input bytes into a 16-byte internal ring
+   buffer (cheap/fast)
+2. `tamp_compressor_poll()` - Runs one compression iteration on the internal
+   buffer (computationally intensive)
+
+Higher-level convenience functions (`tamp_compressor_compress`,
+`tamp_compressor_compress_and_flush`) wrap these. Callback variants (`_cb`
+suffix) accept a `tamp_callback_t` progress callback.
+
+The stream API (`tamp_compress_stream`, `tamp_decompress_stream`) provides a
+file-oriented interface using read/write callbacks, supporting multiple I/O
+backends (memory, stdio, LittleFS, FatFS).
+
 ### Memory Management Patterns
 
 **Key Principle:** Fixed memory usage during compression/decompression
 
 - Window size determines memory usage: `(1 << windowBits)` bytes
 - No dynamic allocation during compression/decompression operations
+- Stream API uses a stack-allocated work buffer (`TAMP_STREAM_WORK_BUFFER_SIZE`)
 - Streaming interfaces require explicit resource management (`destroy()` calls
   in JS/TS)
 
@@ -202,7 +249,9 @@ make website-clean         # Clean website build artifacts
 ### Making Changes to Core Algorithm
 
 1. **Modify C source** in `tamp/_c_src/tamp/`
-2. **Rebuild all implementations:**
+2. **Update pure Python reference** in `tamp/compressor.py` /
+   `tamp/decompressor.py` to match
+3. **Rebuild all implementations:**
 
    ```bash
    # Python
@@ -212,11 +261,12 @@ make website-clean         # Clean website build artifacts
    cd wasm && npm run build
    ```
 
-3. **Run comprehensive tests:**
+4. **Run comprehensive tests:**
    ```bash
-   make test              # Python + MicroPython
+   poetry run pytest      # Python tests
+   make c-test            # C unit tests with sanitizers
+   make c-test-embedded   # C tests with embedded match finding
    cd wasm && npm test    # WebAssembly
-   make c-test           # C unit tests
    ```
 
 ### Adding New Features
@@ -232,11 +282,13 @@ make website-clean         # Clean website build artifacts
 - **Use provided benchmarking tools:**
   ```bash
   make on-device-compression-benchmark     # MicroPython performance
-  npm run test:enwik8                     # WebAssembly performance
-  python tools/performance-benchmark.sh   # Python performance
+  cd wasm && npm run test:enwik8          # WebAssembly performance
+  bash tools/performance-benchmark.sh     # Python performance
+  make c-benchmark-stream                 # C stream API benchmark
+  make binary-size                        # ARM binary size table
   ```
-- **Profile with:** `tools/profiler.py` for Python, browser dev tools for
-  WebAssembly
+- **Profile with:** `tools/profiler.py` for Python (requires `TAMP_PROFILE=1`),
+  browser dev tools for WebAssembly
 
 ### Release Process
 
@@ -247,6 +299,29 @@ make website-clean         # Clean website build artifacts
    - WebAssembly npm package
 3. **CI/CD handles** cross-platform builds and testing
 
+### Python Import Fallback Chain
+
+`tamp/__init__.py` imports Compressor/Decompressor using this priority:
+
+1. Viper (MicroPython optimized) - only available on MicroPython
+2. Cython C extensions (`_c_compressor`/`_c_decompressor`) - primary on CPython
+3. Pure Python reference (`compressor.py`/`decompressor.py`) - fallback
+
+When modifying compression behavior, changes to the C source must be mirrored in
+the pure Python reference implementation to keep them in sync.
+
+### CI/CD
+
+GitHub Actions workflows (`.github/workflows/`):
+
+- `tests.yaml` - Lint (ruff, pre-commit) and test across Python 3.9/3.12/3.13
+  and multiple OS. Also runs `c-test` and `c-test-embedded`.
+- `build_wheels.yaml` - Cross-platform wheel builds via cibuildwheel
+- `javascript.yaml` - WebAssembly tests on Node 18/20
+- `mpy_native_module.yaml` - MicroPython native module builds for ARM
+  architectures
+- `esp_upload_component.yml` - ESP-IDF component registry upload
+
 ## Documentation Style
 
 - Avoid "fake" subsections (e.g., bold text like `**Error Promotion:**` acting

From 238ec2dc70352dd9ae7db01154ad65aff65f650e Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 19:54:17 -0500
Subject: [PATCH 100/109] note: always inline refill_bit_buffer.

---
 tamp/_c_src/tamp/decompressor.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 23ab4ed1..3b9a568a 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -316,6 +316,9 @@ tamp_res tamp_decompressor_init(TampDecompressor* decompressor, const TampConf*
  * @brief Refill bit buffer from input stream.
  *
  * Consumes bytes from input until bit_buffer has at least 25 bits or input is exhausted.
+ *
+ * NOTE: NOINLINE saves ~192 bytes on armv6m but causes ~10% decompression
+ * speed regression. Keep this inlined for performance.
  */
 static inline void refill_bit_buffer(TampDecompressor* d, const unsigned char** input, const unsigned char* input_end,
                                      size_t* input_consumed_size) {

From 1ab913e0ea2abe1e642c485f57626bf5eaada111 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 19:57:02 -0500
Subject: [PATCH 101/109] further flushing optimization.

---
 tamp/_c_src/tamp/compressor.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 6181c5bd..f080809e 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -222,8 +222,10 @@ static TAMP_NOINLINE void write_extended_huffman(TampCompressor* compressor, uin
 
 /**
  * @brief Get the last byte written to the window.
+ *
+ * NOINLINE: called from 3 sites; outlining saves ~44 bytes on armv6m.
  */
-static inline uint8_t get_last_window_byte(TampCompressor* compressor) {
+static TAMP_NOINLINE uint8_t get_last_window_byte(TampCompressor* compressor) {
     uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf.window) - 1);
     return compressor->window[prev_pos];
 }
@@ -688,17 +690,17 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
     // At this point, up to 16 bits may remain in the compressor->bit_buffer
     // The output buffer may have 0 bytes remaining.
 
-    // Flush the remainder of the output bit-buffer
-    while (compressor->bit_buffer_pos) {
+    // Flush whole bytes, then write trailing partial byte
+    res = partial_flush(compressor, &output, &output_size, output_written_size);
+    if (compressor->bit_buffer_pos) {
         if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL;
-        *output++ = compressor->bit_buffer >> 24;
-        compressor->bit_buffer <<= 8;
-        compressor->bit_buffer_pos -= MIN(compressor->bit_buffer_pos, 8);
-        output_size--;
+        *output = compressor->bit_buffer >> 24;
         (*output_written_size)++;
+        compressor->bit_buffer_pos = 0;
+        compressor->bit_buffer = 0;
     }
 
-    return TAMP_OK;
+    return res;
 }
 
 tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor* compressor, unsigned char* output, size_t output_size,

From 627cb38a2b2895832e00892d18dcea83601befa7 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 20:19:10 -0500
Subject: [PATCH 102/109] TAMP_OPTIMIZE_SIZE macro

---
 tamp/_c_src/tamp/common.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tamp/_c_src/tamp/common.h b/tamp/_c_src/tamp/common.h
index 706ce0fa..0e5bec7e 100644
--- a/tamp/_c_src/tamp/common.h
+++ b/tamp/_c_src/tamp/common.h
@@ -42,12 +42,19 @@ extern "C" {
 #if defined(_MSC_VER)
 #define TAMP_ALWAYS_INLINE __forceinline
 #define TAMP_NOINLINE __declspec(noinline)
-#elif defined(__GNUC__) || defined(__clang__)
+#define TAMP_OPTIMIZE_SIZE /* not supported */
+#elif defined(__GNUC__) && !defined(__clang__)
 #define TAMP_ALWAYS_INLINE inline __attribute__((always_inline))
 #define TAMP_NOINLINE __attribute__((noinline))
+#define TAMP_OPTIMIZE_SIZE __attribute__((optimize("Os")))
+#elif defined(__clang__)
+#define TAMP_ALWAYS_INLINE inline __attribute__((always_inline))
+#define TAMP_NOINLINE __attribute__((noinline))
+#define TAMP_OPTIMIZE_SIZE /* clang doesn't support per-function optimize */
 #else
 #define TAMP_ALWAYS_INLINE inline
 #define TAMP_NOINLINE
+#define TAMP_OPTIMIZE_SIZE
 #endif
 
 /* Include stream API (tamp_compress_stream, tamp_decompress_stream).

From 95c770a77921b6f0e2fb010b592f34c771bcf72c Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 20:29:45 -0500
Subject: [PATCH 103/109] extract out extended bits to its own private polling
 function.

---
 tamp/_c_src/tamp/compressor.c | 207 +++++++++++++++++-----------------
 1 file changed, 102 insertions(+), 105 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index f080809e..a9be3145 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -24,6 +24,10 @@
 
 #define FLUSH_CODE (0xAB)
 
+// Internal return value for poll_extended_handling: signals caller to
+// proceed with normal pattern matching rather than returning immediately.
+#define TAMP_POLL_CONTINUE ((tamp_res)127)
+
 // encodes [min_pattern_bytes, min_pattern_bytes + 13] pattern lengths
 static const uint8_t huffman_codes[] = {0x0, 0x3, 0x8, 0xb, 0x14, 0x24, 0x26, 0x2b, 0x4b, 0x54, 0x94, 0x95, 0xaa, 0x27};
 // These bit lengths pre-add the 1 bit for the 0-value is_literal flag.
@@ -237,14 +241,18 @@ static TAMP_NOINLINE uint8_t get_last_window_byte(TampCompressor* compressor) {
  * starting from current_pos. Returns the longest match found (which may be at
  * current_pos itself if O(1) extension works, or at a different position).
  *
+ * NOINLINE + Os: Called only during extended match continuation (rare path).
+ * Outlining saves ~100 bytes in poll on armv6m.
+ *
  * @param[in] compressor TampCompressor object
  * @param[in] current_pos Current match position in window (also search start)
  * @param[in] current_count Current match length
  * @param[out] new_pos Position of found match (only valid if new_count > current_count)
  * @param[out] new_count Length of found match
  */
-static inline void find_extended_match(TampCompressor* compressor, uint16_t current_pos, uint8_t current_count,
-                                       uint16_t* new_pos, uint8_t* new_count) {
+static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE void find_extended_match(TampCompressor* compressor, uint16_t current_pos,
+                                                                 uint8_t current_count, uint16_t* new_pos,
+                                                                 uint8_t* new_count) {
     // Preconditions (guaranteed by caller):
     // - input_size > 0
     // - current_pos + current_count < WINDOW_SIZE
@@ -353,6 +361,92 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compres
 
     return TAMP_OK;
 }
+
+/**
+ * @brief Handle all extended-specific logic in poll (match continuation + RLE).
+ *
+ * NOINLINE + Os: Extended paths are rarely executed. Outlining from poll saves
+ * significant code size on register-constrained Cortex-M0+ where the compiler
+ * otherwise spills heavily to stack (~48 bytes saved on armv6m).
+ *
+ * @return TAMP_OK if fully handled (caller should return TAMP_OK),
+ *         TAMP_POLL_CONTINUE if caller should proceed to normal pattern matching,
+ *         other tamp_res on error.
+ */
+static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE tamp_res poll_extended_handling(TampCompressor* compressor,
+                                                                        unsigned char** output, size_t* output_size,
+                                                                        size_t* output_written_size) {
+    // Handle extended match continuation
+    if (compressor->extended_match_count) {
+        const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
+
+        while (compressor->input_size > 0) {
+            const uint16_t current_pos = compressor->extended_match_position;
+            const uint8_t current_count = compressor->extended_match_count;
+
+            if (current_pos + current_count >= WINDOW_SIZE || current_count >= max_ext_match) {
+                return write_extended_match_token(compressor, output, output_size, output_written_size);
+            }
+
+            uint16_t new_pos;
+            uint8_t new_count;
+            find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count);
+
+            if (new_count > current_count) {
+                uint8_t extra_bytes = new_count - current_count;
+                compressor->extended_match_position = new_pos;
+                compressor->extended_match_count = new_count;
+                compressor->input_pos = input_add(extra_bytes);
+                compressor->input_size -= extra_bytes;
+                continue;
+            }
+
+            return write_extended_match_token(compressor, output, output_size, output_written_size);
+        }
+        return TAMP_OK;
+    }
+
+    // Handle RLE accumulation
+    uint8_t last_byte = get_last_window_byte(compressor);
+
+    uint8_t rle_available = 0;
+    while (rle_available < compressor->input_size && compressor->rle_count + rle_available < RLE_MAX_COUNT &&
+           compressor->input[input_add(rle_available)] == last_byte) {
+        rle_available++;
+    }
+
+    uint8_t total_rle = compressor->rle_count + rle_available;
+    bool rle_ended = (rle_available < compressor->input_size) || (total_rle >= RLE_MAX_COUNT);
+
+    if (!rle_ended && total_rle > 0) {
+        compressor->rle_count = total_rle;
+        compressor->input_pos = input_add(rle_available);
+        compressor->input_size -= rle_available;
+        return TAMP_OK;
+    }
+
+    if (total_rle >= 2) {
+        if (total_rle == rle_available && total_rle <= 6) {
+            uint16_t pattern_index;
+            uint8_t pattern_size;
+            find_best_match(compressor, &pattern_index, &pattern_size);
+
+            if (pattern_size > total_rle) {
+                compressor->rle_count = 0;
+                return TAMP_POLL_CONTINUE;  // Proceed to pattern matching
+            }
+        }
+
+        compressor->input_pos = input_add(rle_available);
+        compressor->input_size -= rle_available;
+        write_rle_token(compressor, total_rle);
+        compressor->rle_count = 0;
+        return TAMP_OK;
+    }
+
+    if (total_rle == 1) compressor->rle_count = 0;
+    return TAMP_POLL_CONTINUE;  // Proceed to pattern matching
+}
 #endif  // TAMP_EXTENDED_COMPRESS
 
 TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned char* output, size_t output_size,
@@ -362,9 +456,6 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
     const uint8_t conf_window = compressor->conf.window;
     const uint8_t conf_literal = compressor->conf.literal;
     const uint16_t window_mask = (1 << conf_window) - 1;
-#if TAMP_EXTENDED_COMPRESS
-    const bool conf_extended = compressor->conf.extended;
-#endif
     size_t output_written_size_proxy;
 
     if (!output_written_size) output_written_size = &output_written_size_proxy;
@@ -382,105 +473,11 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
     uint16_t match_index = 0;
 
 #if TAMP_EXTENDED_COMPRESS
-    if (TAMP_UNLIKELY(conf_extended)) {
-        // Handle extended match continuation
-        if (compressor->extended_match_count) {
-            // We're in extended match mode - try to extend the match
-            const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA;
-
-            while (compressor->input_size > 0) {
-                const uint16_t current_pos = compressor->extended_match_position;
-                const uint8_t current_count = compressor->extended_match_count;
-
-                // Check if extending would go beyond window buffer boundary (no wrap-around)
-                if (current_pos + current_count >= WINDOW_SIZE) {
-                    res = write_extended_match_token(compressor, &output, &output_size, output_written_size);
-                    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-                    return TAMP_OK;
-                }
-
-                // Check if we've reached max extended match size
-                if (current_count >= max_ext_match) {
-                    res = write_extended_match_token(compressor, &output, &output_size, output_written_size);
-                    if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-                    return TAMP_OK;
-                }
-
-                // Search for longer match (includes O(1) extension at same position)
-                uint16_t new_pos;
-                uint8_t new_count;
-                find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count);
-
-                if (new_count > current_count) {
-                    // Found longer match - update and continue
-                    uint8_t extra_bytes = new_count - current_count;
-                    compressor->extended_match_position = new_pos;
-                    compressor->extended_match_count = new_count;
-                    compressor->input_pos = input_add(extra_bytes);
-                    compressor->input_size -= extra_bytes;
-                    continue;
-                }
-
-                // No longer match found - emit current match
-                res = write_extended_match_token(compressor, &output, &output_size, output_written_size);
-                if (TAMP_UNLIKELY(res != TAMP_OK)) return res;
-                return TAMP_OK;
-            }
-            // Ran out of input while extending - return and wait for more
-            return TAMP_OK;
-        }
-
-        // Handle RLE accumulation with persistent state
-        uint8_t last_byte = get_last_window_byte(compressor);
-
-        // Count RLE bytes in current buffer WITHOUT consuming yet
-        uint8_t rle_available = 0;
-        while (rle_available < compressor->input_size && compressor->rle_count + rle_available < RLE_MAX_COUNT &&
-               compressor->input[input_add(rle_available)] == last_byte) {
-            rle_available++;
-        }
-
-        uint8_t total_rle = compressor->rle_count + rle_available;
-        bool rle_ended = (rle_available < compressor->input_size) || (total_rle >= RLE_MAX_COUNT);
-
-        // If RLE hasn't ended and we haven't hit max, consume and wait for more
-        if (!rle_ended && total_rle > 0) {
-            compressor->rle_count = total_rle;
-            compressor->input_pos = input_add(rle_available);
-            compressor->input_size -= rle_available;
-            return TAMP_OK;
-        }
-
-        // RLE run has ended - decide between RLE and pattern match
-        if (total_rle >= 2) {
-            bool use_pattern = false;
-
-            // For short RLE runs (all from this call), check if pattern match is better
-            if (total_rle == rle_available && total_rle <= 6) {
-                uint16_t pattern_index;
-                uint8_t pattern_size;
-                find_best_match(compressor, &pattern_index, &pattern_size);
-
-                if (pattern_size > total_rle) {
-                    use_pattern = true;
-                    // Don't consume RLE bytes - fall through to pattern matching
-                }
-            }
-
-            if (!use_pattern) {
-                // Use RLE - consume bytes and write token
-                compressor->input_pos = input_add(rle_available);
-                compressor->input_size -= rle_available;
-                write_rle_token(compressor, total_rle);
-                compressor->rle_count = 0;
-                return TAMP_OK;
-            }
-            compressor->rle_count = 0;
-        } else if (total_rle == 1) {
-            // Single byte - not worth RLE, will be handled as literal/pattern
-            // Byte is still in input buffer (not consumed), just reset RLE state
-            compressor->rle_count = 0;
-        }
+    if (TAMP_UNLIKELY(compressor->conf.extended)) {
+        // Handle extended match continuation + RLE (outlined for code size)
+        res = poll_extended_handling(compressor, &output, &output_size, output_written_size);
+        if (res != TAMP_POLL_CONTINUE) return res;
+        // TAMP_POLL_CONTINUE: proceed to pattern matching below
     }
 #endif  // TAMP_EXTENDED_COMPRESS
 
@@ -544,7 +541,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
     } else {
 #if TAMP_EXTENDED_COMPRESS
         // Extended: Start extended match continuation
-        if (conf_extended && match_size > compressor->min_pattern_size + 11) {
+        if (compressor->conf.extended && match_size > compressor->min_pattern_size + 11) {
             compressor->extended_match_count = match_size;
             compressor->extended_match_position = match_index;
             // Consume matched bytes from input

From b31ce735f186f1e35e5e1b1949e70ec3d0c4df4b Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Thu, 5 Feb 2026 20:54:13 -0500
Subject: [PATCH 104/109] more TAMP_OPTIMIZE_SIZE attributes

---
 tamp/_c_src/tamp/common.c       |  4 ++--
 tamp/_c_src/tamp/compressor.c   | 36 +++++++++++++++++++--------------
 tamp/_c_src/tamp/decompressor.c | 14 +++++++------
 3 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/tamp/_c_src/tamp/common.c b/tamp/_c_src/tamp/common.c
index b1ef1b9b..be0099a1 100644
--- a/tamp/_c_src/tamp/common.c
+++ b/tamp/_c_src/tamp/common.c
@@ -24,7 +24,7 @@ static inline uint32_t xorshift32(uint32_t *state) {
     return x;
 }
 
-void tamp_initialize_dictionary(unsigned char *buffer, size_t size) {
+TAMP_OPTIMIZE_SIZE void tamp_initialize_dictionary(unsigned char *buffer, size_t size) {
     uint32_t seed = 3758097560;  // This was experimentally discovered with tools/find_seed.py
     uint32_t randbuf = 0;
     for (size_t i = 0; i < size; i++) {
@@ -34,7 +34,7 @@ void tamp_initialize_dictionary(unsigned char *buffer, size_t size) {
     }
 }
 
-int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal) {
+TAMP_OPTIMIZE_SIZE int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal) {
     return 2 + (window > (10 + ((literal - 5) << 1)));
 }
 
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index a9be3145..0fbb7780 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -60,8 +60,8 @@ static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor* compressor, uint32
  * @param[in,out] output_written_size Bytes written (accumulated).
  * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small.
  */
-static TAMP_NOINLINE tamp_res partial_flush(TampCompressor* compressor, unsigned char** output, size_t* output_size,
-                                            size_t* output_written_size) {
+static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE tamp_res partial_flush(TampCompressor* compressor, unsigned char** output,
+                                                               size_t* output_size, size_t* output_written_size) {
     while (compressor->bit_buffer_pos >= 8 && *output_size) {
         *(*output)++ = compressor->bit_buffer >> 24;
         (*output_size)--;
@@ -165,7 +165,8 @@ static inline bool validate_no_match_overlap(uint16_t write_pos, uint16_t match_
 }
 #endif
 
-tamp_res tamp_compressor_init(TampCompressor* compressor, const TampConf* conf, unsigned char* window) {
+TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_init(TampCompressor* compressor, const TampConf* conf,
+                                                 unsigned char* window) {
     const TampConf conf_default = {
         .window = 10,
         .literal = 8,
@@ -217,7 +218,8 @@ tamp_res tamp_compressor_init(TampCompressor* compressor, const TampConf* conf,
  * @param[in] value The value to encode.
  * @param[in] trailing_bits Number of trailing bits (3 for extended match, 4 for RLE).
  */
-static TAMP_NOINLINE void write_extended_huffman(TampCompressor* compressor, uint8_t value, uint8_t trailing_bits) {
+static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE void write_extended_huffman(TampCompressor* compressor, uint8_t value,
+                                                                    uint8_t trailing_bits) {
     uint8_t code_index = value >> trailing_bits;
     // Write huffman code (without literal flag) + trailing bits in one call
     write_to_bit_buffer(compressor, (huffman_codes[code_index] << trailing_bits) | (value & ((1 << trailing_bits) - 1)),
@@ -229,7 +231,7 @@ static TAMP_NOINLINE void write_extended_huffman(TampCompressor* compressor, uin
  *
  * NOINLINE: called from 3 sites; outlining saves ~44 bytes on armv6m.
  */
-static TAMP_NOINLINE uint8_t get_last_window_byte(TampCompressor* compressor) {
+static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE uint8_t get_last_window_byte(TampCompressor* compressor) {
     uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf.window) - 1);
     return compressor->window[prev_pos];
 }
@@ -583,9 +585,11 @@ void tamp_compressor_sink(TampCompressor* compressor, const unsigned char* input
     }
 }
 
-tamp_res tamp_compressor_compress_cb(TampCompressor* compressor, unsigned char* output, size_t output_size,
-                                     size_t* output_written_size, const unsigned char* input, size_t input_size,
-                                     size_t* input_consumed_size, tamp_callback_t callback, void* user_data) {
+TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_compress_cb(TampCompressor* compressor, unsigned char* output,
+                                                        size_t output_size, size_t* output_written_size,
+                                                        const unsigned char* input, size_t input_size,
+                                                        size_t* input_consumed_size, tamp_callback_t callback,
+                                                        void* user_data) {
     tamp_res res;
     size_t input_consumed_size_proxy = 0, output_written_size_proxy = 0;
     size_t total_input_size = input_size;
@@ -700,10 +704,11 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
     return res;
 }
 
-tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor* compressor, unsigned char* output, size_t output_size,
-                                               size_t* output_written_size, const unsigned char* input,
-                                               size_t input_size, size_t* input_consumed_size, bool write_token,
-                                               tamp_callback_t callback, void* user_data) {
+TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor* compressor, unsigned char* output,
+                                                                  size_t output_size, size_t* output_written_size,
+                                                                  const unsigned char* input, size_t input_size,
+                                                                  size_t* input_consumed_size, bool write_token,
+                                                                  tamp_callback_t callback, void* user_data) {
     tamp_res res;
     size_t flush_size;
     size_t output_written_size_proxy;
@@ -726,9 +731,10 @@ tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor* compressor, unsig
 
 #if TAMP_STREAM
 
-tamp_res tamp_compress_stream(TampCompressor* compressor, tamp_read_t read_cb, void* read_handle, tamp_write_t write_cb,
-                              void* write_handle, size_t* input_consumed_size, size_t* output_written_size,
-                              tamp_callback_t callback, void* user_data) {
+TAMP_OPTIMIZE_SIZE tamp_res tamp_compress_stream(TampCompressor* compressor, tamp_read_t read_cb, void* read_handle,
+                                                 tamp_write_t write_cb, void* write_handle, size_t* input_consumed_size,
+                                                 size_t* output_written_size, tamp_callback_t callback,
+                                                 void* user_data) {
     size_t input_consumed_size_proxy, output_written_size_proxy;
     if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy;
     if (!output_written_size) output_written_size = &output_written_size_proxy;
diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 3b9a568a..016c3a05 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -273,9 +273,10 @@ tamp_res tamp_decompressor_read_header(TampConf* conf, const unsigned char* inpu
  *   * window
  *   * window_bits_max
  */
-static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor* decompressor, uint8_t conf_window,
-                                                     uint8_t conf_literal, uint8_t conf_use_custom_dictionary,
-                                                     uint8_t conf_extended) {
+static TAMP_OPTIMIZE_SIZE tamp_res tamp_decompressor_populate_from_conf(TampDecompressor* decompressor,
+                                                                        uint8_t conf_window, uint8_t conf_literal,
+                                                                        uint8_t conf_use_custom_dictionary,
+                                                                        uint8_t conf_extended) {
     if (conf_window < 8 || conf_window > 15) return TAMP_INVALID_CONF;
     if (conf_literal < 5 || conf_literal > 8) return TAMP_INVALID_CONF;
     if (conf_window > decompressor->window_bits_max) return TAMP_INVALID_CONF;
@@ -511,9 +512,10 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
 
 #if TAMP_STREAM
 
-tamp_res tamp_decompress_stream(TampDecompressor* decompressor, tamp_read_t read_cb, void* read_handle,
-                                tamp_write_t write_cb, void* write_handle, size_t* input_consumed_size,
-                                size_t* output_written_size, tamp_callback_t callback, void* user_data) {
+TAMP_OPTIMIZE_SIZE tamp_res tamp_decompress_stream(TampDecompressor* decompressor, tamp_read_t read_cb,
+                                                   void* read_handle, tamp_write_t write_cb, void* write_handle,
+                                                   size_t* input_consumed_size, size_t* output_written_size,
+                                                   tamp_callback_t callback, void* user_data) {
     size_t input_consumed_size_proxy, output_written_size_proxy;
     if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy;
     if (!output_written_size) output_written_size = &output_written_size_proxy;

From cde27cf52db3e343395f958cde395d35d93a3a06 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Fri, 6 Feb 2026 08:58:11 -0500
Subject: [PATCH 105/109] some more gcc pragmas to shrink implementation

---
 tamp/_c_src/tamp/compressor.c   | 21 +++++++++++++++++++++
 tamp/_c_src/tamp/decompressor.c |  7 +++++++
 2 files changed, 28 insertions(+)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 0fbb7780..5d4bc12e 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -327,6 +327,10 @@ static TAMP_NOINLINE void write_rle_token(TampCompressor* compressor, uint8_t co
  * @param[in,out] output_written_size Bytes written (accumulated).
  * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small.
  */
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC optimize("-fno-reorder-blocks")
+#endif
 static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compressor, unsigned char** output,
                                                          size_t* output_size, size_t* output_written_size) {
     // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer)
@@ -363,6 +367,9 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compres
 
     return TAMP_OK;
 }
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
 
 /**
  * @brief Handle all extended-specific logic in poll (match continuation + RLE).
@@ -451,6 +458,10 @@ static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE tamp_res poll_extended_handling(TampComp
 }
 #endif  // TAMP_EXTENDED_COMPRESS
 
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC optimize("-fno-schedule-insns2")
+#endif
 TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned char* output, size_t output_size,
                                             size_t* output_written_size) {
     tamp_res res;
@@ -568,6 +579,9 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
 
     return TAMP_OK;
 }
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
 
 void tamp_compressor_sink(TampCompressor* compressor, const unsigned char* input, size_t input_size,
                           size_t* consumed_size) {
@@ -628,6 +642,10 @@ TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_compress_cb(TampCompressor* compress
     return TAMP_OK;
 }
 
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC optimize("-fno-tree-pre")
+#endif
 tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output, size_t output_size,
                                size_t* output_written_size, bool write_token) {
     tamp_res res;
@@ -703,6 +721,9 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
 
     return res;
 }
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
 
 TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor* compressor, unsigned char* output,
                                                                   size_t output_size, size_t* output_written_size,
diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 016c3a05..4d312790 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -331,6 +331,10 @@ static inline void refill_bit_buffer(TampDecompressor* d, const unsigned char**
     }
 }
 
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC optimize("-fno-tree-pre")
+#endif
 tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigned char* output, size_t output_size,
                                          size_t* output_written_size, const unsigned char* input, size_t input_size,
                                          size_t* input_consumed_size, tamp_callback_t callback, void* user_data) {
@@ -509,6 +513,9 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
     }
     return TAMP_INPUT_EXHAUSTED;
 }
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
 
 #if TAMP_STREAM
 

From 1d82747ad90abaf4e8453758e7a471c04487b073 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Fri, 6 Feb 2026 15:09:55 -0500
Subject: [PATCH 106/109] update readme binary-size table

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 2a722f01..2f338fc7 100644
--- a/README.md
+++ b/README.md
@@ -352,10 +352,10 @@ Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc`
 |                                  | Compressor | Decompressor | Compressor + Decompressor |
 | -------------------------------- | ---------- | ------------ | ------------------------- |
 | Tamp (MicroPython Native)        | 4708       | 4339         | 8124                      |
-| Tamp (C, no extended, no stream) | 1620       | 1584         | 3000                      |
-| Tamp (C, no extended)            | 2024       | 2036         | 3856                      |
-| Tamp (C, extended, no stream)    | 3072       | 2436         | 5304                      |
-| Tamp (C, extended)               | 3476       | 2888         | 6160                      |
+| Tamp (C, no extended, no stream) | 1466       | 1312         | 2592                      |
+| Tamp (C, no extended)            | 1748       | 1550         | 3112                      |
+| Tamp (C, extended, no stream)    | 2558       | 2072         | 4444                      |
+| Tamp (C, extended)               | 2840       | 2310         | 4964                      |
 | Heatshrink (C)                   | 2956       | 3876         | 6832                      |
 | uzlib (C)                        | 2355       | 3963         | 6318                      |
 

From c52f165f65ea9b366998e8f507c9b3cc5840b7d3 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Fri, 6 Feb 2026 15:36:54 -0500
Subject: [PATCH 107/109] avoid memset

---
 tamp/_c_src/tamp/compressor.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 5d4bc12e..432d97e2 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -185,8 +185,10 @@ TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_init(TampCompressor* compressor, con
     if (conf->extended) return TAMP_INVALID_CONF;  // Extended requested but not compiled in
 #endif
 
-    for (uint8_t i = 0; i < sizeof(TampCompressor); i++)  // Zero-out the struct
-        ((unsigned char*)compressor)[i] = 0;
+    {  // volatile prevents any GCC version/target from converting this loop to a memset call
+        volatile unsigned char* p = (volatile unsigned char*)compressor;
+        for (uint8_t i = 0; i < sizeof(TampCompressor); i++) p[i] = 0;
+    }
 
     // Build header directly from conf (8 bits total)
     // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1]

From bf1c3772fb4c97352f36021f2c496dfc81e9ca70 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sun, 8 Feb 2026 21:08:58 -0500
Subject: [PATCH 108/109] TAMP_USE_MEMSET macro

---
 Makefile                        |  2 +-
 tamp/_c_src/tamp/common.h       | 19 +++++++++++++++++++
 tamp/_c_src/tamp/compressor.c   |  5 +----
 tamp/_c_src/tamp/decompressor.c |  8 +++-----
 4 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index 9fd03d22..eb8d8255 100644
--- a/Makefile
+++ b/Makefile
@@ -75,7 +75,7 @@ MOD = tamp
 # Override -Os with -O2 for better performance (last flag wins)
 CFLAGS_EXTRA = -O2
 
-CFLAGS += -Itamp/_c_src -DTAMP_COMPRESSOR=$(TAMP_COMPRESSOR) -DTAMP_DECOMPRESSOR=$(TAMP_DECOMPRESSOR) -DTAMP_STREAM=0
+CFLAGS += -Itamp/_c_src -DTAMP_COMPRESSOR=$(TAMP_COMPRESSOR) -DTAMP_DECOMPRESSOR=$(TAMP_DECOMPRESSOR) -DTAMP_STREAM=0 -DTAMP_USE_MEMSET=0
 # Compiler-specific flags based on target architecture
 ifeq ($(filter $(ARCH),x86 x64),)
 # Cross-compiling for embedded (ARM, xtensa) - use GCC flags
diff --git a/tamp/_c_src/tamp/common.h b/tamp/_c_src/tamp/common.h
index 0e5bec7e..984411b4 100644
--- a/tamp/_c_src/tamp/common.h
+++ b/tamp/_c_src/tamp/common.h
@@ -57,6 +57,25 @@ extern "C" {
 #define TAMP_OPTIMIZE_SIZE
 #endif
 
+/* TAMP_USE_MEMSET: Use libc memset (default: 1). Set to 0 for environments without libc (e.g. MicroPython
+ * native modules). The fallback is a volatile byte-store loop, which prevents GCC from emitting a memset call
+ * at the cost of inhibiting store coalescing. Like memset, it evaluates each argument exactly once. */
+#ifndef TAMP_USE_MEMSET
+#define TAMP_USE_MEMSET 1
+#endif
+
+#if TAMP_USE_MEMSET
+#include <string.h>
+#define TAMP_MEMSET(dst, val, n) memset((dst), (val), (n))
+#else
+#define TAMP_MEMSET(dst, val, n)                                             \
+    do {                                                                     \
+        volatile unsigned char *_tamp_p = (volatile unsigned char *)(dst);   \
+        const unsigned char _tamp_v = (unsigned char)(val);                  \
+        for (size_t _tamp_n = (n); _tamp_n; _tamp_n--) *_tamp_p++ = _tamp_v; \
+    } while (0)
+#endif
+
 /* Include stream API (tamp_compress_stream, tamp_decompress_stream).
  * Enabled by default. Disable with -DTAMP_STREAM=0 to save ~2.8KB.
  */
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 432d97e2..78527497 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -185,10 +185,7 @@ TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_init(TampCompressor* compressor, con
     if (conf->extended) return TAMP_INVALID_CONF;  // Extended requested but not compiled in
 #endif
 
-    {  // volatile prevents any GCC version/target from converting this loop to a memset call
-        volatile unsigned char* p = (volatile unsigned char*)compressor;
-        for (uint8_t i = 0; i < sizeof(TampCompressor); i++) p[i] = 0;
-    }
+    TAMP_MEMSET(compressor, 0, sizeof(TampCompressor));
 
     // Build header directly from conf (8 bits total)
     // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1]
diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 4d312790..70db78c2 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -131,9 +131,8 @@ static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const un
     }
 
     /* Write repeated bytes to output */
-    for (uint8_t i = 0; i < to_write; i++) {
-        *(*output)++ = symbol;
-    }
+    TAMP_MEMSET(*output, symbol, to_write);
+    *output += to_write;
     *output_written_size += to_write;
 
     /* Update window only on first chunk (skip==0).
@@ -301,8 +300,7 @@ tamp_res tamp_decompressor_init(TampDecompressor* decompressor, const TampConf*
     // Validate window_bits parameter
     if (window_bits < 8 || window_bits > 15) return TAMP_INVALID_CONF;
 
-    for (uint8_t i = 0; i < sizeof(TampDecompressor); i++)  // Zero-out the struct
-        ((unsigned char*)decompressor)[i] = 0;
+    TAMP_MEMSET(decompressor, 0, sizeof(TampDecompressor));
     decompressor->window = window;
     decompressor->window_bits_max = window_bits;
     if (conf) {

From 9a66abf319acea473f22cdbba3391ab3893db543 Mon Sep 17 00:00:00 2001
From: Brian Pugh <bnp117@gmail.com>
Date: Sun, 8 Feb 2026 21:51:03 -0500
Subject: [PATCH 109/109] xtensa-specific nonsense

---
 tamp/_c_src/tamp/common.h       | 12 ++++++++++++
 tamp/_c_src/tamp/compressor.c   | 12 ++++++------
 tamp/_c_src/tamp/decompressor.c |  4 ++--
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/tamp/_c_src/tamp/common.h b/tamp/_c_src/tamp/common.h
index 984411b4..11006f43 100644
--- a/tamp/_c_src/tamp/common.h
+++ b/tamp/_c_src/tamp/common.h
@@ -39,6 +39,14 @@ extern "C" {
 #define TAMP_UNLIKELY(c) (c)
 #endif
 
+/* Per-function optimize attributes and #pragma GCC push/pop_options require
+ * GCC on a target that supports them. Xtensa GCC does not. */
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__XTENSA__)
+#define TAMP_HAS_GCC_OPTIMIZE 1
+#else
+#define TAMP_HAS_GCC_OPTIMIZE 0
+#endif
+
 #if defined(_MSC_VER)
 #define TAMP_ALWAYS_INLINE __forceinline
 #define TAMP_NOINLINE __declspec(noinline)
@@ -46,7 +54,11 @@ extern "C" {
 #elif defined(__GNUC__) && !defined(__clang__)
 #define TAMP_ALWAYS_INLINE inline __attribute__((always_inline))
 #define TAMP_NOINLINE __attribute__((noinline))
+#if TAMP_HAS_GCC_OPTIMIZE
 #define TAMP_OPTIMIZE_SIZE __attribute__((optimize("Os")))
+#else
+#define TAMP_OPTIMIZE_SIZE
+#endif
 #elif defined(__clang__)
 #define TAMP_ALWAYS_INLINE inline __attribute__((always_inline))
 #define TAMP_NOINLINE __attribute__((noinline))
diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c
index 78527497..97760e2d 100644
--- a/tamp/_c_src/tamp/compressor.c
+++ b/tamp/_c_src/tamp/compressor.c
@@ -326,7 +326,7 @@ static TAMP_NOINLINE void write_rle_token(TampCompressor* compressor, uint8_t co
  * @param[in,out] output_written_size Bytes written (accumulated).
  * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small.
  */
-#if defined(__GNUC__) && !defined(__clang__)
+#if TAMP_HAS_GCC_OPTIMIZE
 #pragma GCC push_options
 #pragma GCC optimize("-fno-reorder-blocks")
 #endif
@@ -366,7 +366,7 @@ static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compres
 
     return TAMP_OK;
 }
-#if defined(__GNUC__) && !defined(__clang__)
+#if TAMP_HAS_GCC_OPTIMIZE
 #pragma GCC pop_options
 #endif
 
@@ -457,7 +457,7 @@ static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE tamp_res poll_extended_handling(TampComp
 }
 #endif  // TAMP_EXTENDED_COMPRESS
 
-#if defined(__GNUC__) && !defined(__clang__)
+#if TAMP_HAS_GCC_OPTIMIZE
 #pragma GCC push_options
 #pragma GCC optimize("-fno-schedule-insns2")
 #endif
@@ -578,7 +578,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned
 
     return TAMP_OK;
 }
-#if defined(__GNUC__) && !defined(__clang__)
+#if TAMP_HAS_GCC_OPTIMIZE
 #pragma GCC pop_options
 #endif
 
@@ -641,7 +641,7 @@ TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_compress_cb(TampCompressor* compress
     return TAMP_OK;
 }
 
-#if defined(__GNUC__) && !defined(__clang__)
+#if TAMP_HAS_GCC_OPTIMIZE
 #pragma GCC push_options
 #pragma GCC optimize("-fno-tree-pre")
 #endif
@@ -720,7 +720,7 @@ tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output
 
     return res;
 }
-#if defined(__GNUC__) && !defined(__clang__)
+#if TAMP_HAS_GCC_OPTIMIZE
 #pragma GCC pop_options
 #endif
 
diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c
index 70db78c2..92af94c6 100644
--- a/tamp/_c_src/tamp/decompressor.c
+++ b/tamp/_c_src/tamp/decompressor.c
@@ -329,7 +329,7 @@ static inline void refill_bit_buffer(TampDecompressor* d, const unsigned char**
     }
 }
 
-#if defined(__GNUC__) && !defined(__clang__)
+#if TAMP_HAS_GCC_OPTIMIZE
 #pragma GCC push_options
 #pragma GCC optimize("-fno-tree-pre")
 #endif
@@ -511,7 +511,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigne
     }
     return TAMP_INPUT_EXHAUSTED;
 }
-#if defined(__GNUC__) && !defined(__clang__)
+#if TAMP_HAS_GCC_OPTIMIZE
 #pragma GCC pop_options
 #endif