diff --git a/.gitattributes b/.gitattributes index 5f6c222..a1dce8f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ datasets/v1-compressed/** filter=lfs diff=lfs merge=lfs -text +datasets/extended-compressed/** filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml index 2f9bb48..17e29e8 100644 --- a/.github/workflows/build_wheels.yaml +++ b/.github/workflows/build_wheels.yaml @@ -117,7 +117,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -164,7 +164,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -212,7 +212,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -254,7 +254,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -302,7 +302,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -344,7 +344,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: 
pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -385,7 +385,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/javascript.yaml b/.github/workflows/javascript.yaml index 00fdd5b..4dc0fa7 100644 --- a/.github/workflows/javascript.yaml +++ b/.github/workflows/javascript.yaml @@ -99,9 +99,9 @@ jobs: cd build HASH=$(sha256sum enwik8-js.tamp | cut -d' ' -f1) echo "Compression hash: $HASH" - if [ "$HASH" != "02e05af059a0040d641988075cf1dfc479a084f9a34b5c8a348354211c5fa038" ]; then + if [ "$HASH" != "dd5e431b0cbaa6ee001b10493c2b08e6235f42e3f4ce00958e88b1b97581872e" ]; then echo "❌ Hash mismatch!" - echo "Expected: 02e05af059a0040d641988075cf1dfc479a084f9a34b5c8a348354211c5fa038" + echo "Expected: dd5e431b0cbaa6ee001b10493c2b08e6235f42e3f4ce00958e88b1b97581872e" echo "Got: $HASH" exit 1 fi diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 47a77b9..1c44aa9 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -94,8 +94,48 @@ jobs: source .venv/bin/activate SKIP=wasm-eslint,wasm-npm-test,wasm-file-validation,typescript-check,package-json-lint pre-commit run --show-diff-on-failure --color=always --all-files + cache-lfs: + name: 'Cache LFS files' + runs-on: ubuntu-latest + outputs: + cache-key: ${{ steps.lfs-key.outputs.key }} + steps: + - name: Check out repository + uses: actions/checkout@v4 + with: + submodules: recursive + lfs: false + + - name: Compute LFS cache key + id: lfs-key + run: | + # Hash pointer files before they get replaced by git lfs pull + # Use find for reliable recursive globbing, sort for deterministic order + hash=$(find datasets/v1-compressed datasets/extended-compressed -name '*.tamp' -type f | sort | xargs cat | sha256sum | cut -d' ' 
-f1) + echo "key=lfs-${hash}" >> $GITHUB_OUTPUT + echo "Cache key: lfs-${hash}" + + - name: Restore LFS cache + uses: actions/cache/restore@v4 + id: lfs-cache + with: + path: .git/lfs + key: ${{ steps.lfs-key.outputs.key }} + + - name: Pull LFS files + if: steps.lfs-cache.outputs.cache-hit != 'true' + run: git lfs pull + + - name: Save LFS cache + if: steps.lfs-cache.outputs.cache-hit != 'true' + uses: actions/cache/save@v4 + with: + path: .git/lfs + key: ${{ steps.lfs-key.outputs.key }} + test: name: 'Test Python ${{ matrix.python-version }}' + needs: cache-lfs timeout-minutes: 15 runs-on: ubuntu-latest strategy: @@ -110,7 +150,17 @@ jobs: uses: actions/checkout@v4 with: submodules: recursive - lfs: true + lfs: false + + - name: Restore LFS cache + uses: actions/cache/restore@v4 + with: + path: .git/lfs + key: ${{ needs.cache-lfs.outputs.cache-key }} + fail-on-cache-miss: true + + - name: Pull LFS files + run: git lfs pull - name: Set up python 3.13 (for Poetry) id: setup-python-system @@ -217,7 +267,7 @@ jobs: implementation: [desktop, embedded] env: POETRY_HOME: '~/poetry' - EXPECTED_COMPRESSED_HASH: '02e05af059a0040d641988075cf1dfc479a084f9a34b5c8a348354211c5fa038' + EXPECTED_COMPRESSED_HASH: 'dd5e431b0cbaa6ee001b10493c2b08e6235f42e3f4ce00958e88b1b97581872e' steps: - name: Check out repository diff --git a/.gitignore b/.gitignore index 4344bd2..e69f804 100644 --- a/.gitignore +++ b/.gitignore @@ -249,10 +249,12 @@ Temporary Items # Compression benchmark datasets datasets/* !datasets/v1-compressed/ +!datasets/extended-compressed/ enwik8* *.pkl *.tamp !datasets/v1-compressed/** +!datasets/extended-compressed/** # Cython-generated files tamp/_c_compressor.c @@ -435,6 +437,9 @@ wasm/build/ *.swo *~ +# clangd (C/C++ language server) +.clangd + # Emacs *~ \#*\# diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c1cbfc..686769a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,10 +57,13 @@ repos: args: ['-style=file', '-i'] 
exclude: ^espidf/tamp/private/tamp_search\.hpp$ - - repo: https://github.com/pocc/pre-commit-hooks - rev: v1.3.5 + - repo: local hooks: - id: cppcheck + name: cppcheck + entry: cppcheck + language: system + files: \.(c|h|cpp|hpp)$ exclude: ^(espidf|mpy_bindings|ctests|tools)/ args: [ '-Itamp/_c_src', diff --git a/CLAUDE.md b/CLAUDE.md index 1dd02c3..5ba7e8c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -26,9 +26,13 @@ different platforms: **Shared C Source:** All implementations use the same C source code in `tamp/_c_src/tamp/`: -- `common.h/c` - Shared utilities and data structures -- `compressor.h/c` - Compression implementation +- `common.h/c` - Shared utilities, data structures, stream I/O callbacks, and + dictionary initialization +- `compressor.h/c` - Compression implementation (sink/poll low-level API and + higher-level compress/flush API) - `decompressor.h/c` - Decompression implementation +- `compressor_find_match_desktop.c` - Desktop-optimized match finding (included + by `compressor.c` on non-embedded targets) ## Development Commands @@ -162,38 +166,81 @@ make website-clean # Clean website build artifacts **WebAssembly Build Process:** 1. `wasm/Makefile` compiles C source to WebAssembly using Emscripten -2. `wasm/scripts/build.js` generates multiple JS/TS distribution formats +2. `tsup` (via `npm run build:js`) bundles into multiple JS/TS distribution + formats (CJS, ESM, `.d.ts`) 3. 
Exports specific C functions and runtime methods for JS interop -**Configuration Flags:** +**Configuration Flags (compile-time `-D` defines):** -- `TAMP_LAZY_MATCHING=1` - Enable lazy matching optimization (default) -- `TAMP_ESP32=1` - ESP32-specific optimizations +- `TAMP_LAZY_MATCHING=1` - Enable lazy matching optimization (default in + build.py) +- `TAMP_ESP32=1` - ESP32-specific optimizations (avoids bitfields for speed) - `TAMP_COMPRESSOR`/`TAMP_DECOMPRESSOR` - Include/exclude components +- `TAMP_EXTENDED=1` - Master switch for extended format: RLE and extended match + (default: 1). `TAMP_EXTENDED_COMPRESS` and `TAMP_EXTENDED_DECOMPRESS` can + individually override. +- `TAMP_STREAM=1` - Include stream API (default: 1). Disable with + `-DTAMP_STREAM=0` to save ~2.8KB. +- `TAMP_STREAM_WORK_BUFFER_SIZE=32` - Stack-allocated work buffer for stream API + (default: 32 bytes, 256+ recommended for performance) +- `TAMP_STREAM_MEMORY` / `TAMP_STREAM_STDIO` / `TAMP_STREAM_LITTLEFS` / + `TAMP_STREAM_FATFS` - Enable built-in I/O handlers for specific backends +- `TAMP_USE_EMBEDDED_MATCH=1` - Force embedded `find_best_match` implementation + on desktop (for testing) + +**Build Environment Variables (Python):** + +- `TAMP_SANITIZE=1` - Enable AddressSanitizer + UBSan +- `TAMP_PROFILE=1` - Enable profiling (line trace, debug info) +- `TAMP_USE_EMBEDDED_MATCH=1` - Force embedded match finding +- `TAMP_BUILD_C_EXTENSIONS=0` - Skip building C extensions entirely +- `CIBUILDWHEEL=1` - CI wheel building mode (disables allowed_to_fail) ### Testing Strategy **Multi-layered Testing:** -- **Python tests** (`tests/`) - Core algorithm testing using pytest +- **Python tests** (`tests/`) - Core algorithm testing using pytest. Includes + bit reader/writer, compressor, decompressor, round-trip, CLI, dataset + regression, and file interface tests. 
- **WebAssembly tests** (`wasm/test/`) - JS/TS API testing with Node.js test - runner + runner (`node --test`) - **C tests** (`ctests/`) - Low-level C API testing using Unity framework + (submodule at `ctests/Unity/`). Includes stream API tests and filesystem + integration tests with LittleFS and FatFS RAM backends. - **Integration tests** - Cross-platform compatibility and performance benchmarks **Test Data Sources:** -- Enwik8 dataset (100MB) for performance benchmarking -- Silesia corpus for compression ratio evaluation +- Enwik8 dataset (100MB) for performance benchmarking (`make download-enwik8`) +- Silesia corpus for compression ratio evaluation (`make download-silesia`) - Custom test cases for edge conditions +### Compressor Architecture + +The C compressor uses a two-phase low-level API: + +1. `tamp_compressor_sink()` - Copies input bytes into a 16-byte internal ring + buffer (cheap/fast) +2. `tamp_compressor_poll()` - Runs one compression iteration on the internal + buffer (computationally intensive) + +Higher-level convenience functions (`tamp_compressor_compress`, +`tamp_compressor_compress_and_flush`) wrap these. Callback variants (`_cb` +suffix) accept a `tamp_callback_t` progress callback. + +The stream API (`tamp_compress_stream`, `tamp_decompress_stream`) provides a +file-oriented interface using read/write callbacks, supporting multiple I/O +backends (memory, stdio, LittleFS, FatFS). + ### Memory Management Patterns **Key Principle:** Fixed memory usage during compression/decompression - Window size determines memory usage: `(1 << windowBits)` bytes - No dynamic allocation during compression/decompression operations +- Stream API uses a stack-allocated work buffer (`TAMP_STREAM_WORK_BUFFER_SIZE`) - Streaming interfaces require explicit resource management (`destroy()` calls in JS/TS) @@ -202,7 +249,9 @@ make website-clean # Clean website build artifacts ### Making Changes to Core Algorithm 1. **Modify C source** in `tamp/_c_src/tamp/` -2. 
**Rebuild all implementations:** +2. **Update pure Python reference** in `tamp/compressor.py` / + `tamp/decompressor.py` to match +3. **Rebuild all implementations:** ```bash # Python @@ -212,11 +261,12 @@ make website-clean # Clean website build artifacts cd wasm && npm run build ``` -3. **Run comprehensive tests:** +4. **Run comprehensive tests:** ```bash - make test # Python + MicroPython + poetry run pytest # Python tests + make c-test # C unit tests with sanitizers + make c-test-embedded # C tests with embedded match finding cd wasm && npm test # WebAssembly - make c-test # C unit tests ``` ### Adding New Features @@ -232,11 +282,13 @@ make website-clean # Clean website build artifacts - **Use provided benchmarking tools:** ```bash make on-device-compression-benchmark # MicroPython performance - npm run test:enwik8 # WebAssembly performance - python tools/performance-benchmark.sh # Python performance + cd wasm && npm run test:enwik8 # WebAssembly performance + bash tools/performance-benchmark.sh # Python performance + make c-benchmark-stream # C stream API benchmark + make binary-size # ARM binary size table ``` -- **Profile with:** `tools/profiler.py` for Python, browser dev tools for - WebAssembly +- **Profile with:** `tools/profiler.py` for Python (requires `TAMP_PROFILE=1`), + browser dev tools for WebAssembly ### Release Process @@ -247,6 +299,29 @@ make website-clean # Clean website build artifacts - WebAssembly npm package 3. **CI/CD handles** cross-platform builds and testing +### Python Import Fallback Chain + +`tamp/__init__.py` imports Compressor/Decompressor using this priority: + +1. Viper (MicroPython optimized) - only available on MicroPython +2. Cython C extensions (`_c_compressor`/`_c_decompressor`) - primary on CPython +3. Pure Python reference (`compressor.py`/`decompressor.py`) - fallback + +When modifying compression behavior, changes to the C source must be mirrored in +the pure Python reference implementation to keep them in sync. 
+ +### CI/CD + +GitHub Actions workflows (`.github/workflows/`): + +- `tests.yaml` - Lint (ruff, pre-commit) and test across Python 3.9/3.12/3.13 + and multiple OS. Also runs `c-test` and `c-test-embedded`. +- `build_wheels.yaml` - Cross-platform wheel builds via cibuildwheel +- `javascript.yaml` - WebAssembly tests on Node 18/20 +- `mpy_native_module.yaml` - MicroPython native module builds for ARM + architectures +- `esp_upload_component.yml` - ESP-IDF component registry upload + ## Documentation Style - Avoid "fake" subsections (e.g., bold text like `**Error Promotion:**` acting diff --git a/Makefile b/Makefile index 9e21e42..eb8d825 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,8 @@ help: @echo " make tamp-c-library Build static C library" @echo " make website-build Build website for deployment" +.PHONY: clean test collect-data venv download + ########################### # MicroPython Native Module @@ -73,7 +75,7 @@ MOD = tamp # Override -Os with -O2 for better performance (last flag wins) CFLAGS_EXTRA = -O2 -CFLAGS += -Itamp/_c_src -DTAMP_COMPRESSOR=$(TAMP_COMPRESSOR) -DTAMP_DECOMPRESSOR=$(TAMP_DECOMPRESSOR) +CFLAGS += -Itamp/_c_src -DTAMP_COMPRESSOR=$(TAMP_COMPRESSOR) -DTAMP_DECOMPRESSOR=$(TAMP_DECOMPRESSOR) -DTAMP_STREAM=0 -DTAMP_USE_MEMSET=0 # Compiler-specific flags based on target architecture ifeq ($(filter $(ARCH),x86 x64),) # Cross-compiling for embedded (ARM, xtensa) - use GCC flags @@ -180,7 +182,14 @@ build/enwik8-100kb: download-enwik8 @head -c 100000 datasets/enwik8 > build/enwik8-100kb build/enwik8-100kb.tamp: build/enwik8-100kb - @poetry run tamp compress build/enwik8-100kb -o build/enwik8-100kb.tamp + @# Use Python implementation for extended format compression + @poetry run tamp compress --implementation=python build/enwik8-100kb -o build/enwik8-100kb.tamp + +download-micropython: + mkdir -p datasets + cd datasets && curl -O https://micropython.org/resources/firmware/RPI_PICO-20250415-v1.25.0.uf2 + +download: download-enwik8 
download-silesia download-micropython ################## @@ -218,7 +227,7 @@ define mpremote-sync fi endef -on-device-compression-benchmark: mpy build/enwik8-100kb build/enwik8-100kb.tamp +on-device-compression-benchmark: mpy build/enwik8-100kb $(MPREMOTE) rm :enwik8-100kb.tamp || true @# Remove any viper implementation that may exist from previous belay syncs $(MPREMOTE) rm :tamp/__init__.py :tamp/compressor_viper.py :tamp/decompressor_viper.py :tamp/compressor.py :tamp/decompressor.py :tamp/__main__.py :tamp/py.typed 2>/dev/null || true @@ -229,7 +238,8 @@ on-device-compression-benchmark: mpy build/enwik8-100kb build/enwik8-100kb.tamp $(MPREMOTE) soft-reset $(MPREMOTE) run tools/on-device-compression-benchmark.py $(MPREMOTE) cp :enwik8-100kb.tamp build/on-device-enwik8-100kb.tamp - cmp build/enwik8-100kb.tamp build/on-device-enwik8-100kb.tamp + poetry run tamp decompress build/on-device-enwik8-100kb.tamp -o build/on-device-enwik8-100kb-decompressed + cmp build/enwik8-100kb build/on-device-enwik8-100kb-decompressed @echo "Success!" 
on-device-decompression-benchmark: mpy build/enwik8-100kb.tamp @@ -283,7 +293,7 @@ mpy-viper-size: size_comp=$$(wc -c < /tmp/_tamp_comp.mpy | tr -d ' '); \ size_decomp=$$(wc -c < /tmp/_tamp_decomp.mpy | tr -d ' '); \ rm -f /tmp/_tamp_init.mpy /tmp/_tamp_comp.mpy /tmp/_tamp_decomp.mpy; \ - printf 'Tamp (MicroPython Viper) %d %d %d\n' \ + printf '%-34s %10d %12d %25d\n' "Tamp (MicroPython Viper)" \ $$((size_init + size_comp)) $$((size_init + size_decomp)) $$((size_init + size_comp + size_decomp)) mpy-native-size: @@ -299,7 +309,7 @@ endif rm -rf tamp.mpy build/tamp build/mpy_bindings build/tamp.native.mpy && \ $(MAKE) -s _mpy-build MPY_DIR=$(MPY_DIR) ARCH=armv6m TAMP_COMPRESSOR=1 TAMP_DECOMPRESSOR=1 >/dev/null 2>&1 && \ size_both=$$(wc -c < tamp.mpy | tr -d ' ') && \ - printf 'Tamp (MicroPython Native) %s %s %s\n' $$size_comp $$size_decomp $$size_both + printf '%-34s %10s %12s %25s\n' "Tamp (MicroPython Native)" $$size_comp $$size_decomp $$size_both mpy-compression-benchmark: @time belay run micropython -X heapsize=300M tools/micropython-compression-benchmark.py @@ -482,7 +492,7 @@ tamp-c-library: build/tamp.a # Binary Sizes ############### # Generate binary size information for README table (armv6m with -O3). 
-.PHONY: binary-size c-size +.PHONY: binary-size c-size c-size-no-extended c-size-extended ARM_CC := arm-none-eabi-gcc ARM_AR := arm-none-eabi-ar @@ -493,70 +503,85 @@ C_SRC_COMMON = tamp/_c_src/tamp/common.c C_SRC_COMP = tamp/_c_src/tamp/compressor.c C_SRC_DECOMP = tamp/_c_src/tamp/decompressor.c -# Build compressor-only library (without stream API) -build/arm/tamp_comp.a: $(C_SRC_COMMON) $(C_SRC_COMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common_c.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o - $(ARM_AR) rcs $@ build/arm/common_c.o build/arm/compressor.o - -# Build decompressor-only library (without stream API) -build/arm/tamp_decomp.a: $(C_SRC_COMMON) $(C_SRC_DECOMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common_d.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o - $(ARM_AR) rcs $@ build/arm/common_d.o build/arm/decompressor.o - -# Build full library (without stream API) -build/arm/tamp_full.a: $(C_SRC_COMMON) $(C_SRC_COMP) $(C_SRC_DECOMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common_f.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor_f.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor_f.o - $(ARM_AR) rcs $@ build/arm/common_f.o build/arm/compressor_f.o build/arm/decompressor_f.o - -# Build compressor-only library (with stream API, the default) -build/arm/tamp_comp_stream.a: $(C_SRC_COMMON) $(C_SRC_COMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) 
-DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common_cs.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor_s.o - $(ARM_AR) rcs $@ build/arm/common_cs.o build/arm/compressor_s.o - -# Build decompressor-only library (with stream API, the default) -build/arm/tamp_decomp_stream.a: $(C_SRC_COMMON) $(C_SRC_DECOMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common_ds.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor_s.o - $(ARM_AR) rcs $@ build/arm/common_ds.o build/arm/decompressor_s.o - -# Build full library (with stream API, the default) -build/arm/tamp_full_stream.a: $(C_SRC_COMMON) $(C_SRC_COMP) $(C_SRC_DECOMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common_fs.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor_fs.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor_fs.o - $(ARM_AR) rcs $@ build/arm/common_fs.o build/arm/compressor_fs.o build/arm/decompressor_fs.o - -c-size: - @rm -rf build/arm - @$(MAKE) --no-print-directory build/arm/tamp_comp_stream.a build/arm/tamp_decomp_stream.a build/arm/tamp_full_stream.a build/arm/tamp_comp.a build/arm/tamp_decomp.a build/arm/tamp_full.a - @size_comp=$$($(ARM_SIZE) -B --totals build/arm/tamp_comp.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - size_decomp=$$($(ARM_SIZE) -B --totals build/arm/tamp_decomp.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - size_full=$$($(ARM_SIZE) -B --totals build/arm/tamp_full.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - printf 'Tamp (C, -DTAMP_STREAM=0) %d %d %d\n' $$size_comp $$size_decomp $$size_full - @size_comp=$$($(ARM_SIZE) -B 
--totals build/arm/tamp_comp_stream.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - size_decomp=$$($(ARM_SIZE) -B --totals build/arm/tamp_decomp_stream.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - size_full=$$($(ARM_SIZE) -B --totals build/arm/tamp_full_stream.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - printf 'Tamp (C) %d %d %d\n' $$size_comp $$size_decomp $$size_full +# Flags to disable extended format support +NO_EXTENDED_FLAGS = -DTAMP_EXTENDED=0 + +c-size-no-extended: + @rm -rf build/arm && mkdir -p build/arm + @# No-extended without stream API + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_AR) rcs build/arm/noext_comp.a build/arm/common.o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/noext_decomp.a build/arm/common.o build/arm/decompressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/noext_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o + @# No-extended with stream API + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) 
-DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_AR) rcs build/arm/noext_comp_s.a build/arm/common.o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/noext_decomp_s.a build/arm/common.o build/arm/decompressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/noext_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o + @size_comp=$$($(ARM_SIZE) -B --totals build/arm/noext_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_decomp=$$($(ARM_SIZE) -B --totals build/arm/noext_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_full=$$($(ARM_SIZE) -B --totals build/arm/noext_full.a | grep TOTALS | awk '{print $$1+$$2}'); \ + printf '%-34s %10d %12d %25d\n' "Tamp (C, no extended, no stream)" $$size_comp $$size_decomp $$size_full + @size_comp=$$($(ARM_SIZE) -B --totals build/arm/noext_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_decomp=$$($(ARM_SIZE) -B --totals build/arm/noext_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_full=$$($(ARM_SIZE) -B --totals build/arm/noext_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + printf '%-34s %10d %12d %25d\n' "Tamp (C, no extended)" $$size_comp $$size_decomp 
$$size_full + +c-size-extended: + @rm -rf build/arm && mkdir -p build/arm + @# Extended without stream API + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_AR) rcs build/arm/ext_comp.a build/arm/common.o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/ext_decomp.a build/arm/common.o build/arm/decompressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/ext_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o + @# Extended with stream API + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_AR) rcs build/arm/ext_comp_s.a build/arm/common.o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/ext_decomp_s.a build/arm/common.o build/arm/decompressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c 
$(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/ext_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o + @size_comp=$$($(ARM_SIZE) -B --totals build/arm/ext_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_decomp=$$($(ARM_SIZE) -B --totals build/arm/ext_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_full=$$($(ARM_SIZE) -B --totals build/arm/ext_full.a | grep TOTALS | awk '{print $$1+$$2}'); \ + printf '%-34s %10d %12d %25d\n' "Tamp (C, extended, no stream)" $$size_comp $$size_decomp $$size_full + @size_comp=$$($(ARM_SIZE) -B --totals build/arm/ext_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_decomp=$$($(ARM_SIZE) -B --totals build/arm/ext_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_full=$$($(ARM_SIZE) -B --totals build/arm/ext_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + printf '%-34s %10d %12d %25d\n' "Tamp (C, extended)" $$size_comp $$size_decomp $$size_full + +c-size: c-size-no-extended c-size-extended binary-size: @echo "Binary sizes for armv6m (bytes):" @echo "" - @printf '%-27s %-10s %-12s %s\n' "" "Compressor" "Decompressor" "Compressor + Decompressor" - @printf '%-27s %-10s %-12s %s\n' "---------------------------" "----------" "------------" "-------------------------" - @output=$$($(MAKE) -s mpy-viper-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Viper) (requires mpy-cross)" - @output=$$($(MAKE) -s mpy-native-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Native) (requires MPY_DIR)" - @output=$$($(MAKE) -s c-size 2>&1) && echo "$$output" || echo "Tamp (C) (requires arm-none-eabi-gcc)" + @printf '%-34s %10s %12s %25s\n' "" "Compressor" "Decompressor" "Compressor + Decompressor" + @printf '%-34s %10s %12s %25s\n' 
"----------------------------------" "----------" "------------" "-------------------------" + @output=$$($(MAKE) -s mpy-viper-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Viper) (requires mpy-cross)" + @output=$$($(MAKE) -s mpy-native-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Native) (requires MPY_DIR)" + @output=$$($(MAKE) -s c-size 2>&1) && echo "$$output" || echo "Tamp (C) (requires arm-none-eabi-gcc)" ########## diff --git a/README.md b/README.md index 73fdf5b..2f338fc 100644 --- a/README.md +++ b/README.md @@ -34,11 +34,8 @@ of RAM and firmware storage. - `pip install tamp` will use a python-bound C implementation optimized for speed. - Micropython: - - Native Module (suggested micropython implementation). + - Native Module. - `mpy_bindings/` - - Viper. - - `tamp/__init__.py`, `tamp/compressor_viper.py`, - `tamp/decompressor_viper.py` - C library: - `tamp/_c_src/` - Javascript/Typescript via Emscripten WASM. @@ -56,14 +53,12 @@ of RAM and firmware storage. # Installation -Tamp contains 4 implementations: +Tamp contains 3 implementations: 1. A reference desktop CPython implementation that is optimized for readability (and **not** speed). 2. A Micropython Native Module implementation (fast). -3. A Micropython Viper implementation (not recommended, please use Native - Module). -4. A C implementation (with python bindings) for accelerated desktop use and to +3. A C implementation (with python bindings) for accelerated desktop use and to be used in C projects (very fast). This section instructs how to install each implementation. @@ -98,42 +93,6 @@ following to `pyproject.toml`. tamp = "https://github.com/BrianPugh/tamp/releases/download/v1.7.0/tamp-1.7.0-mpy1.23-armv6m.mpy" ``` -### MicroPython Viper - -**NOT RECOMMENDED, PLEASE USE NATIVE MODULE** - -For micropython use, there are 3 main files: - -1. `tamp/__init__.py` - Always required. -2. `tamp/decompressor_viper.py` - Required for on-device decompression. -3. 
`tamp/compressor_viper.py` - Required for on-device compression. - -For example, if on-device decompression isn't used, then do not include -`decompressor_viper.py`. If manually installing, just copy these files to your -microcontroller's `/lib/tamp` folder. - -If using -[mip](https://docs.micropython.org/en/latest/reference/packages.html#installing-packages-with-mip), -tamp can be installed by specifying the appropriate `package-*.json` file. - -```bash -mip install github:brianpugh/tamp # Defaults to package.json: Compressor & Decompressor -mip install github:brianpugh/tamp/package-compressor.json # Compressor only -mip install github:brianpugh/tamp/package-decompressor.json # Decompressor only -``` - -If using [Belay](https://github.com/BrianPugh/belay), tamp can be installed by -adding the following to `pyproject.toml`. - -```toml -[tool.belay.dependencies] -tamp = [ - "https://github.com/BrianPugh/tamp/blob/main/tamp/__init__.py", - "https://github.com/BrianPugh/tamp/blob/main/tamp/compressor_viper.py", - "https://github.com/BrianPugh/tamp/blob/main/tamp/decompressor_viper.py", -] -``` - ## C Copy the `tamp/_c_src/tamp` folder into your project. For more information, see @@ -258,31 +217,30 @@ input data sourced from the [Enwik8](https://mattmahoney.net/dc/textdata.html). This should give a general idea of how these algorithms perform over a variety of input data types. 
-| dataset | raw | tamp | tamp (LazyMatching) | zlib | heatshrink | -| --------------- | ----------- | -------------- | ------------------- | -------------- | ---------- | -| enwik8 | 100,000,000 | **51,635,633** | 51,252,113 | 56,205,166 | 56,110,394 | -| silesia/dickens | 10,192,446 | **5,546,761** | 5,511,604 | 6,049,169 | 6,155,768 | -| silesia/mozilla | 51,220,480 | 25,121,385 | 24,936,067 | **25,104,966** | 25,435,908 | -| silesia/mr | 9,970,564 | 5,027,032 | 4,886,272 | **4,864,734** | 5,442,180 | -| silesia/nci | 33,553,445 | 8,643,610 | 8,645,299 | **5,765,521** | 8,247,487 | -| silesia/ooffice | 6,152,192 | **3,814,938** | 3,798,261 | 4,077,277 | 3,994,589 | -| silesia/osdb | 10,085,684 | **8,520,835** | 8,506,443 | 8,625,159 | 8,747,527 | -| silesia/reymont | 6,627,202 | **2,847,981** | 2,820,870 | 2,897,661 | 2,910,251 | -| silesia/samba | 21,606,400 | 9,102,594 | 9,060,692 | **8,862,423** | 9,223,827 | -| silesia/sao | 7,251,944 | **6,137,755** | 6,101,744 | 6,506,417 | 6,400,926 | -| silesia/webster | 41,458,703 | **18,694,172** | 18,567,288 | 20,212,235 | 19,942,817 | -| silesia/x-ray | 8,474,240 | 7,510,606 | 7,405,814 | **7,351,750** | 8,059,723 | -| silesia/xml | 5,345,280 | 1,681,687 | 1,672,660 | **1,586,985** | 1,665,179 | - -Tamp usually out-performs heatshrink, and is generally very competitive with -zlib. While trying to be an apples-to-apples comparison, zlib still uses -significantly more memory during both compression and decompression (see next -section). Tamp accomplishes competitive performance while using around 10x less -memory. 
+| dataset | raw | tamp | tamp (LazyMatching) | zlib | heatshrink | +| --------------- | ----------- | ----------- | ------------------- | ------------- | ---------- | +| enwik8 | 100,000,000 | 51,017,102 | **50,626,118** | 56,205,166 | 56,110,394 | +| RPI_PICO (.uf2) | 667,648 | **289,204** | 290,442 | 303,763 | - | +| silesia/dickens | 10,192,446 | 5,538,353 | **5,502,834** | 6,049,169 | 6,155,768 | +| silesia/mozilla | 51,220,480 | 24,412,662 | **24,228,654** | 25,104,966 | 25,435,908 | +| silesia/mr | 9,970,564 | 4,519,402 | **4,393,009** | 4,864,734 | 5,442,180 | +| silesia/nci | 33,553,445 | 6,824,403 | 6,772,379 | **5,765,521** | 8,247,487 | +| silesia/ooffice | 6,152,192 | 3,773,089 | **3,755,153** | 4,077,277 | 3,994,589 | +| silesia/osdb | 10,085,684 | 8,466,875 | **8,464,328** | 8,625,159 | 8,747,527 | +| silesia/reymont | 6,627,202 | 2,818,554 | **2,788,774** | 2,897,661 | 2,910,251 | +| silesia/samba | 21,606,400 | 8,384,183 | **8,345,616** | 8,862,423 | 9,223,827 | +| silesia/sao | 7,251,944 | 6,136,077 | **6,100,061** | 6,506,417 | 6,400,926 | +| silesia/webster | 41,458,703 | 18,146,649 | **18,010,980** | 20,212,235 | 19,942,817 | +| silesia/x-ray | 8,474,240 | 7,509,449 | 7,404,794 | **7,351,750** | 8,059,723 | +| silesia/xml | 5,345,280 | 1,473,463 | **1,455,877** | 1,586,985 | 1,665,179 | + +Tamp outperforms both heatshrink and zlib on most datasets, winning 12 out of 14 +benchmarks. This is while using around 10x less memory than zlib during both +compression and decompression (see next section). Lazy Matching is a simple technique to improve compression ratios at the expense of CPU while requiring very little code. One can expect **50-75%** more CPU -usage for modest compression gains (around 0.5 - 2.0%). Because of this poor +usage for modest compression gains (around 0.5 - 2.0%). 
Because of this trade-off, it is disabled by default; however, in applications where we want to compress once on a powerful machine (like a desktop/server) and decompress on an embedded device, it may be worth it to spend a bit more compute. Lazy matched @@ -305,6 +263,33 @@ repeating data more efficiently. Given Tamp's excellent performance in most of the other data compression benchmark files, this is a good tradeoff for most real-world scenarios. +### Ablation Study + +The following table shows the effect of the `extended` and `lazy_matching` +compression parameters across all benchmark datasets (`window=10`, `literal=8`). + +| dataset | raw | Baseline | +lazy | +extended | +lazy +extended | +| --------------- | ----------- | ---------- | ------------------ | ------------------ | ------------------ | +| enwik8 | 100,000,000 | 51,635,633 | 51,252,694 (−0.7%) | 51,017,102 (−1.2%) | 50,626,118 (−2.0%) | +| RPI_PICO (.uf2) | 667,648 | 331,310 | 329,893 (−0.4%) | 289,204 (−12.7%) | 290,442 (−12.3%) | +| silesia/dickens | 10,192,446 | 5,546,761 | 5,511,681 (−0.6%) | 5,538,353 (−0.2%) | 5,502,834 (−0.8%) | +| silesia/mozilla | 51,220,480 | 25,121,385 | 24,937,036 (−0.7%) | 24,412,662 (−2.8%) | 24,228,654 (−3.6%) | +| silesia/mr | 9,970,564 | 5,027,032 | 4,888,930 (−2.7%) | 4,519,402 (−10.1%) | 4,393,009 (−12.6%) | +| silesia/nci | 33,553,445 | 8,643,610 | 8,645,399 (+0.0%) | 6,824,403 (−21.0%) | 6,772,379 (−21.6%) | +| silesia/ooffice | 6,152,192 | 3,814,938 | 3,798,393 (−0.4%) | 3,773,089 (−1.1%) | 3,755,153 (−1.6%) | +| silesia/osdb | 10,085,684 | 8,520,835 | 8,518,502 (−0.0%) | 8,466,875 (−0.6%) | 8,464,328 (−0.7%) | +| silesia/reymont | 6,627,202 | 2,847,981 | 2,820,948 (−0.9%) | 2,818,554 (−1.0%) | 2,788,774 (−2.1%) | +| silesia/samba | 21,606,400 | 9,102,594 | 9,061,143 (−0.5%) | 8,384,183 (−7.9%) | 8,345,616 (−8.3%) | +| silesia/sao | 7,251,944 | 6,137,755 | 6,101,747 (−0.6%) | 6,136,077 (−0.0%) | 6,100,061 (−0.6%) | +| silesia/webster | 41,458,703 | 18,694,172 
| 18,567,618 (−0.7%) | 18,146,649 (−2.9%) | 18,010,980 (−3.7%) | +| silesia/x-ray | 8,474,240 | 7,510,606 | 7,406,001 (−1.4%) | 7,509,449 (−0.0%) | 7,404,794 (−1.4%) | +| silesia/xml | 5,345,280 | 1,681,687 | 1,672,827 (−0.5%) | 1,473,463 (−12.4%) | 1,455,877 (−13.4%) | + +The `extended` parameter enables additional Huffman codes for longer pattern +matches, which significantly improves compression on datasets with many long +repeating patterns (e.g., nci, samba, xml). Extended support was added in +v2.0.0. + ## Memory Usage The following table shows approximately how much memory each algorithm uses @@ -331,7 +316,7 @@ on an M3 Macbook Air. | | Compression (s) | Decompression (s) | | ---------------------------- | --------------- | ----------------- | | Tamp (Pure Python Reference) | 136.2 | 105.0 | -| Tamp (C bindings) | 5.56 | 0.544 | +| Tamp (C bindings) | 5.45 | 0.544 | | ZLib | 3.65 | 0.578 | | Heatshrink (with index) | 4.42 | 0.67 | | Heatshrink (without index) | 27.40 | 0.67 | @@ -350,8 +335,7 @@ speed Tamp can achieve. In all tests, a 1KB window (10 bit) was used. | | Compression (bytes/s) | Decompression (bytes/s) | | -------------------------------- | --------------------- | ----------------------- | -| Tamp (MicroPython Viper) | 4,300 | 42,000 | -| Tamp (Micropython Native Module) | 31,192 | 1,086,957 | +| Tamp (Micropython Native Module) | 31,949 | 1,086,957 | | Tamp (C) | 36,127 | 1,400,600 | | Deflate (micropython builtin) | 6,885 | 294,985 | @@ -365,19 +349,20 @@ compiled for the Pi Pico (`armv6m`). All libraries were compiled with `-O3`. Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc` 15.2.1 and MicroPython v1.27, and can be regenerated with `make binary-size`. 
-| | Compressor | Decompressor | Compressor + Decompressor | -| ------------------------- | ---------- | ------------ | ------------------------- | -| Tamp (MicroPython Viper) | 4676 | 4372 | 7917 | -| Tamp (MicroPython Native) | 3896 | 3559 | 6616 | -| Tamp (C, -DTAMP_STREAM=0) | 2028 | 1992 | 3900 | -| Tamp (C) | 2472 | 2444 | 4796 | -| Heatshrink (C) | 2956 | 3876 | 6832 | -| uzlib (C) | 2355 | 3963 | 6318 | - -Tamp C includes a high-level stream API by default. Even with `-DTAMP_STREAM=0`, -Tamp includes buffer-looping functions (like `tamp_compressor_compress`) that -Heatshrink lacks (Heatshrink only provides poll/sink primitives). In an -apples-to-apples comparison, Tamp would be even smaller. +| | Compressor | Decompressor | Compressor + Decompressor | +| -------------------------------- | ---------- | ------------ | ------------------------- | +| Tamp (MicroPython Native) | 4708 | 4339 | 8124 | +| Tamp (C, no extended, no stream) | 1466 | 1312 | 2592 | +| Tamp (C, no extended) | 1748 | 1550 | 3112 | +| Tamp (C, extended, no stream) | 2558 | 2072 | 4444 | +| Tamp (C, extended) | 2840 | 2310 | 4964 | +| Heatshrink (C) | 2956 | 3876 | 6832 | +| uzlib (C) | 2355 | 3963 | 6318 | + +Tamp C "extended" includes `tamp_compressor_compress_and_flush`. Tamp C includes +a high-level stream API by default. Even with `no stream`, Tamp includes +buffer-looping functions (like `tamp_compressor_compress`) that Heatshrink lacks +(Heatshrink only provides poll/sink primitives). 
## Acknowledgement diff --git a/datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp b/datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp new file mode 100644 index 0000000..1232a50 --- /dev/null +++ b/datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb7efbdc484cbf400817074910275c3f2a89aec0ab96c8984fd58423e5e9290 +size 291036 diff --git a/datasets/extended-compressed/dickens.tamp b/datasets/extended-compressed/dickens.tamp new file mode 100644 index 0000000..a1ed82b --- /dev/null +++ b/datasets/extended-compressed/dickens.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db20dcfa7e76829a809a4c9d253f1b4e53b294e86db789490bc4fadb19ab5dc0 +size 5538332 diff --git a/datasets/extended-compressed/enwik8.tamp b/datasets/extended-compressed/enwik8.tamp new file mode 100644 index 0000000..edf7491 --- /dev/null +++ b/datasets/extended-compressed/enwik8.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24f3e70e49e7344ebbe3ab23d274910f7aff5bb1fb1212658b1f136a99d244f4 +size 51019055 diff --git a/datasets/extended-compressed/mozilla.tamp b/datasets/extended-compressed/mozilla.tamp new file mode 100644 index 0000000..b1ee2c5 --- /dev/null +++ b/datasets/extended-compressed/mozilla.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cedba7ad7710757f3c5e7bf8176fd92eede9b8e5be2e8e697f9a6dc15d45718 +size 24415401 diff --git a/datasets/extended-compressed/mr.tamp b/datasets/extended-compressed/mr.tamp new file mode 100644 index 0000000..404f417 --- /dev/null +++ b/datasets/extended-compressed/mr.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a619cf3440c067f6bd5eebcf6b2145d583feca83067244c0c2585aecb4b3cae +size 4519929 diff --git a/datasets/extended-compressed/nci.tamp b/datasets/extended-compressed/nci.tamp new file mode 100644 index 0000000..085b3bf --- /dev/null 
+++ b/datasets/extended-compressed/nci.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adf1eac94e48d44092a9e64a7e8d14e1452b357f176e2aba7ed26eb2b7340946 +size 6855616 diff --git a/datasets/extended-compressed/ooffice.tamp b/datasets/extended-compressed/ooffice.tamp new file mode 100644 index 0000000..d824344 --- /dev/null +++ b/datasets/extended-compressed/ooffice.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9376fe3e9f0286c1edbfafeadb1a8d997dd8524a32dbce7438636f270d61789 +size 3773219 diff --git a/datasets/extended-compressed/osdb.tamp b/datasets/extended-compressed/osdb.tamp new file mode 100644 index 0000000..5dac28b --- /dev/null +++ b/datasets/extended-compressed/osdb.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b3188714fc3361691b684fecbff75b2d2cb9e6e690887aec456469d7505a586 +size 8466736 diff --git a/datasets/extended-compressed/reymont.tamp b/datasets/extended-compressed/reymont.tamp new file mode 100644 index 0000000..1407234 --- /dev/null +++ b/datasets/extended-compressed/reymont.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e45bf12f0ca5971b47cfca38b2cc47216c93e13915b9b3ac19aa4195b9e87d5 +size 2818601 diff --git a/datasets/extended-compressed/samba.tamp b/datasets/extended-compressed/samba.tamp new file mode 100644 index 0000000..259d6cc --- /dev/null +++ b/datasets/extended-compressed/samba.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac74af80542ad88dd617b95047e2a77e820cfcac3ef17abb8904949b87fd605f +size 8386303 diff --git a/datasets/extended-compressed/sao.tamp b/datasets/extended-compressed/sao.tamp new file mode 100644 index 0000000..46c39e2 --- /dev/null +++ b/datasets/extended-compressed/sao.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c05ac1c7d78b04874f07e10265cd254ecf9d6dcf1a3f0d1ea695815509ff0b1 +size 6136077 diff --git 
a/datasets/extended-compressed/webster.tamp b/datasets/extended-compressed/webster.tamp new file mode 100644 index 0000000..6c6835a --- /dev/null +++ b/datasets/extended-compressed/webster.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1227df26234f9c9cf53d9be82d6a38ecd344db70bd6b25109a6e5ae1d4c1673f +size 18146647 diff --git a/datasets/extended-compressed/x-ray.tamp b/datasets/extended-compressed/x-ray.tamp new file mode 100644 index 0000000..981eb1d --- /dev/null +++ b/datasets/extended-compressed/x-ray.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ba0c1fb79addae24888c12a466e84b73c32ca608836c458487226d224a63fc3 +size 7509449 diff --git a/datasets/extended-compressed/xml.tamp b/datasets/extended-compressed/xml.tamp new file mode 100644 index 0000000..67e6882 --- /dev/null +++ b/datasets/extended-compressed/xml.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6adb3788d5644d28c56ea785ea4149a6f0b8ba5562c4f8dcc4a4ba8371920e97 +size 1473552 diff --git a/docs/source/c_library.rst b/docs/source/c_library.rst index dadf934..42e950a 100644 --- a/docs/source/c_library.rst +++ b/docs/source/c_library.rst @@ -5,6 +5,61 @@ C Library Tamp provides a C library optimized for low-memory-usage, fast runtime, and small binary footprint. This page describes how to use the provided library. +Compile-Time Flags +^^^^^^^^^^^^^^^^^^ +Tamp's C library can be customized via compile-time flags to control features, code size, and performance. +Pass these flags to your compiler (e.g., ``-DTAMP_STREAM=0``). 
+ ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| Flag | Default | Description | ++==================================+===================+==============================================================================+ +| ``TAMP_EXTENDED`` | ``1`` | Default value for extended format support (RLE, extended match encoding). | +| | | Set to ``0`` to disable extended support in both compressor and decompressor.| ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_EXTENDED_COMPRESS`` | ``TAMP_EXTENDED`` | Enable extended format compression. Defaults to ``TAMP_EXTENDED`` but can | +| | | be individually overridden for compressor-only or decompressor-only builds. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_EXTENDED_DECOMPRESS`` | ``TAMP_EXTENDED`` | Enable extended format decompression. Defaults to ``TAMP_EXTENDED`` but can | +| | | be individually overridden for compressor-only or decompressor-only builds. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_LAZY_MATCHING`` | ``0`` | Enable lazy matching support. When enabled, ``TampConf.lazy_matching`` | +| | | becomes available. Improves compression ratio by 0.5-2% at the cost of | +| | | 50-75% slower compression. Most embedded systems should leave disabled. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM`` | ``1`` | Include stream API (``tamp_compress_stream``, ``tamp_decompress_stream``). | +| | | Disable with ``-DTAMP_STREAM=0`` to save ~2.8KB if only using low-level API. 
| ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_WORK_BUFFER_SIZE`` | ``32`` | Stack-allocated work buffer size (bytes) for stream API. Split in half | +| | | for input/output. Larger values reduce I/O callback invocations, | +| | | improving decompression speed. 256+ bytes recommended when stack permits. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_STDIO`` | ``0`` | Enable stdio (``FILE*``) stream handlers. Works with standard C library, | +| | | ESP-IDF VFS, and POSIX-compatible systems. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_MEMORY`` | ``0`` | Enable memory buffer stream handlers (``TampMemReader``, ``TampMemWriter``). | +| | | Useful for file-to-memory or memory-to-file operations. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_LITTLEFS`` | ``0`` | Enable LittleFS stream handlers. Requires LittleFS headers. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_FATFS`` | ``0`` | Enable FatFs (ChaN's FAT filesystem) stream handlers. Requires FatFs headers.| ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_ESP32`` | ``0`` | Use ESP32-optimized variant. Avoids bitfields for speed at the cost of | +| | | slightly higher memory usage. Automatically enabled via Kconfig on ESP-IDF. 
| ++----------------------------------+-------------------+------------------------------------------------------------------------------+ + +**Example: Minimal decompressor-only build** + +.. code-block:: bash + + gcc -DTAMP_EXTENDED_COMPRESS=0 -DTAMP_STREAM=0 -c decompressor.c common.c + +**Example: Full-featured build with LittleFS support** + +.. code-block:: bash + + gcc -DTAMP_LAZY_MATCHING=1 -DTAMP_STREAM_LITTLEFS=1 -DTAMP_STREAM_WORK_BUFFER_SIZE=256 \ + -c compressor.c decompressor.c common.c + Overview ^^^^^^^^ To use Tamp in your C project, simply copy the contents of ``tamp/_c_src`` into your project. diff --git a/docs/source/javascript.rst b/docs/source/javascript.rst index 39bbe21..6972bef 100644 --- a/docs/source/javascript.rst +++ b/docs/source/javascript.rst @@ -51,6 +51,12 @@ Customize compression behavior with options: // For general use, 8 (the whole byte) is appropriate. literal: 7, + // Enable extended format (RLE, extended match) for better compression ratios. + // The extended format provides better compression for typical data at the + // cost of slightly more complex encoding. + // Default: true + extended: true, + // Enable lazy matching to slightly improve compression (0.5-2.0%) ratios // at the cost of 50-75% slower compression. // Most embedded systems will **not** want to use this feature and disable it. @@ -136,6 +142,7 @@ Configure compression parameters by passing in options: const options = { window: 12, // Larger window for (usually) better compression literal: 7, // ASCII text only requires 7 bits. + extended: true, // Enable extended format (RLE, extended match) lazy_matching: true // Better compression ratios; slower to compress }; diff --git a/docs/source/specification.rst b/docs/source/specification.rst index f612893..31bb102 100644 --- a/docs/source/specification.rst +++ b/docs/source/specification.rst @@ -26,7 +26,8 @@ The bit-location 0 is equivalent to typical MSb position 7 of the first byte. 
| [2] | custom_dictionary | A custom dictionary initialization method was used | | | | and must be provided at decompression. | +---------+-------------------+---------------------------------------------------------------------+ -| [1] | reserved | Reserved for future use. Must be 0. | +| [1] | extended | Enables extended format features (RLE, extended match encoding). | +| | | Generally improves compression, introduced in tamp v2.0.0. | +---------+-------------------+---------------------------------------------------------------------+ | [0] | more_header | If ``True``, then the next byte in the stream is more header data. | | | | Currently always ``False``, but allows for future expandability. | @@ -60,8 +61,9 @@ Modifications are made to make the implementation simpler/faster. and points at the offset from the beginning of the dictionary buffer to the pattern. The shortest pattern-length is either going to be 2 or 3 bytes, depending on ``window`` and ``literal`` parameters. The shortest pattern-length encoding must be shorter than - an equivalent stream of literals. The longest pattern-length will the minimum - pattern-length plus 13. + an equivalent stream of literals. In the basic (non-extended) format, the longest + pattern-length is the minimum pattern-length plus 13. When the ``extended`` flag + is set, longer matches are possible via extended match encoding. Classically, the ``offset`` is from the current position in the buffer. Doing so results in the ``offset`` distribution slightly favoring smaller numbers. Intuitively, it makes @@ -167,6 +169,87 @@ The maximum match-size is more likely than the second-highest match-size because For any given huffman coding schema, a equivalent coding can be obtained by inverting all the bits (reflecting the huffman tree). The single-bit, most common code ``0b0`` representing a pattern-size 2 is intentionally represented as ``0b0`` instead of ``0b1``. 
This makes the MSb of all other codes be 1, simplifying the decoding procedure because the number of bits read doesn't strictly have to be recorded. +Extended Format (v2.0.0+) +^^^^^^^^^^^^^^^^^^^^^^^^^ +When the ``extended`` header bit is set, two additional token types are available: +RLE (Run-Length Encoding) and Extended Match. These use Huffman symbols 12 and 13 +respectively, which in the basic format would represent match sizes ``min_pattern_size + 12`` +and ``min_pattern_size + 13``. + +Extended Huffman Encoding +------------------------- +Both RLE and Extended Match use a secondary Huffman encoding to represent their payload values. +This encoding combines a Huffman code (without the literal flag) with trailing bits: + +1. Read the Huffman symbol (12 for RLE, 13 for Extended Match) with the literal flag (``0b0``). +2. Decode an additional Huffman code (reusing the same table, but without the leading literal flag bit). +3. Read trailing bits (4 bits for RLE, 3 bits for Extended Match). +4. Combine: ``value = (huffman_index << num_trailing_bits) + trailing_bits_value``, where ``num_trailing_bits`` is 4 for RLE and 3 for Extended Match. + +RLE Token (Symbol 12) +--------------------- +RLE encodes runs of repeated bytes efficiently. The repeated byte is implicitly +the last byte written to the window buffer. If no bytes have been written yet +(i.e., ``window_pos == 0``), the byte at position ``window_size - 1`` of the +initial dictionary is used. + +Format: ``0b0 | huffman_code[12] | extended_huffman(count - 2, trailing=4)`` + +Where: + +- ``huffman_code[12]`` = ``0xAA`` (9 bits including literal flag) +- ``extended_huffman`` encodes ``count - 2`` with 4 trailing bits +- ``count`` ranges from 2 to 225: ``(13 << 4) + 15 + 2 = 225`` + +Window update: Only the first 8 bytes are written to the dictionary (no wrap-around). +If fewer than 8 bytes remain before the end of the window buffer, only those bytes +are written. This bounds the window update cost while still allowing the decompressor +to find subsequent pattern matches. + +.. 
code-block:: text + + RLE Token Structure: + +---+------------+-------------------+----------------+ + | 0 | huffman[12]| huffman(cnt>>4) | cnt & 0xF | + +---+------------+-------------------+----------------+ + |1b | 8 bits | 1-8 bits | 4 bits | + +---+------------+-------------------+----------------+ + + Where cnt = count - 2 + +Extended Match Token (Symbol 13) +-------------------------------- +Extended Match allows pattern matches longer than the basic format's maximum of +``min_pattern_size + 13``. It is used when a match exceeds ``min_pattern_size + 11``. + +Format: ``0b0 | huffman_code[13] | extended_huffman(size - min_pattern_size - 12, trailing=3) | offset`` + +Where: + +- ``huffman_code[13]`` = ``0x27`` (7 bits including literal flag) +- ``extended_huffman`` encodes ``size - min_pattern_size - 12`` with 3 trailing bits +- ``offset`` is ``window`` bits, pointing to the start of the pattern +- Maximum extra size: ``(13 << 3) + 7 + 1 = 112`` +- Maximum total match size: ``min_pattern_size + 11 + 112 = min_pattern_size + 123`` + +The ``-12`` offset ensures extended matches start at ``min_pattern_size + 12``, leaving +symbols 0-11 for basic matches (0-11 maps to ``min_pattern_size`` through ``min_pattern_size + 11``). + +Window constraints: The source pattern cannot span past the window buffer boundary; +the compressor terminates extended matches early if they would cross this boundary. +Similarly, destination writes do not wrap-around; only bytes up to the end of the +window buffer are written. This simplifies implementation while having minimal +impact on compression ratio (approximately 0.02% loss). + +.. 
code-block:: text + + Extended Match Token Structure: + +---+------------+-------------------+----------------+--------+ + | 0 | huffman[13]| huffman(sz>>3) | sz & 0x7 | offset | + +---+------------+-------------------+----------------+--------+ + |1b | 6 bits | 1-8 bits | 3 bits | window | + +---+------------+-------------------+----------------+--------+ + + Where sz = match_size - min_pattern_size - 12 + Flush Symbol ------------ A special FLUSH symbol is encoded as the least likely Huffman code. diff --git a/espidf/tamp/compressor_esp32.cpp b/espidf/tamp/compressor_esp32.cpp index afa0c9b..a17570c 100644 --- a/espidf/tamp/compressor_esp32.cpp +++ b/espidf/tamp/compressor_esp32.cpp @@ -27,7 +27,7 @@ typedef uint32_t u16; #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX_PATTERN_SIZE (compressor->min_pattern_size + 13) -#define WINDOW_SIZE (1 << compressor->conf_window) +#define WINDOW_SIZE (1 << compressor->conf.window) static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, u8 n_bits) { compressor->bit_buffer_pos += n_bits; diff --git a/mpy_bindings/bindings.c b/mpy_bindings/bindings.c index d75d559..0cdf9d0 100644 --- a/mpy_bindings/bindings.c +++ b/mpy_bindings/bindings.c @@ -9,13 +9,14 @@ **********/ #include "tamp/common.h" -#define CHUNK_SIZE 32 // Must be <= 65535 +#define CHUNK_SIZE 32 // Must be >= 32 and <= 65535 +_Static_assert(CHUNK_SIZE >= 32, "CHUNK_SIZE must be >= 32 to hold flush output"); #define mp_type_bytearray (*(mp_obj_type_t *)(mp_load_global(MP_QSTR_bytearray))) static void TAMP_CHECK(tamp_res res) { - if (res == TAMP_EXCESS_BITS) { + if (TAMP_UNLIKELY(res == TAMP_EXCESS_BITS)) { nlr_raise(mp_obj_new_exception(mp_load_global(MP_QSTR_ExcessBitsError))); - } else if (res < TAMP_OK) { + } else if (TAMP_UNLIKELY(res < TAMP_OK)) { mp_raise_ValueError(""); } } @@ -67,6 +68,7 @@ static mp_obj_t compressor_make_new(const mp_obj_type_t *type, size_t n_args, si .window = mp_obj_get_int(args_in[1]), .literal = 
mp_obj_get_int(args_in[2]), .use_custom_dictionary = mp_obj_get_int(args_in[4]), + .extended = mp_obj_get_int(args_in[5]), }; mp_obj_compressor_t *o = mp_obj_malloc(mp_obj_compressor_t, type); @@ -75,7 +77,7 @@ static mp_obj_t compressor_make_new(const mp_obj_type_t *type, size_t n_args, si mp_buffer_info_t dictionary_buffer_info; mp_get_buffer_raise(o->dictionary, &dictionary_buffer_info, MP_BUFFER_RW); - if (dictionary_buffer_info.len < (1 << conf.window)) { + if (TAMP_UNLIKELY(dictionary_buffer_info.len < (1 << conf.window))) { mp_raise_ValueError(""); } @@ -175,7 +177,7 @@ static mp_obj_t decompressor_make_new(const mp_obj_type_t *type, size_t n_args, const uint16_t window_size = 1 << conf.window; if (o->dictionary == mp_const_none) { - if (conf.use_custom_dictionary) { + if (TAMP_UNLIKELY(conf.use_custom_dictionary)) { mp_raise_ValueError(""); } o->dictionary = mp_obj_new_bytearray_by_ref(window_size, m_malloc(window_size)); @@ -184,7 +186,7 @@ static mp_obj_t decompressor_make_new(const mp_obj_type_t *type, size_t n_args, { mp_buffer_info_t dictionary_buffer_info; mp_get_buffer_raise(o->dictionary, &dictionary_buffer_info, MP_BUFFER_RW); - if (dictionary_buffer_info.len < window_size) { + if (TAMP_UNLIKELY(dictionary_buffer_info.len < window_size)) { mp_raise_ValueError(""); } diff --git a/mpy_bindings/bindings_compressor.py b/mpy_bindings/bindings_compressor.py index 414ae6a..9fcbb81 100644 --- a/mpy_bindings/bindings_compressor.py +++ b/mpy_bindings/bindings_compressor.py @@ -9,6 +9,7 @@ def __init__( window=10, literal=8, dictionary=None, + extended=True, ): self._cf = False # shorter name to save binary space if not hasattr(f, "write"): # It's probably a path-like object. 
@@ -18,7 +19,7 @@ def __init__( custom = dictionary is not None if not dictionary: dictionary = bytearray(1 << window) - self._c = _C(f, window, literal, dictionary, custom) + self._c = _C(f, window, literal, dictionary, custom, extended) self.write = self._c.write diff --git a/pyproject.toml b/pyproject.toml index cf5b40e..d2b2b71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,9 @@ unittest = [ ] uprofiler = "https://github.com/BrianPugh/micropython-libs/blob/main/lib/uprofiler.py" +[tool.pytest.ini_options] +testpaths = ["tests"] + [tool.coverage.run] branch = true omit = [ diff --git a/tamp/_c_compressor.pyx b/tamp/_c_compressor.pyx index a3e403d..9c19f0b 100644 --- a/tamp/_c_compressor.pyx +++ b/tamp/_c_compressor.pyx @@ -35,6 +35,7 @@ cdef class Compressor: int literal=8, dictionary=None, bool lazy_matching=False, + bool extended=True, ): cdef ctamp.TampConf conf @@ -55,6 +56,7 @@ cdef class Compressor: # Set lazy_matching - this field is conditionally compiled based on TAMP_LAZY_MATCHING # The build system defines this macro, so the field should be available conf.lazy_matching = lazy_matching + conf.extended = extended self._window_buffer = dictionary if dictionary else bytearray(1 << window) self._window_buffer_ptr = self._window_buffer @@ -102,7 +104,7 @@ cdef class Compressor: cpdef int flush(self, bool write_token = True) except -1: cdef ctamp.tamp_res res - cdef bytearray buffer = bytearray(24) + cdef bytearray buffer = bytearray(32) cdef size_t output_written_size = 0 res = ctamp.tamp_compressor_flush( diff --git a/tamp/_c_src/tamp/common.c b/tamp/_c_src/tamp/common.c index f88dd34..be0099a 100644 --- a/tamp/_c_src/tamp/common.c +++ b/tamp/_c_src/tamp/common.c @@ -24,7 +24,7 @@ static inline uint32_t xorshift32(uint32_t *state) { return x; } -void tamp_initialize_dictionary(unsigned char *buffer, size_t size) { +TAMP_OPTIMIZE_SIZE void tamp_initialize_dictionary(unsigned char *buffer, size_t size) { uint32_t seed = 3758097560; // This 
was experimentally discovered with tools/find_seed.py uint32_t randbuf = 0; for (size_t i = 0; i < size; i++) { @@ -34,10 +34,40 @@ void tamp_initialize_dictionary(unsigned char *buffer, size_t size) { } } -int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal) { +TAMP_OPTIMIZE_SIZE int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal) { return 2 + (window > (10 + ((literal - 5) << 1))); } +void tamp_window_copy(unsigned char *window, uint16_t *window_pos, uint16_t window_offset, uint8_t match_size, + uint16_t window_mask) { + /* Calculate distance from source to destination in circular buffer. + * src_to_dst = (dst - src) & mask gives the forward distance. */ + const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask; + + /* Critical overlap case: destination is AHEAD of source and they overlap. + * When dst > src by less than match_size, a forward copy corrupts data because + * we write to positions before reading from them. + * + * Example: src=100, dst=105, match_size=8 + * - Forward copy at i=5 would read window[105], but we already overwrote it at i=0! + * - Must copy in REVERSE order (end to start) to read source bytes before overwriting. + */ + if (TAMP_UNLIKELY(src_to_dst < match_size && src_to_dst > 0)) { + /* Copy in reverse order: start from last byte, work backwards to first byte. + * This ensures we read all overlapping source bytes before they're overwritten. + * Destination wraps via mask; source doesn't need wrapping (pre-validated bounds). 
*/ + for (uint8_t i = match_size; i-- > 0;) { + window[(*window_pos + i) & window_mask] = window[window_offset + i]; + } + *window_pos = (*window_pos + match_size) & window_mask; + } else { + for (uint8_t i = 0; i < match_size; i++) { + window[*window_pos] = window[window_offset + i]; + *window_pos = (*window_pos + 1) & window_mask; + } + } +} + /******************************************************************************* * Built-in I/O handler implementations ******************************************************************************/ diff --git a/tamp/_c_src/tamp/common.h b/tamp/_c_src/tamp/common.h index 0b2b8e4..11006f4 100644 --- a/tamp/_c_src/tamp/common.h +++ b/tamp/_c_src/tamp/common.h @@ -39,15 +39,53 @@ extern "C" { #define TAMP_UNLIKELY(c) (c) #endif +/* Per-function optimize attributes and #pragma GCC push/pop_options require + * GCC on a target that supports them. Xtensa GCC does not. */ +#if defined(__GNUC__) && !defined(__clang__) && !defined(__XTENSA__) +#define TAMP_HAS_GCC_OPTIMIZE 1 +#else +#define TAMP_HAS_GCC_OPTIMIZE 0 +#endif + #if defined(_MSC_VER) #define TAMP_ALWAYS_INLINE __forceinline #define TAMP_NOINLINE __declspec(noinline) -#elif defined(__GNUC__) || defined(__clang__) +#define TAMP_OPTIMIZE_SIZE /* not supported */ +#elif defined(__GNUC__) && !defined(__clang__) +#define TAMP_ALWAYS_INLINE inline __attribute__((always_inline)) +#define TAMP_NOINLINE __attribute__((noinline)) +#if TAMP_HAS_GCC_OPTIMIZE +#define TAMP_OPTIMIZE_SIZE __attribute__((optimize("Os"))) +#else +#define TAMP_OPTIMIZE_SIZE +#endif +#elif defined(__clang__) #define TAMP_ALWAYS_INLINE inline __attribute__((always_inline)) #define TAMP_NOINLINE __attribute__((noinline)) +#define TAMP_OPTIMIZE_SIZE /* clang doesn't support per-function optimize */ #else #define TAMP_ALWAYS_INLINE inline #define TAMP_NOINLINE +#define TAMP_OPTIMIZE_SIZE +#endif + +/* TAMP_USE_MEMSET: Use libc memset (default: 1). + * Set to 0 for environments without libc (e.g. 
MicroPython native modules). + * When disabled, uses a volatile loop that prevents GCC from emitting a + * memset call at the cost of inhibiting store coalescing. */ +#ifndef TAMP_USE_MEMSET +#define TAMP_USE_MEMSET 1 +#endif + +#if TAMP_USE_MEMSET +#include <string.h> +#define TAMP_MEMSET(dst, val, n) memset((dst), (val), (n)) +#else +#define TAMP_MEMSET(dst, val, n) \ + do { \ + volatile unsigned char *_tamp_p = (volatile unsigned char *)(dst); \ + for (size_t _tamp_i = 0; _tamp_i < (n); _tamp_i++) _tamp_p[_tamp_i] = (val); \ + } while (0) #endif /* Include stream API (tamp_compress_stream, tamp_decompress_stream). @@ -68,6 +106,32 @@ extern "C" { #define TAMP_STREAM_WORK_BUFFER_SIZE 32 #endif +/* Extended format support (RLE, extended match). + * Enabled by default. Disable to save code size on minimal builds. + * + * TAMP_EXTENDED is the master switch (default: 1). + * TAMP_EXTENDED_COMPRESS and TAMP_EXTENDED_DECOMPRESS default to TAMP_EXTENDED, + * but can be individually overridden for compressor-only or decompressor-only builds. + */ +#ifndef TAMP_EXTENDED +#define TAMP_EXTENDED 1 +#endif +#ifndef TAMP_EXTENDED_DECOMPRESS +#define TAMP_EXTENDED_DECOMPRESS TAMP_EXTENDED +#endif +#ifndef TAMP_EXTENDED_COMPRESS +#define TAMP_EXTENDED_COMPRESS TAMP_EXTENDED +#endif + +/* Extended encoding constants */ +#if TAMP_EXTENDED_DECOMPRESS || TAMP_EXTENDED_COMPRESS +#define TAMP_RLE_SYMBOL 12 +#define TAMP_EXTENDED_MATCH_SYMBOL 13 +#define TAMP_LEADING_EXTENDED_MATCH_BITS 3 +#define TAMP_LEADING_RLE_BITS 4 +#define TAMP_RLE_MAX_WINDOW 8 +#endif + enum { /* Normal/Recoverable status >= 0 */ TAMP_OK = 0, @@ -93,6 +157,7 @@ typedef struct TampConf { uint16_t window : 4; // number of window bits uint16_t literal : 4; // number of literal bits uint16_t use_custom_dictionary : 1; // Use a custom initialized dictionary. + uint16_t extended : 1; // Extended format (RLE, extended match). Read from header bit [1]. 
#if TAMP_LAZY_MATCHING uint16_t lazy_matching : 1; // use Lazy Matching (spend 50-75% more CPU for around 0.5-2.0% better compression.) // only effects compression operations. @@ -297,6 +362,26 @@ void tamp_initialize_dictionary(unsigned char *buffer, size_t size); */ int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal); +/** + * @brief Copy pattern from window to window, updating window_pos. + * + * Handles potential overlap between source and destination regions by + * copying backwards when the destination would "catch up" to the source. + * + * IMPORTANT: Caller must validate that (window_offset + match_size) does not + * exceed window bounds before calling this function. This function assumes + * window_offset and match_size are pre-validated and does not perform + * bounds checking on source reads. + * + * @param window Circular buffer (size must be power of 2) + * @param window_pos Current write position (updated by this function) + * @param window_offset Source position to copy from + * @param match_size Number of bytes to copy + * @param window_mask Bitmask for wrapping (window_size - 1) + */ +void tamp_window_copy(unsigned char *window, uint16_t *window_pos, uint16_t window_offset, uint8_t match_size, + uint16_t window_mask); + #ifdef __cplusplus } #endif diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c index 0e06100..97760e2 100644 --- a/tamp/_c_src/tamp/compressor.c +++ b/tamp/_c_src/tamp/compressor.c @@ -9,21 +9,42 @@ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)])) +#if TAMP_EXTENDED_COMPRESS +// Extended max pattern: min_pattern_size + 11 + 112 = min_pattern_size + 123 +#define MAX_PATTERN_SIZE_EXTENDED (compressor->min_pattern_size + 123) +#define MAX_PATTERN_SIZE (compressor->conf.extended ? 
MAX_PATTERN_SIZE_EXTENDED : (compressor->min_pattern_size + 13)) +#else #define MAX_PATTERN_SIZE (compressor->min_pattern_size + 13) -#define WINDOW_SIZE (1 << compressor->conf_window) +#endif +#define WINDOW_SIZE (1 << compressor->conf.window) // 0xF because sizeof(TampCompressor.input) == 16; #define input_add(offset) ((compressor->input_pos + offset) & 0xF) #define read_input(offset) (compressor->input[input_add(offset)]) -#define IS_LITERAL_FLAG (1 << compressor->conf_literal) +#define IS_LITERAL_FLAG (1 << compressor->conf.literal) #define FLUSH_CODE (0xAB) +// Internal return value for poll_extended_handling: signals caller to +// proceed with normal pattern matching rather than returning immediately. +#define TAMP_POLL_CONTINUE ((tamp_res)127) + // encodes [min_pattern_bytes, min_pattern_bytes + 13] pattern lengths static const uint8_t huffman_codes[] = {0x0, 0x3, 0x8, 0xb, 0x14, 0x24, 0x26, 0x2b, 0x4b, 0x54, 0x94, 0x95, 0xaa, 0x27}; // These bit lengths pre-add the 1 bit for the 0-value is_literal flag. static const uint8_t huffman_bits[] = {0x2, 0x3, 0x5, 0x5, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0x7}; -static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, uint8_t n_bits) { +#if TAMP_EXTENDED_COMPRESS +#define RLE_MAX_COUNT ((13 << 4) + 15 + 2) // 225 +#define EXTENDED_MATCH_MAX_EXTRA ((13 << 3) + 7 + 1) // 112 + +// Minimum output buffer space required for extended match token. +// Extended match: symbol (7 bits) + extended huffman (11 bits) + window pos (15 bits) = 33 bits. +// With 7 bits in bit buffer, need up to 40 bits = 5 bytes. Add 1 byte margin. +// Pre-checking prevents OUTPUT_FULL mid-token, which would corrupt bit_buffer on retry. 
+#define EXTENDED_MATCH_MIN_OUTPUT_BYTES 6 +#endif + +static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor* compressor, uint32_t bits, uint8_t n_bits) { compressor->bit_buffer_pos += n_bits; compressor->bit_buffer |= bits << (32 - compressor->bit_buffer_pos); } @@ -31,18 +52,27 @@ static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits /** * @brief Partially flush the internal bit buffer. * - * Up to 7 bits may remain in the internal bit buffer. + * Flushes complete bytes from the bit buffer. Up to 7 bits may remain. + * + * @param[in,out] compressor Compressor state. + * @param[in,out] output Output buffer pointer (updated on return). + * @param[in,out] output_size Available space (updated on return). + * @param[in,out] output_written_size Bytes written (accumulated). + * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small. */ -static inline tamp_res partial_flush(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size) { - for (*output_written_size = output_size; compressor->bit_buffer_pos >= 8 && output_size; - output_size--, compressor->bit_buffer_pos -= 8, compressor->bit_buffer <<= 8) - *output++ = compressor->bit_buffer >> 24; - *output_written_size -= output_size; +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE tamp_res partial_flush(TampCompressor* compressor, unsigned char** output, + size_t* output_size, size_t* output_written_size) { + while (compressor->bit_buffer_pos >= 8 && *output_size) { + *(*output)++ = compressor->bit_buffer >> 24; + (*output_size)--; + (*output_written_size)++; + compressor->bit_buffer_pos -= 8; + compressor->bit_buffer <<= 8; + } return (compressor->bit_buffer_pos >= 8) ? 
TAMP_OUTPUT_FULL : TAMP_OK; } -inline bool tamp_compressor_full(const TampCompressor *compressor) { +inline bool tamp_compressor_full(const TampCompressor* compressor) { return compressor->input_size == sizeof(compressor->input); } @@ -63,7 +93,7 @@ inline bool tamp_compressor_full(const TampCompressor *compressor) { */ #if TAMP_ESP32 -extern void find_best_match(TampCompressor *compressor, uint16_t *match_index, uint8_t *match_size); +extern void find_best_match(TampCompressor* compressor, uint16_t* match_index, uint8_t* match_size); #elif (defined(__x86_64__) || defined(__aarch64__) || defined(_M_X64) || defined(_M_ARM64)) && !TAMP_USE_EMBEDDED_MATCH #include "compressor_find_match_desktop.c" @@ -78,7 +108,7 @@ extern void find_best_match(TampCompressor *compressor, uint16_t *match_index, u * @param[out] match_index If match_size is 0, this value is undefined. * @param[out] match_size Size of best found match. */ -static inline void find_best_match(TampCompressor *compressor, uint16_t *match_index, uint8_t *match_size) { +static TAMP_NOINLINE void find_best_match(TampCompressor* compressor, uint16_t* match_index, uint8_t* match_size) { *match_size = 0; if (TAMP_UNLIKELY(compressor->input_size < compressor->min_pattern_size)) return; @@ -87,7 +117,7 @@ static inline void find_best_match(TampCompressor *compressor, uint16_t *match_i const uint8_t second_byte = read_input(1); const uint32_t window_size_minus_1 = WINDOW_SIZE - 1; const uint8_t max_pattern_size = MIN(compressor->input_size, MAX_PATTERN_SIZE); - const unsigned char *window = compressor->window; + const unsigned char* window = compressor->window; for (uint32_t window_index = 0; window_index < window_size_minus_1; window_index++) { if (TAMP_LIKELY(window[window_index] != first_byte)) { @@ -135,29 +165,34 @@ static inline bool validate_no_match_overlap(uint16_t write_pos, uint16_t match_ } #endif -tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf, unsigned char *window) { 
+TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_init(TampCompressor* compressor, const TampConf* conf, + unsigned char* window) { const TampConf conf_default = { .window = 10, .literal = 8, .use_custom_dictionary = false, #if TAMP_LAZY_MATCHING .lazy_matching = false, +#endif +#if TAMP_EXTENDED_COMPRESS + .extended = true, // Default to extended format #endif }; if (!conf) conf = &conf_default; if (conf->window < 8 || conf->window > 15) return TAMP_INVALID_CONF; if (conf->literal < 5 || conf->literal > 8) return TAMP_INVALID_CONF; +#if !TAMP_EXTENDED_COMPRESS + if (conf->extended) return TAMP_INVALID_CONF; // Extended requested but not compiled in +#endif - for (uint8_t i = 0; i < sizeof(TampCompressor); i++) // Zero-out the struct - ((unsigned char *)compressor)[i] = 0; + TAMP_MEMSET(compressor, 0, sizeof(TampCompressor)); - compressor->conf_literal = conf->literal; - compressor->conf_window = conf->window; - compressor->conf_use_custom_dictionary = conf->use_custom_dictionary; -#if TAMP_LAZY_MATCHING - compressor->conf_lazy_matching = conf->lazy_matching; -#endif + // Build header directly from conf (8 bits total) + // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1] + uint8_t header = ((conf->window - 8) << 5) | ((conf->literal - 5) << 3) | (conf->use_custom_dictionary << 2) | + (conf->extended << 1); + compressor->conf = *conf; // Single struct copy compressor->window = window; compressor->min_pattern_size = tamp_compute_min_pattern_size(conf->window, conf->literal); @@ -165,22 +200,274 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf, compressor->cached_match_index = -1; // Initialize cache as invalid #endif - if (!compressor->conf_use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window)); + if (!conf->use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window)); - // Write header to bit buffer - write_to_bit_buffer(compressor, conf->window - 8, 3); - 
write_to_bit_buffer(compressor, conf->literal - 5, 2); - write_to_bit_buffer(compressor, conf->use_custom_dictionary, 1); - write_to_bit_buffer(compressor, 0, 1); // Reserved - write_to_bit_buffer(compressor, 0, 1); // No more header bytes + write_to_bit_buffer(compressor, header, 8); return TAMP_OK; } -TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size) { +#if TAMP_EXTENDED_COMPRESS +/** + * @brief Write extended huffman encoding (huffman + trailing bits). + * + * Used for both RLE count and extended match size encoding. + * + * @param[in,out] compressor Compressor with bit buffer. + * @param[in] value The value to encode. + * @param[in] trailing_bits Number of trailing bits (3 for extended match, 4 for RLE). + */ +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE void write_extended_huffman(TampCompressor* compressor, uint8_t value, + uint8_t trailing_bits) { + uint8_t code_index = value >> trailing_bits; + // Write huffman code (without literal flag) + trailing bits in one call + write_to_bit_buffer(compressor, (huffman_codes[code_index] << trailing_bits) | (value & ((1 << trailing_bits) - 1)), + (huffman_bits[code_index] - 1) + trailing_bits); +} + +/** + * @brief Get the last byte written to the window. + * + * NOINLINE: called from 3 sites; outlining saves ~44 bytes on armv6m. + */ +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE uint8_t get_last_window_byte(TampCompressor* compressor) { + uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf.window) - 1); + return compressor->window[prev_pos]; +} + +/** + * @brief Search for extended match continuation using implicit pattern comparison. + * + * Searches for pattern: window[current_pos:current_pos+current_count] + input[0...] + * starting from current_pos. Returns the longest match found (which may be at + * current_pos itself if O(1) extension works, or at a different position). 
+ * + * NOINLINE + Os: Called only during extended match continuation (rare path). + * Outlining saves ~100 bytes in poll on armv6m. + * + * @param[in] compressor TampCompressor object + * @param[in] current_pos Current match position in window (also search start) + * @param[in] current_count Current match length + * @param[out] new_pos Position of found match (only valid if new_count > current_count) + * @param[out] new_count Length of found match + */ +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE void find_extended_match(TampCompressor* compressor, uint16_t current_pos, + uint8_t current_count, uint16_t* new_pos, + uint8_t* new_count) { + // Preconditions (guaranteed by caller): + // - input_size > 0 + // - current_pos + current_count < WINDOW_SIZE + // - current_count < MAX_PATTERN_SIZE + *new_count = 0; + const unsigned char* window = compressor->window; + const uint16_t window_size = WINDOW_SIZE; + const uint8_t max_pattern = MIN(current_count + compressor->input_size, MAX_PATTERN_SIZE); + const uint8_t extend_byte = read_input(0); + + for (uint16_t cand = current_pos; cand + current_count + 1 <= window_size; cand++) { + // Check extension byte first (most discriminating) + if (window[cand + current_count] != extend_byte) continue; + + // Check if current_count bytes match (at cand==current_pos, compares with self) + uint8_t i = 0; + while (i < current_count && window[cand + i] == window[current_pos + i]) i++; + if (i < current_count) continue; + + // Found a match - extend as far as possible + const uint8_t cand_max = MIN(max_pattern, window_size - cand); + uint8_t match_len = current_count + 1; + for (i = current_count + 1; i < cand_max; i++) { + if (window[cand + i] != read_input(i - current_count)) break; + match_len = i + 1; + } + + if (match_len > *new_count) { + *new_count = match_len; + *new_pos = cand; + if (match_len == max_pattern) return; + } + } +} + +/** + * @brief Write RLE token to bit buffer and update window. 
+ * + * @param[in,out] compressor Compressor state. + * @param[in] count Number of repeated bytes (must be >= 2). + */ +static TAMP_NOINLINE void write_rle_token(TampCompressor* compressor, uint8_t count) { + const uint16_t window_mask = (1 << compressor->conf.window) - 1; + uint8_t symbol = get_last_window_byte(compressor); + + // Write RLE symbol (12) with literal flag + // Note: symbols 12 and 13 are at indices 12 and 13 in huffman table (not offset by min_pattern_size) + write_to_bit_buffer(compressor, huffman_codes[TAMP_RLE_SYMBOL], huffman_bits[TAMP_RLE_SYMBOL]); + // Write extended huffman for count-2 + write_extended_huffman(compressor, count - 2, TAMP_LEADING_RLE_BITS); + + // Write up to TAMP_RLE_MAX_WINDOW bytes to window (or until buffer end, no wrap) + uint16_t remaining = WINDOW_SIZE - compressor->window_pos; + uint8_t window_write = MIN(MIN(count, TAMP_RLE_MAX_WINDOW), remaining); + for (uint8_t i = 0; i < window_write; i++) { + compressor->window[compressor->window_pos] = symbol; + compressor->window_pos = (compressor->window_pos + 1) & window_mask; + } +} + +/** + * @brief Write extended match token to bit buffer and update window. + * + * Token format: symbol (7 bits) + extended_huffman (up to 11 bits) + window_pos (up to 15 bits) + * Total: up to 33 bits. We flush after symbol+huffman (18 bits max) to ensure window_pos fits. + * + * @param[in,out] compressor Compressor state. + * @param[in,out] output Output buffer pointer (updated on return). + * @param[in,out] output_size Available space (updated on return). + * @param[in,out] output_written_size Bytes written (accumulated). + * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small. 
+ */ +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize("-fno-reorder-blocks") +#endif +static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compressor, unsigned char** output, + size_t* output_size, size_t* output_written_size) { + // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer) + if (TAMP_UNLIKELY(*output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL; + + const uint16_t window_mask = (1 << compressor->conf.window) - 1; + const uint8_t count = compressor->extended_match_count; + const uint16_t position = compressor->extended_match_position; tamp_res res; - const uint16_t window_mask = (1 << compressor->conf_window) - 1; + + // Write symbol (7 bits) + extended huffman (up to 11 bits) = 18 bits max + // With ≤7 bits already in buffer, total ≤25 bits - fits in 32-bit buffer + write_to_bit_buffer(compressor, huffman_codes[TAMP_EXTENDED_MATCH_SYMBOL], + huffman_bits[TAMP_EXTENDED_MATCH_SYMBOL]); + write_extended_huffman(compressor, count - compressor->min_pattern_size - 11 - 1, TAMP_LEADING_EXTENDED_MATCH_BITS); + + // Flush to make room for window position (up to 15 bits) + res = partial_flush(compressor, output, output_size, output_written_size); + if (TAMP_UNLIKELY(res != TAMP_OK)) return res; + + // Write window position - with ≤7 bits remaining, up to 22 bits total - fits + write_to_bit_buffer(compressor, position, compressor->conf.window); + + // Final flush + res = partial_flush(compressor, output, output_size, output_written_size); + if (TAMP_UNLIKELY(res != TAMP_OK)) return res; + + // Write to window (up to end of buffer, no wrap) + uint16_t remaining = WINDOW_SIZE - compressor->window_pos; + uint8_t window_write = MIN(count, remaining); + tamp_window_copy(compressor->window, &compressor->window_pos, position, window_write, window_mask); + + compressor->extended_match_count = 0; // Position reset not needed - only read when count > 0 + + return TAMP_OK; +} 
+#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC pop_options +#endif + +/** + * @brief Handle all extended-specific logic in poll (match continuation + RLE). + * + * NOINLINE + Os: Extended paths are rarely executed. Outlining from poll saves + * significant code size on register-constrained Cortex-M0+ where the compiler + * otherwise spills heavily to stack (~48 bytes saved on armv6m). + * + * @return TAMP_OK if fully handled (caller should return TAMP_OK), + * TAMP_POLL_CONTINUE if caller should proceed to normal pattern matching, + * other tamp_res on error. + */ +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE tamp_res poll_extended_handling(TampCompressor* compressor, + unsigned char** output, size_t* output_size, + size_t* output_written_size) { + // Handle extended match continuation + if (compressor->extended_match_count) { + const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA; + + while (compressor->input_size > 0) { + const uint16_t current_pos = compressor->extended_match_position; + const uint8_t current_count = compressor->extended_match_count; + + if (current_pos + current_count >= WINDOW_SIZE || current_count >= max_ext_match) { + return write_extended_match_token(compressor, output, output_size, output_written_size); + } + + uint16_t new_pos; + uint8_t new_count; + find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count); + + if (new_count > current_count) { + uint8_t extra_bytes = new_count - current_count; + compressor->extended_match_position = new_pos; + compressor->extended_match_count = new_count; + compressor->input_pos = input_add(extra_bytes); + compressor->input_size -= extra_bytes; + continue; + } + + return write_extended_match_token(compressor, output, output_size, output_written_size); + } + return TAMP_OK; + } + + // Handle RLE accumulation + uint8_t last_byte = get_last_window_byte(compressor); + + uint8_t rle_available = 0; + while (rle_available < compressor->input_size && 
compressor->rle_count + rle_available < RLE_MAX_COUNT && + compressor->input[input_add(rle_available)] == last_byte) { + rle_available++; + } + + uint8_t total_rle = compressor->rle_count + rle_available; + bool rle_ended = (rle_available < compressor->input_size) || (total_rle >= RLE_MAX_COUNT); + + if (!rle_ended && total_rle > 0) { + compressor->rle_count = total_rle; + compressor->input_pos = input_add(rle_available); + compressor->input_size -= rle_available; + return TAMP_OK; + } + + if (total_rle >= 2) { + if (total_rle == rle_available && total_rle <= 6) { + uint16_t pattern_index; + uint8_t pattern_size; + find_best_match(compressor, &pattern_index, &pattern_size); + + if (pattern_size > total_rle) { + compressor->rle_count = 0; + return TAMP_POLL_CONTINUE; // Proceed to pattern matching + } + } + + compressor->input_pos = input_add(rle_available); + compressor->input_size -= rle_available; + write_rle_token(compressor, total_rle); + compressor->rle_count = 0; + return TAMP_OK; + } + + if (total_rle == 1) compressor->rle_count = 0; + return TAMP_POLL_CONTINUE; // Proceed to pattern matching +} +#endif // TAMP_EXTENDED_COMPRESS + +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize("-fno-schedule-insns2") +#endif +TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned char* output, size_t output_size, + size_t* output_written_size) { + tamp_res res; + // Cache bitfield values for faster access in hot path + const uint8_t conf_window = compressor->conf.window; + const uint8_t conf_literal = compressor->conf.literal; + const uint16_t window_mask = (1 << conf_window) - 1; size_t output_written_size_proxy; if (!output_written_size) output_written_size = &output_written_size_proxy; @@ -188,23 +475,26 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned if (TAMP_UNLIKELY(compressor->input_size == 0)) return TAMP_OK; - { - // Make sure there's enough room in the bit buffer. 
- size_t flush_bytes_written; - res = partial_flush(compressor, output, output_size, &flush_bytes_written); - (*output_written_size) += flush_bytes_written; - if (TAMP_UNLIKELY(res != TAMP_OK)) return res; - output_size -= flush_bytes_written; - output += flush_bytes_written; // cppcheck-suppress unreadVariable - } + // Make sure there's enough room in the bit buffer. + res = partial_flush(compressor, &output, &output_size, output_written_size); + if (TAMP_UNLIKELY(res != TAMP_OK)) return res; if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL; uint8_t match_size = 0; uint16_t match_index = 0; +#if TAMP_EXTENDED_COMPRESS + if (TAMP_UNLIKELY(compressor->conf.extended)) { + // Handle extended match continuation + RLE (outlined for code size) + res = poll_extended_handling(compressor, &output, &output_size, output_written_size); + if (res != TAMP_POLL_CONTINUE) return res; + // TAMP_POLL_CONTINUE: proceed to pattern matching below + } +#endif // TAMP_EXTENDED_COMPRESS + #if TAMP_LAZY_MATCHING - if (compressor->conf_lazy_matching) { + if (compressor->conf.lazy_matching) { // Check if we have a cached match from lazy matching if (TAMP_UNLIKELY(compressor->cached_match_index >= 0)) { match_index = compressor->cached_match_index; @@ -213,15 +503,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned } else { find_best_match(compressor, &match_index, &match_size); } - } else { - find_best_match(compressor, &match_index, &match_size); - } -#else - find_best_match(compressor, &match_index, &match_size); -#endif -#if TAMP_LAZY_MATCHING - if (compressor->conf_lazy_matching) { // Lazy matching: if we have a good match, check if position i+1 has a better match if (match_size >= compressor->min_pattern_size && match_size <= 8 && compressor->input_size > match_size + 2) { // Temporarily advance input position to check next position @@ -240,54 +522,51 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned // 
literal and cache the next match if (next_match_size > match_size && validate_no_match_overlap(compressor->window_pos, next_match_index, next_match_size)) { - // Write LITERAL at current position - match_size = 1; - unsigned char c = read_input(0); - if (TAMP_UNLIKELY(c >> compressor->conf_literal)) { - return TAMP_EXCESS_BITS; - } - write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1); + // Force literal at current position, cache next match + compressor->cached_match_index = next_match_index; + compressor->cached_match_size = next_match_size; + match_size = 0; // Will trigger literal write below } else { - // Use current match, clear cache compressor->cached_match_index = -1; - uint8_t huffman_index = match_size - compressor->min_pattern_size; - write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]); - write_to_bit_buffer(compressor, match_index, compressor->conf_window); + // Note: No V2 extended match check here - we're in the match_size <= 8 branch, + // so extended matches (which require match_size > min_pattern_size + 11) are impossible. 
} - } else if (TAMP_UNLIKELY(match_size < compressor->min_pattern_size)) { - // Write LITERAL - compressor->cached_match_index = -1; // Clear cache - match_size = 1; - unsigned char c = read_input(0); - if (TAMP_UNLIKELY(c >> compressor->conf_literal)) { - return TAMP_EXCESS_BITS; - } - write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1); } else { - // Write TOKEN compressor->cached_match_index = -1; // Clear cache - uint8_t huffman_index = match_size - compressor->min_pattern_size; - write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]); - write_to_bit_buffer(compressor, match_index, compressor->conf_window); } - } else + } else { + find_best_match(compressor, &match_index, &match_size); + } +#else + find_best_match(compressor, &match_index, &match_size); #endif - { - // Non-lazy matching path - if (TAMP_UNLIKELY(match_size < compressor->min_pattern_size)) { - // Write LITERAL - match_size = 1; - unsigned char c = read_input(0); - if (TAMP_UNLIKELY(c >> compressor->conf_literal)) { - return TAMP_EXCESS_BITS; - } - write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1); - } else { - // Write TOKEN - uint8_t huffman_index = match_size - compressor->min_pattern_size; - write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]); - write_to_bit_buffer(compressor, match_index, compressor->conf_window); + + // Shared token/literal writing logic + if (TAMP_UNLIKELY(match_size < compressor->min_pattern_size)) { + // Write LITERAL + match_size = 1; + unsigned char c = read_input(0); + if (TAMP_UNLIKELY(c >> conf_literal)) { + return TAMP_EXCESS_BITS; } + write_to_bit_buffer(compressor, (1 << conf_literal) | c, conf_literal + 1); + } else { +#if TAMP_EXTENDED_COMPRESS + // Extended: Start extended match continuation + if (compressor->conf.extended && match_size > compressor->min_pattern_size + 11) { + compressor->extended_match_count = match_size; + 
compressor->extended_match_position = match_index; + // Consume matched bytes from input + compressor->input_pos = input_add(match_size); + compressor->input_size -= match_size; + // Return - continuation code at start of poll will try to extend or emit + return TAMP_OK; + } +#endif // TAMP_EXTENDED_COMPRESS + // Write TOKEN (huffman code + window position) + uint8_t huffman_index = match_size - compressor->min_pattern_size; + write_to_bit_buffer(compressor, (huffman_codes[huffman_index] << conf_window) | match_index, + huffman_bits[huffman_index] + conf_window); } // Populate Window for (uint8_t i = 0; i < match_size; i++) { @@ -299,9 +578,12 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned return TAMP_OK; } +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC pop_options +#endif -void tamp_compressor_sink(TampCompressor *compressor, const unsigned char *input, size_t input_size, - size_t *consumed_size) { +void tamp_compressor_sink(TampCompressor* compressor, const unsigned char* input, size_t input_size, + size_t* consumed_size) { size_t consumed_size_proxy; if (TAMP_LIKELY(consumed_size)) *consumed_size = 0; @@ -316,9 +598,11 @@ void tamp_compressor_sink(TampCompressor *compressor, const unsigned char *input } } -tamp_res tamp_compressor_compress_cb(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size, const unsigned char *input, size_t input_size, - size_t *input_consumed_size, tamp_callback_t callback, void *user_data) { +TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_compress_cb(TampCompressor* compressor, unsigned char* output, + size_t output_size, size_t* output_written_size, + const unsigned char* input, size_t input_size, + size_t* input_consumed_size, tamp_callback_t callback, + void* user_data) { tamp_res res; size_t input_consumed_size_proxy = 0, output_written_size_proxy = 0; size_t total_input_size = input_size; @@ -357,8 +641,12 @@ tamp_res 
tamp_compressor_compress_cb(TampCompressor *compressor, unsigned char * return TAMP_OK; } -tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size, bool write_token) { +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize("-fno-tree-pre") +#endif +tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output, size_t output_size, + size_t* output_written_size, bool write_token) { tamp_res res; size_t chunk_output_written_size; size_t output_written_size_proxy; @@ -366,51 +654,81 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output if (!output_written_size) output_written_size = &output_written_size_proxy; *output_written_size = 0; - while (compressor->input_size) { - // Compress the remainder of the input buffer. +flush_check: + // Flush pending bits before checking for more work + chunk_output_written_size = 0; + res = partial_flush(compressor, &output, &output_size, output_written_size); + if (TAMP_UNLIKELY(res != TAMP_OK)) return res; + + if (TAMP_LIKELY(compressor->input_size)) { res = tamp_compressor_poll(compressor, output, output_size, &chunk_output_written_size); - (*output_written_size) += chunk_output_written_size; - if (TAMP_UNLIKELY(res != TAMP_OK)) return res; - output_size -= chunk_output_written_size; - output += chunk_output_written_size; } - - // Perform partial flush to see if we need a FLUSH token (check if output buffer in not empty), - // and to subsequently make room for the FLUSH token. 
- res = partial_flush(compressor, output, output_size, &chunk_output_written_size); - output_size -= chunk_output_written_size; +#if TAMP_EXTENDED_COMPRESS + else if (compressor->conf.extended && compressor->rle_count >= 1) { + if (compressor->rle_count == 1) { + // Single byte - write as literal (can't use RLE token for count < 2) + uint8_t literal = get_last_window_byte(compressor); + write_to_bit_buffer(compressor, IS_LITERAL_FLAG | literal, compressor->conf.literal + 1); + + // Write to window + const uint16_t window_mask = (1 << compressor->conf.window) - 1; + compressor->window[compressor->window_pos] = literal; + compressor->window_pos = (compressor->window_pos + 1) & window_mask; + } else { + // count >= 2: write as RLE token + write_rle_token(compressor, compressor->rle_count); + } + compressor->rle_count = 0; + } else if (compressor->conf.extended && compressor->extended_match_count) { + res = write_extended_match_token(compressor, &output, &output_size, output_written_size); + } +#endif // TAMP_EXTENDED_COMPRESS + else { + goto flush_done; + } (*output_written_size) += chunk_output_written_size; - output += chunk_output_written_size; if (TAMP_UNLIKELY(res != TAMP_OK)) return res; - - // Check if there's enough output buffer space - if (compressor->bit_buffer_pos) { - if (output_size == 0) { - return TAMP_OUTPUT_FULL; - } - if (write_token) { - if (output_size < 2) return TAMP_OUTPUT_FULL; - write_to_bit_buffer(compressor, FLUSH_CODE, 9); - } + output_size -= chunk_output_written_size; + output += chunk_output_written_size; + goto flush_check; + +flush_done: + // At this point, up to 7 bits may remain in the compressor->bit_buffer + // The output buffer may have 0 bytes remaining. + if (write_token && compressor->bit_buffer_pos) { + // We don't want to write the FLUSH token to the bit_buffer unless + // we are confident that it'll wind up in the output buffer + // in THIS function call. 
+ // Otherwise, if we wind up with a TAMP_OUTPUT_FULL result, we could + // end up accidentally writing multiple FLUSH tokens. + if (TAMP_UNLIKELY(output_size < 2)) return TAMP_OUTPUT_FULL; + write_to_bit_buffer(compressor, FLUSH_CODE, 9); } - // Flush the remainder of the output bit-buffer - while (compressor->bit_buffer_pos) { + // At this point, up to 16 bits may remain in the compressor->bit_buffer + // The output buffer may have 0 bytes remaining. + + // Flush whole bytes, then write trailing partial byte + res = partial_flush(compressor, &output, &output_size, output_written_size); + if (compressor->bit_buffer_pos) { + if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL; *output = compressor->bit_buffer >> 24; - output++; - compressor->bit_buffer <<= 8; - compressor->bit_buffer_pos -= MIN(compressor->bit_buffer_pos, 8); - output_size--; (*output_written_size)++; + compressor->bit_buffer_pos = 0; + compressor->bit_buffer = 0; } - return TAMP_OK; + return res; } +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC pop_options +#endif -tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size, const unsigned char *input, - size_t input_size, size_t *input_consumed_size, bool write_token, - tamp_callback_t callback, void *user_data) { +TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor* compressor, unsigned char* output, + size_t output_size, size_t* output_written_size, + const unsigned char* input, size_t input_size, + size_t* input_consumed_size, bool write_token, + tamp_callback_t callback, void* user_data) { tamp_res res; size_t flush_size; size_t output_written_size_proxy; @@ -433,9 +751,10 @@ tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor *compressor, unsig #if TAMP_STREAM -tamp_res tamp_compress_stream(TampCompressor *compressor, tamp_read_t read_cb, void *read_handle, tamp_write_t write_cb, - void *write_handle, size_t 
*input_consumed_size, size_t *output_written_size, - tamp_callback_t callback, void *user_data) { +TAMP_OPTIMIZE_SIZE tamp_res tamp_compress_stream(TampCompressor* compressor, tamp_read_t read_cb, void* read_handle, + tamp_write_t write_cb, void* write_handle, size_t* input_consumed_size, + size_t* output_written_size, tamp_callback_t callback, + void* user_data) { size_t input_consumed_size_proxy, output_written_size_proxy; if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy; if (!output_written_size) output_written_size = &output_written_size_proxy; diff --git a/tamp/_c_src/tamp/compressor.h b/tamp/_c_src/tamp/compressor.h index cd6ca1f..577eac5 100644 --- a/tamp/_c_src/tamp/compressor.h +++ b/tamp/_c_src/tamp/compressor.h @@ -7,58 +7,52 @@ extern "C" { #include "common.h" -/* Externally, do not directly edit ANY of these attributes */ +/* Externally, do not directly edit ANY of these attributes. + * Fields are ordered by access frequency for cache efficiency. + */ typedef struct TampCompressor { - /* nicely aligned attributes */ - #if TAMP_ESP32 // Avoid bitfields for speed. 
- uint32_t window_pos; - uint32_t bit_buffer_pos; + /* HOT: accessed every iteration of the compression loop */ + unsigned char *window; // Pointer to window buffer + uint32_t bit_buffer; // Bit buffer for output (32 bits) + uint32_t window_pos; // Current position in window (15 bits used) + uint32_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits used) + uint32_t input_size; // Bytes in input buffer (5 bits used; 0-16) + uint32_t input_pos; // Current position in input buffer (4 bits used; 0-15) + unsigned char input[16]; // Input ring buffer - uint32_t input_size; - uint32_t input_pos; + /* WARM: read frequently, often cached in locals */ + uint8_t min_pattern_size; // Minimum pattern size (2 bits used; 2 or 3) + TampConf conf; +#else // Use bitfields for reduced memory-usage + /* HOT: accessed every iteration of the compression loop */ + unsigned char *window; // Pointer to window buffer + uint32_t bit_buffer; // Bit buffer for output (32 bits) + uint16_t window_pos; // Current position in window (15 bits used) + uint8_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits used) + uint8_t input_size; // Bytes in input buffer (5 bits used; 0-16) + uint8_t input_pos; // Current position in input buffer (4 bits used; 0-15) + unsigned char input[16]; // Input ring buffer - /* Conf attributes */ - uint8_t conf_window; // number of window bits - uint8_t conf_literal; // number of literal bits - uint8_t conf_use_custom_dictionary; // Use a custom initialized dictionary. 
-#if TAMP_LAZY_MATCHING - uint8_t conf_lazy_matching; // Use lazy matching for better compression -#endif - uint8_t min_pattern_size; + /* WARM: read frequently, often cached in locals */ + uint8_t min_pattern_size; // Minimum pattern size (2 or 3) + TampConf conf; +#endif // TAMP_ESP32 + /* Fields interleaved to avoid internal padding when both LAZY_MATCHING and EXTENDED_COMPRESS enabled */ #if TAMP_LAZY_MATCHING - /* Lazy matching cache */ - int16_t cached_match_index; - uint8_t cached_match_size; + int16_t cached_match_index; // Lazy matching cache #endif -#else // Use bitfields for reduced memory-usage - /* Conf attributes */ - uint32_t conf_window : 4; // number of window bits - uint32_t conf_literal : 4; // number of literal bits - uint32_t conf_use_custom_dictionary : 1; // Use a custom initialized dictionary. -#if TAMP_LAZY_MATCHING - uint32_t conf_lazy_matching : 1; // Use lazy matching for better compression +#if TAMP_EXTENDED_COMPRESS + uint16_t extended_match_position; // Window position for extended match #endif - - /* Other small attributes */ - uint32_t window_pos : 15; - uint32_t bit_buffer_pos : 6; - uint32_t min_pattern_size : 2; - - uint32_t input_size : 5; - uint32_t input_pos : 4; - #if TAMP_LAZY_MATCHING - /* Lazy matching cache */ - int16_t cached_match_index; uint8_t cached_match_size; #endif -#endif // TAMP_ESP32 - unsigned char input[16] /* __attribute__ ((aligned (16)))*/; - uint32_t bit_buffer; - - unsigned char *window; +#if TAMP_EXTENDED_COMPRESS + uint8_t rle_count; // Current RLE run length (max 225) + uint8_t extended_match_count; // Current extended match size (max ~126) +#endif } TampCompressor; /** diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c index a7c0baa..92af94c 100644 --- a/tamp/_c_src/tamp/decompressor.c +++ b/tamp/_c_src/tamp/decompressor.c @@ -7,15 +7,24 @@ #define FLUSH 15 +#if TAMP_EXTENDED_DECOMPRESS +/* Token state for extended decode suspend/resume (2 bits). 
+ * TOKEN_RLE and TOKEN_EXT_MATCH_FRESH are arranged so that: + * token_state = match_size - (TAMP_RLE_SYMBOL - 1) + * maps TAMP_RLE_SYMBOL (12) -> 1 and TAMP_EXTENDED_MATCH_SYMBOL (13) -> 2. + */ +#define TOKEN_NONE 0 +#define TOKEN_RLE 1 +#define TOKEN_EXT_MATCH_FRESH 2 +#define TOKEN_EXT_MATCH 3 /* Resume: have match_size, need window_offset */ +#endif + /** - * This array was generated with tools/huffman_jump_table.py + * Huffman lookup table indexed by 7 bits (after first "1" bit consumed). + * Upper 4 bits = additional bits to consume, lower 4 bits = symbol (15 = FLUSH). * - * The idea is that the resulting code is smaller/faster as a lookup table than a bunch of if/else - * statements. - * - * Of each element: - * * The upper 4 bits express the number of bits to decode. - * * The lower 4 bits express the decoded value, with FLUSH being represented as 0b1111 + * Note: A 64-byte table with special-cased symbol 1 was tried but was ~10% slower + * and only saved 8 bytes in final firmware due to added branch logic. */ static const uint8_t HUFFMAN_TABLE[128] = { 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 85, 85, 85, 85, 122, 123, 104, 104, 86, 86, @@ -25,72 +34,235 @@ static const uint8_t HUFFMAN_TABLE[128] = { 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; /** - * @brief Decode a huffman match-size symbol from the decompressor's bit_buffer. - * - * Internally updates bit_buffer and bit_buffer_pos. + * @brief Decode huffman symbol + optional trailing bits from bit buffer. * - * bit_buffer MUST have at least 8 bits prior to calling. + * Modifies bit_buffer and bit_buffer_pos in place. Caller is responsible + * for committing to decompressor state if needed. 
* - * @returns Decoded match_size + * @param bit_buffer Pointer to bit buffer (modified in place) + * @param bit_buffer_pos Pointer to bit position (modified in place) + * @param trailing_bits Number of trailing bits to read (0, 3, or 4) + * @param result Output: (huffman << trailing_bits) + trailing (max 223 for trailing_bits=4) + * @return TAMP_OK on success, TAMP_INPUT_EXHAUSTED if more bits needed */ -static inline int8_t huffman_decode(uint32_t *bit_buffer, uint8_t *bit_buffer_pos) { - uint8_t code; - uint8_t bit_len; +static tamp_res decode_huffman(uint32_t* bit_buffer, uint8_t* bit_buffer_pos, uint8_t trailing_bits, uint8_t* result) { + /* Need at least 1 bit for huffman, plus trailing bits */ + if (TAMP_UNLIKELY(*bit_buffer_pos < 1 + trailing_bits)) return TAMP_INPUT_EXHAUSTED; + /* Decode huffman symbol */ + int8_t huffman_value; (*bit_buffer_pos)--; - code = *bit_buffer >> 31; - *bit_buffer <<= 1; - if (TAMP_LIKELY(code == 0)) return 0; + if (TAMP_LIKELY((*bit_buffer >> 31) == 0)) { + /* Symbol 0: code "0" */ + *bit_buffer <<= 1; + huffman_value = 0; + } else { + /* All other symbols: use 128-entry table indexed by next 7 bits */ + *bit_buffer <<= 1; + uint8_t code = HUFFMAN_TABLE[*bit_buffer >> (32 - 7)]; + uint8_t bit_len = code >> 4; + if (TAMP_UNLIKELY(*bit_buffer_pos < bit_len + trailing_bits)) return TAMP_INPUT_EXHAUSTED; + *bit_buffer <<= bit_len; + *bit_buffer_pos -= bit_len; + huffman_value = code & 0xF; + } - code = *bit_buffer >> (32 - 7); - code = HUFFMAN_TABLE[code]; - bit_len = code >> 4; - *bit_buffer <<= bit_len; - (*bit_buffer_pos) -= bit_len; + /* Read trailing bits (skip if trailing_bits==0 to avoid undefined shift) */ + if (trailing_bits) { + uint8_t trailing = *bit_buffer >> (32 - trailing_bits); + *bit_buffer <<= trailing_bits; + *bit_buffer_pos -= trailing_bits; + *result = (huffman_value << trailing_bits) + trailing; + } else { + *result = huffman_value; + } - return code & 0xF; + return TAMP_OK; } +#if TAMP_EXTENDED_DECOMPRESS + 
/** - * @brief Copy pattern from window to window, updating window_pos. + * @brief Decode RLE token and write repeated bytes to output. * - * Handles potential overlap between source and destination regions by - * using a temporary buffer when necessary. Overlap occurs when the - * destination would "catch up" to the source during copying. + * RLE format: huffman(count_high) + trailing_bits(count_low) + * rle_count = (count_high << 4) + count_low + 2 */ -static inline void window_copy(unsigned char *window, uint16_t *window_pos, uint16_t window_offset, uint8_t match_size, - uint16_t window_mask) { - const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask; - const bool overlap = (src_to_dst < match_size) && (src_to_dst > 0); - - if (TAMP_UNLIKELY(overlap)) { - uint8_t tmp_buf[16]; - for (uint8_t i = 0; i < match_size; i++) { - tmp_buf[i] = window[window_offset + i]; - } - for (uint8_t i = 0; i < match_size; i++) { - window[*window_pos] = tmp_buf[i]; - *window_pos = (*window_pos + 1) & window_mask; +static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const unsigned char* output_end, + size_t* output_written_size) { + uint8_t rle_count; /* max 225: (13 << 4) + 15 + 2 */ + uint8_t skip = d->skip_bytes; + + if (skip > 0) { + /* Resume from output-full: rle_count saved in pending_window_offset */ + rle_count = d->pending_window_offset; + } else { + /* Fresh decode */ + uint32_t bit_buffer = d->bit_buffer; + uint8_t bit_buffer_pos = d->bit_buffer_pos; + uint8_t raw; + tamp_res res = decode_huffman(&bit_buffer, &bit_buffer_pos, TAMP_LEADING_RLE_BITS, &raw); + if (res != TAMP_OK) return res; + d->bit_buffer = bit_buffer; + d->bit_buffer_pos = bit_buffer_pos; + rle_count = raw + 2; + } + + /* Get the byte to repeat (last written byte) */ + uint16_t prev_pos = (d->window_pos - 1) & ((1u << d->conf_window) - 1); + uint8_t symbol = d->window[prev_pos]; + + /* Calculate how many to write this call */ + uint8_t remaining_count = rle_count - 
skip; + size_t output_space = output_end - *output; + uint8_t to_write; + + if (TAMP_UNLIKELY(remaining_count > output_space)) { + /* Partial write - save state for resume */ + to_write = output_space; + d->skip_bytes = skip + to_write; + d->token_state = TOKEN_RLE; + d->pending_window_offset = rle_count; + } else { + /* Complete write */ + to_write = remaining_count; + d->skip_bytes = 0; + d->token_state = TOKEN_NONE; + } + + /* Write repeated bytes to output */ + TAMP_MEMSET(*output, symbol, to_write); + *output += to_write; + *output_written_size += to_write; + + /* Update window only on first chunk (skip==0). + * Write up to TAMP_RLE_MAX_WINDOW or until end of buffer (no wrap). */ + if (skip == 0) { + const uint16_t window_size = 1u << d->conf_window; + uint16_t remaining = window_size - d->window_pos; + uint8_t window_write = MIN(MIN(rle_count, TAMP_RLE_MAX_WINDOW), remaining); /* max 8 */ + for (uint8_t i = 0; i < window_write; i++) { + d->window[d->window_pos++] = symbol; } + d->window_pos &= (window_size - 1); + } + + return (d->token_state == TOKEN_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL; +} + +/** + * @brief Decode extended match token and copy from window to output. 
+ * + * NEW FORMAT: huffman(size_high) + trailing_bits(size_low) + window_offset + * match_size = (size_high << 3) + size_low + min_pattern_size + 12 + * + * State machine: + * - Fresh: decode huffman+trailing, then window_offset + * - TOKEN_EXT_MATCH: have match_size, need window_offset + * - Output-full resume (skip > 0): have both match_size and window_offset + */ +static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** output, const unsigned char* output_end, + size_t* output_written_size) { + const uint8_t conf_window = d->conf_window; + uint16_t window_offset; + uint8_t match_size; /* max 126: (13<<3)+7 + 3 + 12 */ + uint8_t skip = d->skip_bytes; + + if (skip > 0) { + /* Resume from output-full: both values saved */ + window_offset = d->pending_window_offset; + match_size = d->pending_match_size; + } else if (d->token_state == TOKEN_EXT_MATCH) { + /* Resume: have match_size, need window_offset */ + match_size = d->pending_match_size; + + if (TAMP_UNLIKELY(d->bit_buffer_pos < conf_window)) return TAMP_INPUT_EXHAUSTED; + window_offset = d->bit_buffer >> (32 - conf_window); + d->bit_buffer <<= conf_window; + d->bit_buffer_pos -= conf_window; } else { - for (uint8_t i = 0; i < match_size; i++) { - window[*window_pos] = window[window_offset + i]; - *window_pos = (*window_pos + 1) & window_mask; + /* Fresh decode: huffman+trailing first, then window_offset */ + uint32_t bit_buffer = d->bit_buffer; + uint8_t bit_buffer_pos = d->bit_buffer_pos; + uint8_t raw; + tamp_res res = decode_huffman(&bit_buffer, &bit_buffer_pos, TAMP_LEADING_EXTENDED_MATCH_BITS, &raw); + if (res != TAMP_OK) return res; + match_size = raw + d->min_pattern_size + 12; + + /* Now decode window_offset */ + if (TAMP_UNLIKELY(bit_buffer_pos < conf_window)) { + /* Save match_size and return */ + d->bit_buffer = bit_buffer; + d->bit_buffer_pos = bit_buffer_pos; + d->token_state = TOKEN_EXT_MATCH; + d->pending_match_size = match_size; + return TAMP_INPUT_EXHAUSTED; } + window_offset 
= bit_buffer >> (32 - conf_window); + bit_buffer <<= conf_window; + bit_buffer_pos -= conf_window; + d->bit_buffer = bit_buffer; + d->bit_buffer_pos = bit_buffer_pos; } + + /* Security check: validate window bounds */ + const uint32_t window_size = (1u << conf_window); + if (TAMP_UNLIKELY((uint32_t)window_offset >= window_size || + (uint32_t)window_offset + (uint32_t)match_size > window_size)) { + return TAMP_OOB; + } + + /* Calculate how many to write this call */ + uint8_t remaining_count = match_size - skip; + size_t output_space = output_end - *output; + uint8_t to_write; + + if (TAMP_UNLIKELY(remaining_count > output_space)) { + /* Partial write - save state for resume */ + to_write = output_space; + d->skip_bytes = skip + output_space; + d->token_state = TOKEN_EXT_MATCH; /* Reuse for output-full */ + d->pending_window_offset = window_offset; + d->pending_match_size = match_size; + } else { + /* Complete write */ + to_write = remaining_count; + d->skip_bytes = 0; + d->token_state = TOKEN_NONE; + } + + /* Copy from window to output */ + uint16_t src_offset = window_offset + skip; + for (uint8_t i = 0; i < to_write; i++) { + *(*output)++ = d->window[src_offset + i]; + } + *output_written_size += to_write; + + /* Update window only on complete decode. + * Write up to end of buffer (no wrap), matching RLE behavior. */ + if (d->token_state == TOKEN_NONE) { + uint16_t wp = d->window_pos; + uint16_t remaining = window_size - wp; + uint8_t window_write = (match_size < remaining) ? match_size : remaining; + tamp_window_copy(d->window, &wp, window_offset, window_write, window_size - 1); + d->window_pos = wp; + } + + return (d->token_state == TOKEN_NONE) ? 
TAMP_OK : TAMP_OUTPUT_FULL; } +#endif /* TAMP_EXTENDED_DECOMPRESS */ -tamp_res tamp_decompressor_read_header(TampConf *conf, const unsigned char *input, size_t input_size, - size_t *input_consumed_size) { +tamp_res tamp_decompressor_read_header(TampConf* conf, const unsigned char* input, size_t input_size, + size_t* input_consumed_size) { if (input_consumed_size) (*input_consumed_size) = 0; if (input_size == 0) return TAMP_INPUT_EXHAUSTED; - if (input[0] & 0x2) return TAMP_INVALID_CONF; // Reserved if (input[0] & 0x1) return TAMP_INVALID_CONF; // Currently only a single header byte is supported. if (input_consumed_size) (*input_consumed_size)++; conf->window = ((input[0] >> 5) & 0x7) + 8; conf->literal = ((input[0] >> 3) & 0x3) + 5; conf->use_custom_dictionary = ((input[0] >> 2) & 0x1); + conf->extended = ((input[0] >> 1) & 0x1); return TAMP_OK; } @@ -100,8 +272,10 @@ tamp_res tamp_decompressor_read_header(TampConf *conf, const unsigned char *inpu * * window * * window_bits_max */ -static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor *decompressor, uint8_t conf_window, - uint8_t conf_literal, uint8_t conf_use_custom_dictionary) { +static TAMP_OPTIMIZE_SIZE tamp_res tamp_decompressor_populate_from_conf(TampDecompressor* decompressor, + uint8_t conf_window, uint8_t conf_literal, + uint8_t conf_use_custom_dictionary, + uint8_t conf_extended) { if (conf_window < 8 || conf_window > 15) return TAMP_INVALID_CONF; if (conf_literal < 5 || conf_literal > 8) return TAMP_INVALID_CONF; if (conf_window > decompressor->window_bits_max) return TAMP_INVALID_CONF; @@ -111,37 +285,62 @@ static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor *decompres decompressor->conf_literal = conf_literal; decompressor->min_pattern_size = tamp_compute_min_pattern_size(conf_window, conf_literal); decompressor->configured = true; + decompressor->conf_extended = conf_extended; +#if !TAMP_EXTENDED_DECOMPRESS + if (conf_extended) return TAMP_INVALID_CONF; // Extended 
stream but extended support not compiled in +#endif return TAMP_OK; } -tamp_res tamp_decompressor_init(TampDecompressor *decompressor, const TampConf *conf, unsigned char *window, +tamp_res tamp_decompressor_init(TampDecompressor* decompressor, const TampConf* conf, unsigned char* window, uint8_t window_bits) { tamp_res res = TAMP_OK; // Validate window_bits parameter if (window_bits < 8 || window_bits > 15) return TAMP_INVALID_CONF; - for (uint8_t i = 0; i < sizeof(TampDecompressor); i++) // Zero-out the struct - ((unsigned char *)decompressor)[i] = 0; + TAMP_MEMSET(decompressor, 0, sizeof(TampDecompressor)); decompressor->window = window; decompressor->window_bits_max = window_bits; if (conf) { res = tamp_decompressor_populate_from_conf(decompressor, conf->window, conf->literal, - conf->use_custom_dictionary); + conf->use_custom_dictionary, conf->extended); } return res; } -tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigned char *output, size_t output_size, - size_t *output_written_size, const unsigned char *input, size_t input_size, - size_t *input_consumed_size, tamp_callback_t callback, void *user_data) { +/** + * @brief Refill bit buffer from input stream. + * + * Consumes bytes from input until bit_buffer has at least 25 bits or input is exhausted. + * + * NOTE: NOINLINE saves ~192 bytes on armv6m but causes ~10% decompression + * speed regression. Keep this inlined for performance. 
+ */ +static inline void refill_bit_buffer(TampDecompressor* d, const unsigned char** input, const unsigned char* input_end, + size_t* input_consumed_size) { + while (*input != input_end && d->bit_buffer_pos <= 24) { + d->bit_buffer_pos += 8; + d->bit_buffer |= (uint32_t) * (*input) << (32 - d->bit_buffer_pos); + (*input)++; + (*input_consumed_size)++; + } +} + +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize("-fno-tree-pre") +#endif +tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigned char* output, size_t output_size, + size_t* output_written_size, const unsigned char* input, size_t input_size, + size_t* input_consumed_size, tamp_callback_t callback, void* user_data) { size_t input_consumed_size_proxy; size_t output_written_size_proxy; tamp_res res; - const unsigned char *input_end = input + input_size; - const unsigned char *output_end = output + output_size; + const unsigned char* input_end = input + input_size; + const unsigned char* output_end = output + output_size; if (!output_written_size) output_written_size = &output_written_size_proxy; if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy; @@ -156,7 +355,8 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne res = tamp_decompressor_read_header(&conf, input, input_end - input, &header_consumed_size); if (res != TAMP_OK) return res; - res = tamp_decompressor_populate_from_conf(decompressor, conf.window, conf.literal, conf.use_custom_dictionary); + res = tamp_decompressor_populate_from_conf(decompressor, conf.window, conf.literal, conf.use_custom_dictionary, + conf.extended); if (res != TAMP_OK) return res; input += header_consumed_size; @@ -169,20 +369,42 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne const uint8_t min_pattern_size = decompressor->min_pattern_size; const uint16_t window_mask = (1 << conf_window) - 1; - while (input != input_end || 
decompressor->bit_buffer_pos) { +#if TAMP_EXTENDED_DECOMPRESS + const bool extended_enabled = decompressor->conf_extended; +#endif + + while (input != input_end || decompressor->pos_and_state) { + if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL; + // Populate the bit buffer - while (input != input_end && decompressor->bit_buffer_pos <= 24) { - uint32_t t = *input; - decompressor->bit_buffer_pos += 8; - decompressor->bit_buffer |= t << (32 - decompressor->bit_buffer_pos); - input++; - (*input_consumed_size)++; + refill_bit_buffer(decompressor, &input, input_end, input_consumed_size); + +#if TAMP_EXTENDED_DECOMPRESS + /* Handle extended tokens - either resuming or fresh from match_size detection below. */ + if (TAMP_UNLIKELY(decompressor->token_state)) { + extended_dispatch: + if (decompressor->token_state == TOKEN_RLE) { + res = decode_rle(decompressor, &output, output_end, output_written_size); + } else { + res = decode_extended_match(decompressor, &output, output_end, output_written_size); + } + if (res == TAMP_INPUT_EXHAUSTED) { + uint8_t old_bit_pos = decompressor->bit_buffer_pos; + refill_bit_buffer(decompressor, &input, input_end, input_consumed_size); + /* If we couldn't get more bits and input is exhausted, stop. + * Otherwise the loop would run forever with token_state set. 
*/ + if (decompressor->bit_buffer_pos == old_bit_pos && input == input_end) { + return TAMP_INPUT_EXHAUSTED; + } + continue; + } + if (res != TAMP_OK) return res; + continue; } +#endif // TAMP_EXTENDED_DECOMPRESS if (TAMP_UNLIKELY(decompressor->bit_buffer_pos == 0)) return TAMP_INPUT_EXHAUSTED; - if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL; - // Hint that patterns are more likely than literals if (TAMP_UNLIKELY(decompressor->bit_buffer >> 31)) { // is literal @@ -214,10 +436,10 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne bit_buffer <<= 1; bit_buffer_pos--; - // There must be at least 8 bits, otherwise no possible decoding. - if (TAMP_UNLIKELY(bit_buffer_pos < 8)) return TAMP_INPUT_EXHAUSTED; + uint8_t match_size_u8; + if (decode_huffman(&bit_buffer, &bit_buffer_pos, 0, &match_size_u8) != TAMP_OK) return TAMP_INPUT_EXHAUSTED; + match_size = match_size_u8; - match_size = huffman_decode(&bit_buffer, &bit_buffer_pos); if (TAMP_UNLIKELY(match_size == FLUSH)) { // flush bit_buffer to the nearest byte and skip the remainder of decoding decompressor->bit_buffer = bit_buffer << (bit_buffer_pos & 7); @@ -225,6 +447,18 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne bit_buffer_pos & ~7; // Round bit_buffer_pos down to nearest multiple of 8. continue; } + +#if TAMP_EXTENDED_DECOMPRESS + /* Check for extended symbols (RLE=12, extended match=13). + * Convert match_size to token_state via subtraction (see TOKEN_* defines). 
*/ + if (TAMP_UNLIKELY(extended_enabled && match_size >= TAMP_RLE_SYMBOL)) { + decompressor->bit_buffer = bit_buffer; + decompressor->bit_buffer_pos = bit_buffer_pos; + decompressor->token_state = match_size - (TAMP_RLE_SYMBOL - 1); + goto extended_dispatch; + } +#endif // TAMP_EXTENDED_DECOMPRESS + if (TAMP_UNLIKELY(bit_buffer_pos < conf_window)) { // There are not enough bits to decode window offset return TAMP_INPUT_EXHAUSTED; @@ -268,7 +502,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne if (TAMP_LIKELY(decompressor->skip_bytes == 0)) { uint16_t wp = decompressor->window_pos; - window_copy(decompressor->window, &wp, window_offset, match_size, window_mask); + tamp_window_copy(decompressor->window, &wp, window_offset, match_size, window_mask); decompressor->window_pos = wp; } } @@ -277,12 +511,16 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne } return TAMP_INPUT_EXHAUSTED; } +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC pop_options +#endif #if TAMP_STREAM -tamp_res tamp_decompress_stream(TampDecompressor *decompressor, tamp_read_t read_cb, void *read_handle, - tamp_write_t write_cb, void *write_handle, size_t *input_consumed_size, - size_t *output_written_size, tamp_callback_t callback, void *user_data) { +TAMP_OPTIMIZE_SIZE tamp_res tamp_decompress_stream(TampDecompressor* decompressor, tamp_read_t read_cb, + void* read_handle, tamp_write_t write_cb, void* write_handle, + size_t* input_consumed_size, size_t* output_written_size, + tamp_callback_t callback, void* user_data) { size_t input_consumed_size_proxy, output_written_size_proxy; if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy; if (!output_written_size) output_written_size = &output_written_size_proxy; diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h index 1608a6d..9659817 100644 --- a/tamp/_c_src/tamp/decompressor.h +++ b/tamp/_c_src/tamp/decompressor.h @@ -11,21 +11,40 @@ extern 
"C" { * Fields are ordered by access frequency for cache efficiency. */ typedef struct { - /* HOT: accessed every iteration of the decompression loop. - * Full-width types avoid bitfield access overhead. */ - unsigned char *window; // Pointer to window buffer - uint32_t bit_buffer; // Bit buffer for reading compressed data (32 bits) - uint16_t window_pos; // Current position in window (15 bits) - uint8_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits) + /* HOT: accessed every iteration of the decompression loop. */ + unsigned char *window; // Pointer to window buffer + uint32_t bit_buffer; // Bit buffer for reading compressed data (32 bits) + uint16_t window_pos; // Current position in window (15 bits) + + /* Union allows single zero-check in main loop instead of two separate checks. */ +#if TAMP_EXTENDED_DECOMPRESS + union { + struct { + uint8_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits needed) + uint8_t token_state; // 0=none, 1=RLE, 2=ext match, 3=ext match fresh (2 bits used) + }; + uint16_t pos_and_state; // Combined for fast 16-bit zero-check + }; +#else + union { + uint8_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits needed) + uint8_t pos_and_state; // Alias for consistent access in main loop + }; +#endif +#if TAMP_EXTENDED_DECOMPRESS + uint16_t pending_window_offset; // Saved window_offset for extended match output-full resume + uint16_t pending_match_size; // Saved match_size for extended match resume +#endif /* WARM: read once at start of decompress, cached in locals */ uint8_t conf_window : 4; // Window bits from config uint8_t conf_literal : 4; // Literal bits from config uint8_t min_pattern_size : 2; // Minimum pattern size, 2 or 3 + uint8_t conf_extended : 1; // Extended format enabled (from header) /* COLD: rarely accessed (init or edge cases). * Bitfields save space; add new cold fields here. 
*/ - uint8_t skip_bytes : 4; // For output-buffer-limited resumption + uint8_t skip_bytes; // For output-buffer-limited resumption (v2 needs >4 bits) uint8_t window_bits_max : 4; // Max window bits buffer can hold uint8_t configured : 1; // Whether config has been set } TampDecompressor; diff --git a/tamp/cli/main.py b/tamp/cli/main.py index 27275c2..11f7062 100644 --- a/tamp/cli/main.py +++ b/tamp/cli/main.py @@ -119,6 +119,7 @@ def compress( ), ] = 8, lazy_matching: bool = False, + extended: bool = True, implementation: ImplementationType = None, ): """Compress an input file or stream. @@ -135,6 +136,8 @@ def compress( Number of bits used to represent a literal. lazy_matching: bool Use roughly 50% more cpu to get 0~2% better compression. + extended: bool + Use extended compression format (RLE, extended match encoding). implementation: Optional[Literal["c", "python"]] Explicitly specify which implementation to use (c or python). Defaults to auto-detection. """ @@ -145,6 +148,7 @@ def compress( window=window, literal=literal, lazy_matching=lazy_matching, + extended=extended, ) write(output, output_bytes) diff --git a/tamp/compressor.py b/tamp/compressor.py index 91c5516..f59b2e3 100644 --- a/tamp/compressor.py +++ b/tamp/compressor.py @@ -1,3 +1,11 @@ +"""Pure Python Tamp Compressor Reference Implementation. + +The goal of this module is for clarity and to be able to easily test new ideas. +Do not optimize this file for speed, unless it still maintains clarity. + +Some speed architectural optimizations might be tested here before implementing in other languages. +""" + from collections import deque from io import BytesIO @@ -13,26 +21,33 @@ from . 
import ExcessBitsError, bit_size, compute_min_pattern_size, initialize_dictionary -# encodes [min_pattern_bytes, min_pattern_bytes + 13] pattern lengths -_huffman_codes = b"\x00\x03\x08\x0b\x14$&+KT\x94\x95\xaa'" +# encodes [0, 14] pattern lengths +_huffman_codes = b"\x00\x03\x08\x0b\x14$&+KT\x94\x95\xaa'\xab" # These bit lengths pre-add the 1 bit for the 0-value is_literal flag. -_huffman_bits = b"\x02\x03\x05\x05\x06\x07\x07\x07\x08\x08\x09\x09\x09\x07" +_huffman_bits = b"\x02\x03\x05\x05\x06\x07\x07\x07\x08\x08\x09\x09\x09\x07\x09" _FLUSH_CODE = 0xAB # 8 bits +_RLE_SYMBOL = 12 +_RLE_MAX_WINDOW = 8 # Maximum number of RLE bytes to write to the window. +_EXTENDED_MATCH_SYMBOL = 13 +_LEADING_EXTENDED_MATCH_HUFFMAN_BITS = 3 +_LEADING_RLE_HUFFMAN_BITS = 4 class _BitWriter: """Writes bits to a stream.""" - def __init__(self, f, close_f_on_close=False): + def __init__(self, f, *, close_f_on_close: bool = False): self.close_f_on_close = close_f_on_close self.f = f - self.buffer = 0 # Basically a uint24 + self.buffer = 0 # Basically a uint32 self.bit_pos = 0 - def write_huffman(self, pattern_size): + def write_huffman_and_literal_flag(self, pattern_size): + # pattern_size in range [0, 14] return self.write(_huffman_codes[pattern_size], _huffman_bits[pattern_size]) def write(self, bits, num_bits, flush=True): + bits = int(bits) bits &= (1 << num_bits) - 1 self.bit_pos += num_bits self.buffer |= bits << (32 - self.bit_pos) @@ -75,7 +90,7 @@ def __init__(self, buffer): self.size = len(buffer) self.pos = 0 # Always pointing to the byte-to-be-overwritten - def write_byte(self, byte): # ~10% of time + def write_byte(self, byte): self.buffer[self.pos] = byte self.pos = (self.pos + 1) % self.size @@ -90,6 +105,32 @@ def index(self, pattern, start): raise ValueError("substring not found") return result + def write_from_self(self, position, size): + # Write up to end of buffer (no wrap) + remaining = self.size - self.pos + window_write = min(size, remaining) + # Read source data 
first to avoid overlap when source and destination ranges overlap + data = self.get(position, window_write) + for byte in data: + self.buffer[self.pos] = byte + self.pos += 1 + if self.pos == self.size: + self.pos = 0 + + def get(self, index, size): + out = bytearray(size) + for i in range(size): + pos = (index + i) % self.size + out[i] = self.buffer[pos] + return bytes(out) + + @property + def last_written_byte(self) -> int: + pos = self.pos - 1 + if pos < 0: + pos = self.size - 1 + return self.buffer[pos] # TODO: unit-test this thoroughly on initial start! + class Compressor: """Compresses data to a file or stream.""" @@ -102,6 +143,7 @@ def __init__( literal: int = 8, dictionary: Optional[bytearray] = None, lazy_matching: bool = False, + extended: bool = True, ): """ Parameters @@ -129,11 +171,24 @@ def __init__( lazy_matching: bool Use roughly 50% more cpu to get 0~2% better compression. """ - if lazy_matching: - raise NotImplementedError("lazy matching not implemented in pure python implementation.") + self.window_bits = window + self.literal_bits = literal + self.min_pattern_size = compute_min_pattern_size(window, literal) + self.extended: bool = extended + + self._rle_count = 0 + + # "+1" Because a RLE of 1 is not valid. + self._rle_max_size = (13 << _LEADING_RLE_HUFFMAN_BITS) + (1 << _LEADING_RLE_HUFFMAN_BITS) + 1 + + self._extended_match_count = 0 + self._extended_match_position = 0 + + self.lazy_matching = lazy_matching + self._cached_match_index = -1 + self._cached_match_size = 0 if not hasattr(f, "write"): # It's probably a path-like object. 
- # TODO: then close it on close f = open(str(f), "wb") close_f_on_close = True else: @@ -143,11 +198,15 @@ def __init__( if dictionary and bit_size(len(dictionary) - 1) != window: raise ValueError("Dictionary-window size mismatch.") - self.window_bits = window - self.literal_bits = literal - - self.min_pattern_size = compute_min_pattern_size(window, literal) - self.max_pattern_size = self.min_pattern_size + 13 + if self.extended: + self.max_pattern_size = ( + self.min_pattern_size + + 11 + + (13 << _LEADING_EXTENDED_MATCH_HUFFMAN_BITS) + + (1 << _LEADING_EXTENDED_MATCH_HUFFMAN_BITS) + ) + else: + self.max_pattern_size = self.min_pattern_size + 13 self.literal_flag = 1 << self.literal_bits @@ -155,26 +214,170 @@ def __init__( buffer=dictionary if dictionary else initialize_dictionary(1 << window), ) - self._input_buffer = deque(maxlen=self.max_pattern_size) + self._input_buffer = deque(maxlen=16) # matching the C implementation # Callbacks for debugging/metric collection; can be externally set. - self.token_cb = None + self.match_cb = None + self.extended_match_cb = None self.literal_cb = None self.flush_cb = None + self.rle_cb = None + + # For debugging: how many uncompressed bytes have we consumed so far. 
+ self.input_index = 0 # Write header self._bit_writer.write(window - 8, 3, flush=False) self._bit_writer.write(literal - 5, 2, flush=False) self._bit_writer.write(bool(dictionary), 1, flush=False) - self._bit_writer.write(0, 1, flush=False) # Reserved + self._bit_writer.write(self.extended, 1, flush=False) self._bit_writer.write(0, 1, flush=False) # No other header bytes + def _validate_no_match_overlap(self, write_pos, match_index, match_size): + """Check if writing a single byte will overlap with a future match section.""" + return write_pos < match_index or write_pos >= match_index + match_size + def _compress_input_buffer_single(self) -> int: - target = bytes(self._input_buffer) bytes_written = 0 - search_i = 0 - match_size = 1 - for match_size in range(self.min_pattern_size, len(target) + 1): + + if not self._input_buffer: + return bytes_written + + if self._extended_match_count: + while self._input_buffer: + if (self._extended_match_position + self._extended_match_count) >= self._window_buffer.size: + # Reached window boundary - emit match (no wrap-around, only 0.02% compression loss) + bytes_written += self._write_extended_match() + return bytes_written + + # Search the remainder of the window buffer for a longer match. 
+ target = self._window_buffer.get(self._extended_match_position, self._extended_match_count) + target += bytes([self._input_buffer[0]]) + search_i, match = self._search(target, start=self._extended_match_position) + match_size = len(match) + if match_size > self._extended_match_count: + self._input_buffer.popleft() + self._extended_match_count = match_size + self._extended_match_position = search_i + if self._extended_match_count == self.max_pattern_size: + bytes_written += self._write_extended_match() + return bytes_written + continue + else: + # We've found the end of the match + bytes_written += self._write_extended_match() + return bytes_written + + # We ran out of input_buffer, return so caller can re-populate the input_buffer + return bytes_written + + # RLE handling with persistent state (v2 only) + # Accumulate RLE count across compression cycles for better compression of long runs + if self.extended: + last_byte = self._window_buffer.last_written_byte + + # Count RLE bytes in current buffer WITHOUT consuming yet + rle_available = 0 + for byte in self._input_buffer: + if byte == last_byte and self._rle_count + rle_available < self._rle_max_size: + rle_available += 1 + else: + break + + total_rle = self._rle_count + rle_available + rle_ended = (rle_available < len(self._input_buffer)) or (total_rle >= self._rle_max_size) + + # If RLE hasn't ended and we haven't hit max, consume and wait for more + if not rle_ended and total_rle > 0: + self._rle_count = total_rle + for _ in range(rle_available): + self._input_buffer.popleft() + return bytes_written + + # RLE run has ended - decide between RLE and pattern match + if total_rle >= 2: + use_pattern = False + + # For short RLE runs (all from this call), check if pattern match is better + if total_rle == rle_available and total_rle <= 6: + target = bytes(self._input_buffer) + search_i, match = self._search(target, start=0) + match_size = len(match) + + if match_size > total_rle: + use_pattern = True + # Don't 
consume RLE bytes - fall through to pattern matching + + if not use_pattern: + # Use RLE - consume bytes and write token + for _ in range(rle_available): + self._input_buffer.popleft() + self._rle_count = total_rle + bytes_written += self._write_rle() + return bytes_written + self._rle_count = 0 + elif total_rle == 1: + # Single byte - not worth RLE, will be handled as literal/pattern + self._rle_count = 0 + + # Normal pattern matching + target = bytes(self._input_buffer) + + if self.lazy_matching and self._cached_match_index >= 0: + search_i = self._cached_match_index + match_size = self._cached_match_size + match = self._window_buffer.get(search_i, match_size) + self._cached_match_index = -1 + else: + search_i, match = self._search(target, start=0) + match_size = len(match) + + # Lazy matching logic + if ( + self.lazy_matching + and match_size >= self.min_pattern_size + and match_size <= 8 + and len(self._input_buffer) > match_size + 2 + ): + # Check if next position has a better match + next_target = bytes(list(self._input_buffer)[1:]) # Skip first byte + next_search_i, next_match = self._search(next_target, start=0) + next_match_size = len(next_match) + + # If next position has a better match, and the match doesn't overlap with the literal we are writing + if next_match_size > match_size and self._validate_no_match_overlap( + self._window_buffer.pos, next_search_i, next_match_size + ): + # Write literal at current position and cache the next match + literal = self._input_buffer.popleft() + bytes_written += self._write_literal(literal) + self._cached_match_index = next_search_i + self._cached_match_size = next_match_size + return bytes_written + + if match_size >= self.min_pattern_size: + if self.extended and match_size > (self.min_pattern_size + 11): + # Protects +12 to be RLE symbol, and +13 to be extended match symbol + self._extended_match_position = search_i + self._extended_match_count = match_size + else: + bytes_written += self._write_match(search_i, 
match) + + for _ in range(match_size): + self._input_buffer.popleft() + else: + literal = self._input_buffer.popleft() + bytes_written += self._write_literal(literal) + + return bytes_written + + def _search(self, target: bytes, start=0): + match_size = 0 + search_i = start + for match_size in range( + self.min_pattern_size, + min(len(target), self.max_pattern_size) + 1, + ): match = target[:match_size] try: search_i = self._window_buffer.index(match, search_i) @@ -183,30 +386,92 @@ def _compress_input_buffer_single(self) -> int: match_size -= 1 break match = target[:match_size] + return search_i, match - if match_size >= self.min_pattern_size: - if self.token_cb: - self.token_cb( - search_i, - match_size, - match, - ) - bytes_written += self._bit_writer.write_huffman(match_size - self.min_pattern_size) - bytes_written += self._bit_writer.write(search_i, self.window_bits) - self._window_buffer.write_bytes(match) + def _write_extended_huffman(self, value, leading_bits): + bytes_written = 0 + # the upper bits can have values [0, 13] + mask = (1 << leading_bits) - 1 + if value > ((13 << leading_bits) + mask) or value < 0: + raise ValueError + code_index = value >> leading_bits + # Don't use write_huffman_and_literal_flag since we don't want to write a flag. 
+ bytes_written += self._bit_writer.write(_huffman_codes[code_index], _huffman_bits[code_index] - 1) + bytes_written += self._bit_writer.write(value & mask, leading_bits) + return bytes_written - for _ in range(match_size): - self._input_buffer.popleft() + def _write_extended_match(self): + bytes_written = 0 + if self.extended_match_cb: + string = self._window_buffer.get(self._extended_match_position, self._extended_match_count) + self.extended_match_cb( + self._window_buffer.pos, self._extended_match_position, self._extended_match_count, string + ) + # Format: symbol, size (huffman+trailing), position + bytes_written += self._bit_writer.write_huffman_and_literal_flag(_EXTENDED_MATCH_SYMBOL) + bytes_written += self._write_extended_huffman( + self._extended_match_count - self.min_pattern_size - 11 - 1, + _LEADING_EXTENDED_MATCH_HUFFMAN_BITS, + ) + bytes_written += self._bit_writer.write(self._extended_match_position, self.window_bits) + + self._window_buffer.write_from_self(self._extended_match_position, self._extended_match_count) + + # Reset state + self._extended_match_count = 0 + self._extended_match_position = 0 # Technically not necessary. 
+ + return bytes_written + + def _write_literal(self, literal) -> int: + bytes_written = 0 + if self.literal_cb: + self.literal_cb(literal) + if literal >> self.literal_bits: + raise ExcessBitsError + + bytes_written += self._bit_writer.write(literal | self.literal_flag, self.literal_bits + 1) + self._window_buffer.write_byte(literal) + return bytes_written + + def _write_match(self, search_i, match) -> int: + match_size = len(match) + + if self.match_cb: + self.match_cb( + self._window_buffer.pos, + search_i, + match_size, + match, + ) + + bytes_written = 0 + bytes_written += self._bit_writer.write_huffman_and_literal_flag(match_size - self.min_pattern_size) + bytes_written += self._bit_writer.write(search_i, self.window_bits) + self._window_buffer.write_bytes(match) + return bytes_written + + def _write_rle(self) -> int: + bytes_written = 0 + last_written_byte = self._window_buffer.last_written_byte + + if self._rle_count == 0: + raise ValueError("No RLE to write.") + elif self._rle_count == 1: + # Just write a literal + bytes_written += self._write_literal(last_written_byte) else: - char = self._input_buffer.popleft() - if self.literal_cb: - self.literal_cb(char) - if char >> self.literal_bits: - raise ExcessBitsError + if self.rle_cb: + self.rle_cb(self._rle_count, last_written_byte) + bytes_written += self._bit_writer.write_huffman_and_literal_flag(_RLE_SYMBOL) + bytes_written += self._write_extended_huffman(self._rle_count - 2, _LEADING_RLE_HUFFMAN_BITS) - bytes_written += self._bit_writer.write(char | self.literal_flag, self.literal_bits + 1) - self._window_buffer.write_byte(char) + # Write up to 8 bytes to the window (up to end of buffer, no wrap). 
+ remaining = self._window_buffer.size - self._window_buffer.pos + window_write = min(self._rle_count, _RLE_MAX_WINDOW, remaining) + self._window_buffer.write_bytes(bytes([last_written_byte]) * window_write) + self._rle_count = 0 return bytes_written def write(self, data: Union[bytes, bytearray]) -> int: @@ -225,8 +490,12 @@ def write(self, data: Union[bytes, bytearray]) -> int: """ bytes_written = 0 - for char in data: - self._input_buffer.append(char) + self.input_index = 0 + while self.input_index < len(data): + if len(self._input_buffer) != self._input_buffer.maxlen: + self._input_buffer.append(data[self.input_index]) + self.input_index += 1 + if len(self._input_buffer) == self._input_buffer.maxlen: bytes_written += self._compress_input_buffer_single() @@ -255,7 +524,18 @@ def flush(self, write_token: bool = True) -> int: self.flush_cb() while self._input_buffer: bytes_written += self._compress_input_buffer_single() - bytes_written += self._bit_writer.flush(write_token=write_token) + if self.extended and self._rle_count: + bytes_written += self._write_rle() + if self.extended and self._extended_match_count: + bytes_written += self._write_extended_match() + + # Clear any cached lazy matching state + if self.lazy_matching: + self._cached_match_index = -1 + self._cached_match_size = 0 + + bytes_written_flush = self._bit_writer.flush(write_token=write_token) + bytes_written += bytes_written_flush return bytes_written def close(self) -> int: @@ -300,6 +580,7 @@ def compress( literal: int = 8, dictionary: Optional[bytearray] = None, lazy_matching: bool = False, + extended: bool = True, ) -> bytes: """Single-call to compress data. @@ -326,6 +607,8 @@ def compress( first be initialized with :func:`~tamp.initialize_dictionary` lazy_matching: bool Use roughly 50% more cpu to get 0~2% better compression. + extended: bool + Use extended compression format. Defaults to True. 
Returns ------- @@ -340,6 +623,7 @@ def compress( literal=literal, dictionary=dictionary, lazy_matching=lazy_matching, + extended=extended, ) c.write(data) else: @@ -349,6 +633,7 @@ def compress( literal=literal, dictionary=dictionary, lazy_matching=lazy_matching, + extended=extended, ) c.write(data) c.flush(write_token=False) diff --git a/tamp/ctamp.pxd b/tamp/ctamp.pxd index 087ff62..3a70308 100644 --- a/tamp/ctamp.pxd +++ b/tamp/ctamp.pxd @@ -6,6 +6,7 @@ cdef extern from "tamp/common.h": int window int literal bool use_custom_dictionary + bool extended # Extended format (RLE, extended match). Read from header bit [1]. # The lazy_matching field is conditionally compiled based on TAMP_LAZY_MATCHING # We declare it here, but accessing it when the macro is disabled will cause compile errors # This is handled in the Cython code by always setting it when the struct is initialized diff --git a/tamp/decompressor.py b/tamp/decompressor.py index fb2fec7..6216b3e 100644 --- a/tamp/decompressor.py +++ b/tamp/decompressor.py @@ -10,6 +10,13 @@ _CHUNK_SIZE = 1 << 20 _FLUSH = object() +# These variables must match compressor.py +_RLE_SYMBOL = 12 +_EXTENDED_MATCH_SYMBOL = 13 +_RLE_MAX_WINDOW = 8 # Maximum number of RLE bytes to write to the window. +_LEADING_EXTENDED_MATCH_HUFFMAN_BITS = 3 +_LEADING_RLE_HUFFMAN_BITS = 4 + # Each key here are the huffman codes or'd with 0x80 # This is so that each lookup is easy/quick. 
_huffman_lookup = { @@ -57,15 +64,15 @@ def read(self, num_bits): if not byte: raise EOFError byte_value = int.from_bytes(byte, "little") - self.buffer |= byte_value << (24 - self.bit_pos) + self.buffer |= byte_value << (56 - self.bit_pos) self.bit_pos += 8 if self.backup_buffer is not None and self.backup_bit_pos is not None: - self.backup_buffer |= byte_value << (24 - self.backup_bit_pos) + self.backup_buffer |= byte_value << (56 - self.backup_bit_pos) self.backup_bit_pos += 8 - result = self.buffer >> (32 - num_bits) - mask = (1 << (32 - num_bits)) - 1 + result = self.buffer >> (64 - num_bits) + mask = (1 << (64 - num_bits)) - 1 self.buffer = (self.buffer & mask) << num_bits self.bit_pos -= num_bits @@ -120,6 +127,20 @@ def write_bytes(self, data): for byte in data: self.write_byte(byte) + def get(self, index, size): + out = bytearray(size) + for i in range(size): + pos = (index + i) % self.size + out[i] = self.buffer[pos] + return bytes(out) + + @property + def last_written_byte(self) -> int: + pos = self.pos - 1 + if pos < 0: + pos = self.size - 1 + return self.buffer[pos] # TODO: unit-test this thoroughly on initial start! + class Decompressor: """Decompresses a file or stream of tamp-compressed data. @@ -158,12 +179,9 @@ def __init__(self, f, *, dictionary: Optional[bytearray] = None): self.window_bits = self._bit_reader.read(3) + 8 self.literal_bits = self._bit_reader.read(2) + 5 uses_custom_dictionary = self._bit_reader.read(1) - reserved = self._bit_reader.read(1) + self.extended = self._bit_reader.read(1) more_header_bytes = self._bit_reader.read(1) - if reserved: - raise NotImplementedError - if more_header_bytes: raise NotImplementedError @@ -176,6 +194,7 @@ def __init__(self, f, *, dictionary: Optional[bytearray] = None): self.min_pattern_size = compute_min_pattern_size(self.window_bits, self.literal_bits) + # Used to store decoded bytes that do not currently fit in the output buffer. 
self.overflow = bytearray() def readinto(self, buf: bytearray) -> int: @@ -191,49 +210,82 @@ def readinto(self, buf: bytearray) -> int: int Number of bytes decompressed into buffer. """ + bytes_written = 0 + if len(self.overflow) > len(buf): buf[:] = self.overflow[: len(buf)] - written = len(buf) + bytes_written += len(buf) self.overflow = self.overflow[len(buf) :] - return written + return bytes_written elif self.overflow: buf[: len(self.overflow)] = self.overflow - written = len(self.overflow) + bytes_written += len(self.overflow) self.overflow = bytearray() - else: - written = 0 - while written < len(buf): + def write_to_output(string): + nonlocal bytes_written + match_size = len(string) + to_buf = min(len(buf) - bytes_written, match_size) + buf[bytes_written : bytes_written + to_buf] = string[:to_buf] + bytes_written += to_buf + if to_buf < match_size: + self.overflow[:] = string[to_buf:] + return False # stop decoding + return True + + while bytes_written < len(buf): try: with self._bit_reader: is_literal = self._bit_reader.read(1) if is_literal: - c = self._bit_reader.read(self.literal_bits) - self._window_buffer.write_byte(c) - buf[written] = c - written += 1 + string = bytes([self._bit_reader.read(self.literal_bits)]) + self._window_buffer.write_bytes(string) else: match_size = self._bit_reader.read_huffman() if match_size is _FLUSH: self._bit_reader.clear() continue - match_size += self.min_pattern_size - index = self._bit_reader.read(self.window_bits) - - string = self._window_buffer.buffer[index : index + match_size] - self._window_buffer.write_bytes(string) - - to_buf = min(len(buf) - written, match_size) - buf[written : written + to_buf] = string[:to_buf] - written += to_buf - if to_buf < match_size: - self.overflow[:] = string[to_buf:] - break + if self.extended and match_size > 11: + if match_size == _RLE_SYMBOL: + rle_count = self._bit_reader.read_huffman() + rle_count <<= _LEADING_RLE_HUFFMAN_BITS + rle_count += 
self._bit_reader.read(_LEADING_RLE_HUFFMAN_BITS) + rle_count += 1 + 1 + symbol = self._window_buffer.last_written_byte + string = bytes([symbol]) * rle_count + remaining = self._window_buffer.size - self._window_buffer.pos + window_write = min(rle_count, _RLE_MAX_WINDOW, remaining) + self._window_buffer.write_bytes(string[:window_write]) + elif match_size == _EXTENDED_MATCH_SYMBOL: + # Format: size (huffman+trailing), then position + match_size = self._bit_reader.read_huffman() + match_size <<= _LEADING_EXTENDED_MATCH_HUFFMAN_BITS + match_size += self._bit_reader.read(_LEADING_EXTENDED_MATCH_HUFFMAN_BITS) + match_size += self.min_pattern_size + 11 + 1 + index = self._bit_reader.read(self.window_bits) + + string = self._window_buffer.get(index, match_size) + + # Write up to end of buffer (no wrap) + remaining = self._window_buffer.size - self._window_buffer.pos + window_write = min(match_size, remaining) + self._window_buffer.write_bytes(string[:window_write]) + else: + raise ValueError("unreachable") + else: + match_size += self.min_pattern_size + index = self._bit_reader.read(self.window_bits) + + string = self._window_buffer.get(index, match_size) + self._window_buffer.write_bytes(string) + + if not write_to_output(string): + break except EOFError: break - return written + return bytes_written def read(self, size: int = -1) -> bytearray: """Decompresses data to bytes. 
diff --git a/tests/test_cli.py b/tests/test_cli.py index c23917d..b6b79ba 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -51,7 +51,7 @@ def test_compress_file_to_stdout(self): test_file.write_bytes(b"foo foo foo") with patch("sys.stdout.buffer.write") as mock_stdout: - app(["compress", str(test_file)], **_app_kwargs) + app(["compress", "--no-extended", str(test_file)], **_app_kwargs) mock_stdout.assert_called_once_with(compressed_foo_foo_foo) def test_compress_stdin_to_stdout(self): @@ -59,7 +59,7 @@ def test_compress_stdin_to_stdout(self): patch("sys.stdout.buffer.write") as mock_stdout, patch("sys.stdin.buffer.read", return_value="foo foo foo"), ): - app("compress", **_app_kwargs) + app(["compress", "--no-extended"], **_app_kwargs) mock_stdout.assert_called_once_with(compressed_foo_foo_foo) def test_decompress_file_to_stdout(self): diff --git a/tests/test_compressor.py b/tests/test_compressor.py index 188447c..309dcd7 100644 --- a/tests/test_compressor.py +++ b/tests/test_compressor.py @@ -36,12 +36,6 @@ NativeExcessBitsError = ExcessBitsError if micropython: - from tamp.compressor_viper import Compressor as ViperCompressor - from tamp.compressor_viper import compress as viper_compress - - Compressors.append(ViperCompressor) - compresses.append(viper_compress) - try: from tamp_native import Compressor as NativeCompressor from tamp_native import ExcessBitsError as NativeExcessBitsError @@ -94,7 +88,7 @@ def test_compressor_default(self): bytes_written = 0 with io.BytesIO() as f: - compressor = Compressor(f) + compressor = Compressor(f, extended=False) bytes_written += compressor.write(test_string) bytes_written += compressor.flush(write_token=False) @@ -106,7 +100,7 @@ def test_compressor_default(self): # Test Context Manager bytes_written = 0 - with io.BytesIO() as f, Compressor(f) as compressor: + with io.BytesIO() as f, Compressor(f, extended=False) as compressor: bytes_written += compressor.write(test_string) bytes_written += 
compressor.flush(write_token=False) @@ -137,7 +131,7 @@ def test_compressor_input_buffer(self): ) with io.BytesIO() as f: - compressor = Compressor(f) + compressor = Compressor(f, extended=False) compressor.write(b"f") compressor.write(b"oo") compressor.write(b" fo") @@ -171,7 +165,7 @@ def test_compressor_7bit(self): # fmt: on ) with io.BytesIO() as f: - compressor = Compressor(f, literal=7) + compressor = Compressor(f, literal=7, extended=False) compressor.write(test_string) compressor.flush(write_token=False) @@ -200,7 +194,7 @@ def test_compressor_predefined_dictionary(self): ) with io.BytesIO() as f: - compressor = Compressor(f, window=8, literal=7, dictionary=dictionary) + compressor = Compressor(f, window=8, literal=7, dictionary=dictionary, extended=False) compressor.write(test_string) compressor.flush(write_token=False) @@ -223,7 +217,7 @@ def test_oob_2_byte_pattern(self): test_string = memoryview(test_string_extended)[:3] # b"Q\x00Q" with io.BytesIO() as f: - compressor = Compressor(f) + compressor = Compressor(f, extended=False) compressor.write(test_string) compressor.flush(write_token=False) @@ -245,7 +239,7 @@ def test_oob_2_byte_pattern(self): def test_excess_bits(self): for Compressor in Compressors: with self.subTest(Compressor=Compressor), io.BytesIO() as f: - compressor = Compressor(f, literal=7) + compressor = Compressor(f, literal=7, extended=False) with self.assertRaises((ExcessBitsError, NativeExcessBitsError)): compressor.write(b"\xff") @@ -271,7 +265,7 @@ def test_single_shot_compress_text(self): ] # fmt: on ) - self.assertEqual(compress("foo foo foo"), expected) + self.assertEqual(compress("foo foo foo", extended=False), expected) def test_single_shot_compress_binary(self): for compress in compresses: @@ -293,7 +287,7 @@ def test_single_shot_compress_binary(self): ] # fmt: on ) - self.assertEqual(compress(b"foo foo foo"), expected) + self.assertEqual(compress(b"foo foo foo", extended=False), expected) def test_invalid_conf(self): for 
Compressor in Compressors: diff --git a/tests/test_compressor_decompressor.py b/tests/test_compressor_decompressor.py index 22b964a..aa0a8e1 100644 --- a/tests/test_compressor_decompressor.py +++ b/tests/test_compressor_decompressor.py @@ -19,21 +19,16 @@ CCompressor = None CDecompressor = None - ViperCompressor = None - ViperDecompressor = None NativeCompressor = None NativeDecompressor = None else: - # MicroPython: only test Viper and Native implementations + # MicroPython: only test Native implementation # Pure Python and Cython implementations use CPython-specific features PyCompressor = None PyDecompressor = None CCompressor = None CDecompressor = None - from tamp.compressor_viper import Compressor as ViperCompressor - from tamp.decompressor_viper import Decompressor as ViperDecompressor - try: from tamp_native import Compressor as NativeCompressor from tamp_native import Decompressor as NativeDecompressor @@ -43,8 +38,8 @@ NativeDecompressor = None -Compressors = (PyCompressor, CCompressor, ViperCompressor, NativeCompressor) -Decompressors = (PyDecompressor, CDecompressor, ViperDecompressor, NativeDecompressor) +Compressors = (PyCompressor, CCompressor, NativeCompressor) +Decompressors = (PyDecompressor, CDecompressor, NativeDecompressor) def walk_compressors_decompressors(): diff --git a/tests/test_dataset_regression.py b/tests/test_dataset_regression.py index 734401e..a6716cf 100644 --- a/tests/test_dataset_regression.py +++ b/tests/test_dataset_regression.py @@ -73,6 +73,66 @@ ), ] +# Extended format datasets (uses RLE and Extended Match encoding) +EXTENDED_DATASETS = [ + ( + "datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp", + "e0c40eacf1afc550a6add74888c48bb981b28788a6d75a62a0e2444e997b9864", + ), + ( + "datasets/extended-compressed/dickens.tamp", + "b24c37886142e11d0ee687db6ab06f936207aa7f2ea1fd1d9a36763c7a507e6a", + ), + ( + "datasets/extended-compressed/mr.tamp", + "68637ed52e3e4860174ed2dc0840ac77d5f1a60abbcb13770d5754e3774d53e6", + 
), + ( + "datasets/extended-compressed/ooffice.tamp", + "e7ee013880d34dd5208283d0d3d91b07f442e067454276095ded14f322a656eb", + ), + ( + "datasets/extended-compressed/osdb.tamp", + "60f027179302ca3ad87c58ac90b6be72ec23588aaa7a3b7fe8ecc0f11def3fa3", + ), + ( + "datasets/extended-compressed/reymont.tamp", + "0eac0114a3dfe6e2ee1f345a0f79d653cb26c3bc9f0ed79238af4933422b7578", + ), + ( + "datasets/extended-compressed/sao.tamp", + "c2d0ea2cc59d4c21b7fe43a71499342a00cbe530a1d5548770e91ecd6214adcc", + ), + ( + "datasets/extended-compressed/x-ray.tamp", + "7de9fce1405dc44ae5e6813ed21cd5751e761bd4265655a005d39b9685d1c9ad", + ), + ( + "datasets/extended-compressed/xml.tamp", + "0e82e54e695c1938e4193448022543845b33020c8be6bf3bf3ead2224903e08c", + ), + ( + "datasets/extended-compressed/samba.tamp", + "93ba07bc44d8267789c1d911992f40b089ffa2140b4a160fac11ccae9a40e7b2", + ), + ( + "datasets/extended-compressed/nci.tamp", + "fc63a31770947b8c2062d3b19ca94c00485a232bb91b502021948fee983e1635", + ), + ( + "datasets/extended-compressed/webster.tamp", + "6a68f69b26daf09f9dd84f7470368553194a0b294fcfa80f1604efb11143a383", + ), + ( + "datasets/extended-compressed/mozilla.tamp", + "657fc3764b0c75ac9de9623125705831ebbfbe08fed248df73bc2dc66e2a963b", + ), + ( + "datasets/extended-compressed/enwik8.tamp", + "2b49720ec4d78c3c9fabaee6e4179a5e997302b3a70029f30f2d582218c024a8", + ), +] + class TestV1Decompression(unittest.TestCase): @pytest.mark.dataset @@ -90,5 +150,21 @@ def test_v1_decompress(self): self.assertEqual(actual, expected_sha256, f"SHA256 mismatch for {rel_path} using {impl_name}") +class TestExtendedDecompression(unittest.TestCase): + @pytest.mark.dataset + def test_extended_decompress(self): + for impl_name, decompress_func in DECOMPRESSOR_IMPLEMENTATIONS: + for rel_path, expected_sha256 in EXTENDED_DATASETS: + with self.subTest(implementation=impl_name, dataset=rel_path): + path = PROJECT_DIR / rel_path + + with open(path, "rb") as f: + data = f.read() + + decompressed = 
decompress_func(data) + actual = hashlib.sha256(decompressed).hexdigest() + self.assertEqual(actual, expected_sha256, f"SHA256 mismatch for {rel_path} using {impl_name}") + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_decompressor.py b/tests/test_decompressor.py index 26e9d2f..df4cc0b 100644 --- a/tests/test_decompressor.py +++ b/tests/test_decompressor.py @@ -26,11 +26,14 @@ pass else: - from tamp.decompressor_viper import Decompressor as ViperDecompressor - from tamp.decompressor_viper import decompress as viper_decompress + try: + from tamp_native import Decompressor as NativeDecompressor + from tamp_native import decompress as native_decompress - Decompressors.append(ViperDecompressor) - decompresses.append(viper_decompress) + Decompressors.append(NativeDecompressor) + decompresses.append(native_decompress) + except ImportError: + pass class TestDecompressor(unittest.TestCase): diff --git a/tests/test_pseudorandom.py b/tests/test_pseudorandom.py index f2ca6f5..62d0e75 100644 --- a/tests/test_pseudorandom.py +++ b/tests/test_pseudorandom.py @@ -12,12 +12,6 @@ micropython = None if micropython: - import tamp.compressor_viper - import tamp.decompressor_viper - - modules.append(tamp.compressor_viper) - modules.append(tamp.decompressor_viper) - try: import tamp_native diff --git a/tools/print_compressed_sizes.py b/tools/print_compressed_sizes.py new file mode 100644 index 0000000..7cb809e --- /dev/null +++ b/tools/print_compressed_sizes.py @@ -0,0 +1,47 @@ +""" +Print compressed sizes for test files used in optimize-extended-huffman.py. + +This script compresses the same files that optimize-extended-huffman.py uses +and prints the compressed size for each file with thousands separators. 
+""" + +from pathlib import Path + +import tamp.compressor + + +def main(): + # Define test files (same as optimize-extended-huffman.py) + datasets_dir = Path(__file__).parent.parent / "datasets" + test_files = [ + datasets_dir / "enwik8", + datasets_dir / "RPI_PICO-20250415-v1.25.0.uf2", + *(datasets_dir / "silesia").iterdir(), + ] + test_files.sort() + + ratios = [] + for file_path in test_files: + # Read and compress the file + data = file_path.read_bytes() + if len(data) == 0: + print(f"{file_path.name}: Empty file") + continue + + compressed_data = tamp.compressor.compress(data) + + original_size = len(data) + compressed_size = len(compressed_data) + + ratio = original_size / compressed_size + ratios.append(ratio) + + # Print with thousands separators + print(f"{file_path.name}: {compressed_size:,} (**{ratio:.3f}**)") + + avg = sum(ratios) / len(ratios) + print(f"Average Ratio: {avg}") + + +if __name__ == "__main__": + main() diff --git a/wasm/src/streams.js b/wasm/src/streams.js index d342002..2a6c350 100644 --- a/wasm/src/streams.js +++ b/wasm/src/streams.js @@ -41,9 +41,17 @@ export class TampCompressionStream extends TransformStream { } finally { if (compressor) { compressor.destroy(); + compressor = null; } } }, + + cancel(_reason) { + if (compressor) { + compressor.destroy(); + compressor = null; + } + }, }); } } @@ -88,9 +96,17 @@ export class TampDecompressionStream extends TransformStream { } finally { if (decompressor) { decompressor.destroy(); + decompressor = null; } } }, + + cancel(_reason) { + if (decompressor) { + decompressor.destroy(); + decompressor = null; + } + }, }); } } diff --git a/wasm/src/tamp.d.ts b/wasm/src/tamp.d.ts index 584e2c4..8f19674 100644 --- a/wasm/src/tamp.d.ts +++ b/wasm/src/tamp.d.ts @@ -43,6 +43,8 @@ export interface TampOptions { literal?: number; /** Custom dictionary data. If null, no custom dictionary is used. If Uint8Array, uses the provided dictionary. 
Default: null */ dictionary?: Uint8Array | null; + /** Enable extended format (RLE, extended match) for better compression ratios. Default: true */ + extended?: boolean; /** Enable lazy matching for better compression ratios. Default: false */ lazy_matching?: boolean; } @@ -68,6 +70,7 @@ export interface TampDefaults { readonly window: 10; readonly literal: 8; readonly dictionary: null; + readonly extended: true; readonly lazy_matching: false; } diff --git a/wasm/src/tamp.js b/wasm/src/tamp.js index f52788d..df82ba7 100644 --- a/wasm/src/tamp.js +++ b/wasm/src/tamp.js @@ -122,6 +122,7 @@ export class TampCompressor { window: 10, literal: 8, dictionary: null, + extended: true, lazy_matching: false, ...options, }; @@ -183,7 +184,8 @@ export class TampCompressor { (this.options.window & 0xf) | ((this.options.literal & 0xf) << 4) | ((this.options.dictionary ? 1 : 0) << 8) | - ((this.options.lazy_matching ? 1 : 0) << 9); + ((this.options.extended ? 1 : 0) << 9) | + ((this.options.lazy_matching ? 
1 : 0) << 10); this.module.setValue(confPtr, confValue, 'i32'); // Initialize compressor @@ -790,10 +792,12 @@ export async function compress(data, options = {}) { const callbackOptions = {}; // Extract compression-specific options - const { window, literal, dictionary, lazy_matching, onPoll, signal, pollIntervalMs, pollIntervalBytes } = options; + const { window, literal, dictionary, extended, lazy_matching, onPoll, signal, pollIntervalMs, pollIntervalBytes } = + options; if (window !== undefined) compressionOptions.window = window; if (literal !== undefined) compressionOptions.literal = literal; if (dictionary !== undefined) compressionOptions.dictionary = dictionary; + if (extended !== undefined) compressionOptions.extended = extended; if (lazy_matching !== undefined) compressionOptions.lazy_matching = lazy_matching; // Extract callback options diff --git a/website/index.html b/website/index.html index 8847451..50ec9a6 100644 --- a/website/index.html +++ b/website/index.html @@ -144,6 +144,16 @@

Configuration +
+ + +
+ ? +
+ Enables extended compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases. +
+
+
@@ -182,6 +192,16 @@

Configuration

+
+ + +
+ ? +
+ Enables extended compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases. +
+
+
@@ -194,7 +214,7 @@

Configuration

- +