diff --git a/.gitattributes b/.gitattributes index 5f6c222..a1dce8f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ datasets/v1-compressed/** filter=lfs diff=lfs merge=lfs -text +datasets/extended-compressed/** filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml index 2f9bb48..17e29e8 100644 --- a/.github/workflows/build_wheels.yaml +++ b/.github/workflows/build_wheels.yaml @@ -117,7 +117,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -164,7 +164,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -212,7 +212,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -254,7 +254,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -302,7 +302,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -344,7 +344,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: 
pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: @@ -385,7 +385,7 @@ jobs: CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_TEST_REQUIRES: pytest - CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_TEST_COMMAND: pytest {package}/tests --no-dataset - uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/javascript.yaml b/.github/workflows/javascript.yaml index 00fdd5b..4dc0fa7 100644 --- a/.github/workflows/javascript.yaml +++ b/.github/workflows/javascript.yaml @@ -99,9 +99,9 @@ jobs: cd build HASH=$(sha256sum enwik8-js.tamp | cut -d' ' -f1) echo "Compression hash: $HASH" - if [ "$HASH" != "02e05af059a0040d641988075cf1dfc479a084f9a34b5c8a348354211c5fa038" ]; then + if [ "$HASH" != "dd5e431b0cbaa6ee001b10493c2b08e6235f42e3f4ce00958e88b1b97581872e" ]; then echo "❌ Hash mismatch!" - echo "Expected: 02e05af059a0040d641988075cf1dfc479a084f9a34b5c8a348354211c5fa038" + echo "Expected: dd5e431b0cbaa6ee001b10493c2b08e6235f42e3f4ce00958e88b1b97581872e" echo "Got: $HASH" exit 1 fi diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 47a77b9..1c44aa9 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -94,8 +94,48 @@ jobs: source .venv/bin/activate SKIP=wasm-eslint,wasm-npm-test,wasm-file-validation,typescript-check,package-json-lint pre-commit run --show-diff-on-failure --color=always --all-files + cache-lfs: + name: 'Cache LFS files' + runs-on: ubuntu-latest + outputs: + cache-key: ${{ steps.lfs-key.outputs.key }} + steps: + - name: Check out repository + uses: actions/checkout@v4 + with: + submodules: recursive + lfs: false + + - name: Compute LFS cache key + id: lfs-key + run: | + # Hash pointer files before they get replaced by git lfs pull + # Use find for reliable recursive globbing, sort for deterministic order + hash=$(find datasets/v1-compressed datasets/extended-compressed -name '*.tamp' -type f | sort | xargs cat | sha256sum | cut -d' ' 
-f1) + echo "key=lfs-${hash}" >> $GITHUB_OUTPUT + echo "Cache key: lfs-${hash}" + + - name: Restore LFS cache + uses: actions/cache/restore@v4 + id: lfs-cache + with: + path: .git/lfs + key: ${{ steps.lfs-key.outputs.key }} + + - name: Pull LFS files + if: steps.lfs-cache.outputs.cache-hit != 'true' + run: git lfs pull + + - name: Save LFS cache + if: steps.lfs-cache.outputs.cache-hit != 'true' + uses: actions/cache/save@v4 + with: + path: .git/lfs + key: ${{ steps.lfs-key.outputs.key }} + test: name: 'Test Python ${{ matrix.python-version }}' + needs: cache-lfs timeout-minutes: 15 runs-on: ubuntu-latest strategy: @@ -110,7 +150,17 @@ jobs: uses: actions/checkout@v4 with: submodules: recursive - lfs: true + lfs: false + + - name: Restore LFS cache + uses: actions/cache/restore@v4 + with: + path: .git/lfs + key: ${{ needs.cache-lfs.outputs.cache-key }} + fail-on-cache-miss: true + + - name: Pull LFS files + run: git lfs pull - name: Set up python 3.13 (for Poetry) id: setup-python-system @@ -217,7 +267,7 @@ jobs: implementation: [desktop, embedded] env: POETRY_HOME: '~/poetry' - EXPECTED_COMPRESSED_HASH: '02e05af059a0040d641988075cf1dfc479a084f9a34b5c8a348354211c5fa038' + EXPECTED_COMPRESSED_HASH: 'dd5e431b0cbaa6ee001b10493c2b08e6235f42e3f4ce00958e88b1b97581872e' steps: - name: Check out repository diff --git a/.gitignore b/.gitignore index 4344bd2..e69f804 100644 --- a/.gitignore +++ b/.gitignore @@ -249,10 +249,12 @@ Temporary Items # Compression benchmark datasets datasets/* !datasets/v1-compressed/ +!datasets/extended-compressed/ enwik8* *.pkl *.tamp !datasets/v1-compressed/** +!datasets/extended-compressed/** # Cython-generated files tamp/_c_compressor.c @@ -435,6 +437,9 @@ wasm/build/ *.swo *~ +# clangd (C/C++ language server) +.clangd + # Emacs *~ \#*\# diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c1cbfc..686769a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,10 +57,13 @@ repos: args: ['-style=file', '-i'] 
exclude: ^espidf/tamp/private/tamp_search\.hpp$ - - repo: https://github.com/pocc/pre-commit-hooks - rev: v1.3.5 + - repo: local hooks: - id: cppcheck + name: cppcheck + entry: cppcheck + language: system + files: \.(c|h|cpp|hpp)$ exclude: ^(espidf|mpy_bindings|ctests|tools)/ args: [ '-Itamp/_c_src', diff --git a/CLAUDE.md b/CLAUDE.md index 1dd02c3..5ba7e8c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -26,9 +26,13 @@ different platforms: **Shared C Source:** All implementations use the same C source code in `tamp/_c_src/tamp/`: -- `common.h/c` - Shared utilities and data structures -- `compressor.h/c` - Compression implementation +- `common.h/c` - Shared utilities, data structures, stream I/O callbacks, and + dictionary initialization +- `compressor.h/c` - Compression implementation (sink/poll low-level API and + higher-level compress/flush API) - `decompressor.h/c` - Decompression implementation +- `compressor_find_match_desktop.c` - Desktop-optimized match finding (included + by `compressor.c` on non-embedded targets) ## Development Commands @@ -162,38 +166,81 @@ make website-clean # Clean website build artifacts **WebAssembly Build Process:** 1. `wasm/Makefile` compiles C source to WebAssembly using Emscripten -2. `wasm/scripts/build.js` generates multiple JS/TS distribution formats +2. `tsup` (via `npm run build:js`) bundles into multiple JS/TS distribution + formats (CJS, ESM, `.d.ts`) 3. 
Exports specific C functions and runtime methods for JS interop -**Configuration Flags:** +**Configuration Flags (compile-time `-D` defines):** -- `TAMP_LAZY_MATCHING=1` - Enable lazy matching optimization (default) -- `TAMP_ESP32=1` - ESP32-specific optimizations +- `TAMP_LAZY_MATCHING=1` - Enable lazy matching optimization (default in + build.py) +- `TAMP_ESP32=1` - ESP32-specific optimizations (avoids bitfields for speed) - `TAMP_COMPRESSOR`/`TAMP_DECOMPRESSOR` - Include/exclude components +- `TAMP_EXTENDED=1` - Master switch for extended format: RLE and extended match + (default: 1). `TAMP_EXTENDED_COMPRESS` and `TAMP_EXTENDED_DECOMPRESS` can + individually override. +- `TAMP_STREAM=1` - Include stream API (default: 1). Disable with + `-DTAMP_STREAM=0` to save ~2.8KB. +- `TAMP_STREAM_WORK_BUFFER_SIZE=32` - Stack-allocated work buffer for stream API + (default: 32 bytes, 256+ recommended for performance) +- `TAMP_STREAM_MEMORY` / `TAMP_STREAM_STDIO` / `TAMP_STREAM_LITTLEFS` / + `TAMP_STREAM_FATFS` - Enable built-in I/O handlers for specific backends +- `TAMP_USE_EMBEDDED_MATCH=1` - Force embedded `find_best_match` implementation + on desktop (for testing) + +**Build Environment Variables (Python):** + +- `TAMP_SANITIZE=1` - Enable AddressSanitizer + UBSan +- `TAMP_PROFILE=1` - Enable profiling (line trace, debug info) +- `TAMP_USE_EMBEDDED_MATCH=1` - Force embedded match finding +- `TAMP_BUILD_C_EXTENSIONS=0` - Skip building C extensions entirely +- `CIBUILDWHEEL=1` - CI wheel building mode (disables allowed_to_fail) ### Testing Strategy **Multi-layered Testing:** -- **Python tests** (`tests/`) - Core algorithm testing using pytest +- **Python tests** (`tests/`) - Core algorithm testing using pytest. Includes + bit reader/writer, compressor, decompressor, round-trip, CLI, dataset + regression, and file interface tests. 
- **WebAssembly tests** (`wasm/test/`) - JS/TS API testing with Node.js test - runner + runner (`node --test`) - **C tests** (`ctests/`) - Low-level C API testing using Unity framework + (submodule at `ctests/Unity/`). Includes stream API tests and filesystem + integration tests with LittleFS and FatFS RAM backends. - **Integration tests** - Cross-platform compatibility and performance benchmarks **Test Data Sources:** -- Enwik8 dataset (100MB) for performance benchmarking -- Silesia corpus for compression ratio evaluation +- Enwik8 dataset (100MB) for performance benchmarking (`make download-enwik8`) +- Silesia corpus for compression ratio evaluation (`make download-silesia`) - Custom test cases for edge conditions +### Compressor Architecture + +The C compressor uses a two-phase low-level API: + +1. `tamp_compressor_sink()` - Copies input bytes into a 16-byte internal ring + buffer (cheap/fast) +2. `tamp_compressor_poll()` - Runs one compression iteration on the internal + buffer (computationally intensive) + +Higher-level convenience functions (`tamp_compressor_compress`, +`tamp_compressor_compress_and_flush`) wrap these. Callback variants (`_cb` +suffix) accept a `tamp_callback_t` progress callback. + +The stream API (`tamp_compress_stream`, `tamp_decompress_stream`) provides a +file-oriented interface using read/write callbacks, supporting multiple I/O +backends (memory, stdio, LittleFS, FatFS). + ### Memory Management Patterns **Key Principle:** Fixed memory usage during compression/decompression - Window size determines memory usage: `(1 << windowBits)` bytes - No dynamic allocation during compression/decompression operations +- Stream API uses a stack-allocated work buffer (`TAMP_STREAM_WORK_BUFFER_SIZE`) - Streaming interfaces require explicit resource management (`destroy()` calls in JS/TS) @@ -202,7 +249,9 @@ make website-clean # Clean website build artifacts ### Making Changes to Core Algorithm 1. **Modify C source** in `tamp/_c_src/tamp/` -2. 
**Rebuild all implementations:** +2. **Update pure Python reference** in `tamp/compressor.py` / + `tamp/decompressor.py` to match +3. **Rebuild all implementations:** ```bash # Python @@ -212,11 +261,12 @@ make website-clean # Clean website build artifacts cd wasm && npm run build ``` -3. **Run comprehensive tests:** +4. **Run comprehensive tests:** ```bash - make test # Python + MicroPython + poetry run pytest # Python tests + make c-test # C unit tests with sanitizers + make c-test-embedded # C tests with embedded match finding cd wasm && npm test # WebAssembly - make c-test # C unit tests ``` ### Adding New Features @@ -232,11 +282,13 @@ make website-clean # Clean website build artifacts - **Use provided benchmarking tools:** ```bash make on-device-compression-benchmark # MicroPython performance - npm run test:enwik8 # WebAssembly performance - python tools/performance-benchmark.sh # Python performance + cd wasm && npm run test:enwik8 # WebAssembly performance + bash tools/performance-benchmark.sh # Python performance + make c-benchmark-stream # C stream API benchmark + make binary-size # ARM binary size table ``` -- **Profile with:** `tools/profiler.py` for Python, browser dev tools for - WebAssembly +- **Profile with:** `tools/profiler.py` for Python (requires `TAMP_PROFILE=1`), + browser dev tools for WebAssembly ### Release Process @@ -247,6 +299,29 @@ make website-clean # Clean website build artifacts - WebAssembly npm package 3. **CI/CD handles** cross-platform builds and testing +### Python Import Fallback Chain + +`tamp/__init__.py` imports Compressor/Decompressor using this priority: + +1. Viper (MicroPython optimized) - only available on MicroPython +2. Cython C extensions (`_c_compressor`/`_c_decompressor`) - primary on CPython +3. Pure Python reference (`compressor.py`/`decompressor.py`) - fallback + +When modifying compression behavior, changes to the C source must be mirrored in +the pure Python reference implementation to keep them in sync. 
+ +### CI/CD + +GitHub Actions workflows (`.github/workflows/`): + +- `tests.yaml` - Lint (ruff, pre-commit) and test across Python 3.9/3.12/3.13 + and multiple OS. Also runs `c-test` and `c-test-embedded`. +- `build_wheels.yaml` - Cross-platform wheel builds via cibuildwheel +- `javascript.yaml` - WebAssembly tests on Node 18/20 +- `mpy_native_module.yaml` - MicroPython native module builds for ARM + architectures +- `esp_upload_component.yml` - ESP-IDF component registry upload + ## Documentation Style - Avoid "fake" subsections (e.g., bold text like `**Error Promotion:**` acting diff --git a/Makefile b/Makefile index 9e21e42..eb8d825 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,8 @@ help: @echo " make tamp-c-library Build static C library" @echo " make website-build Build website for deployment" +.PHONY: clean test collect-data venv download + ########################### # MicroPython Native Module @@ -73,7 +75,7 @@ MOD = tamp # Override -Os with -O2 for better performance (last flag wins) CFLAGS_EXTRA = -O2 -CFLAGS += -Itamp/_c_src -DTAMP_COMPRESSOR=$(TAMP_COMPRESSOR) -DTAMP_DECOMPRESSOR=$(TAMP_DECOMPRESSOR) +CFLAGS += -Itamp/_c_src -DTAMP_COMPRESSOR=$(TAMP_COMPRESSOR) -DTAMP_DECOMPRESSOR=$(TAMP_DECOMPRESSOR) -DTAMP_STREAM=0 -DTAMP_USE_MEMSET=0 # Compiler-specific flags based on target architecture ifeq ($(filter $(ARCH),x86 x64),) # Cross-compiling for embedded (ARM, xtensa) - use GCC flags @@ -180,7 +182,14 @@ build/enwik8-100kb: download-enwik8 @head -c 100000 datasets/enwik8 > build/enwik8-100kb build/enwik8-100kb.tamp: build/enwik8-100kb - @poetry run tamp compress build/enwik8-100kb -o build/enwik8-100kb.tamp + @# Use Python implementation for extended format compression + @poetry run tamp compress --implementation=python build/enwik8-100kb -o build/enwik8-100kb.tamp + +download-micropython: + mkdir -p datasets + cd datasets && curl -O https://micropython.org/resources/firmware/RPI_PICO-20250415-v1.25.0.uf2 + +download: download-enwik8 
download-silesia download-micropython ################## @@ -218,7 +227,7 @@ define mpremote-sync fi endef -on-device-compression-benchmark: mpy build/enwik8-100kb build/enwik8-100kb.tamp +on-device-compression-benchmark: mpy build/enwik8-100kb $(MPREMOTE) rm :enwik8-100kb.tamp || true @# Remove any viper implementation that may exist from previous belay syncs $(MPREMOTE) rm :tamp/__init__.py :tamp/compressor_viper.py :tamp/decompressor_viper.py :tamp/compressor.py :tamp/decompressor.py :tamp/__main__.py :tamp/py.typed 2>/dev/null || true @@ -229,7 +238,8 @@ on-device-compression-benchmark: mpy build/enwik8-100kb build/enwik8-100kb.tamp $(MPREMOTE) soft-reset $(MPREMOTE) run tools/on-device-compression-benchmark.py $(MPREMOTE) cp :enwik8-100kb.tamp build/on-device-enwik8-100kb.tamp - cmp build/enwik8-100kb.tamp build/on-device-enwik8-100kb.tamp + poetry run tamp decompress build/on-device-enwik8-100kb.tamp -o build/on-device-enwik8-100kb-decompressed + cmp build/enwik8-100kb build/on-device-enwik8-100kb-decompressed @echo "Success!" 
on-device-decompression-benchmark: mpy build/enwik8-100kb.tamp @@ -283,7 +293,7 @@ mpy-viper-size: size_comp=$$(wc -c < /tmp/_tamp_comp.mpy | tr -d ' '); \ size_decomp=$$(wc -c < /tmp/_tamp_decomp.mpy | tr -d ' '); \ rm -f /tmp/_tamp_init.mpy /tmp/_tamp_comp.mpy /tmp/_tamp_decomp.mpy; \ - printf 'Tamp (MicroPython Viper) %d %d %d\n' \ + printf '%-34s %10d %12d %25d\n' "Tamp (MicroPython Viper)" \ $$((size_init + size_comp)) $$((size_init + size_decomp)) $$((size_init + size_comp + size_decomp)) mpy-native-size: @@ -299,7 +309,7 @@ endif rm -rf tamp.mpy build/tamp build/mpy_bindings build/tamp.native.mpy && \ $(MAKE) -s _mpy-build MPY_DIR=$(MPY_DIR) ARCH=armv6m TAMP_COMPRESSOR=1 TAMP_DECOMPRESSOR=1 >/dev/null 2>&1 && \ size_both=$$(wc -c < tamp.mpy | tr -d ' ') && \ - printf 'Tamp (MicroPython Native) %s %s %s\n' $$size_comp $$size_decomp $$size_both + printf '%-34s %10s %12s %25s\n' "Tamp (MicroPython Native)" $$size_comp $$size_decomp $$size_both mpy-compression-benchmark: @time belay run micropython -X heapsize=300M tools/micropython-compression-benchmark.py @@ -482,7 +492,7 @@ tamp-c-library: build/tamp.a # Binary Sizes ############### # Generate binary size information for README table (armv6m with -O3). 
-.PHONY: binary-size c-size +.PHONY: binary-size c-size c-size-no-extended c-size-extended ARM_CC := arm-none-eabi-gcc ARM_AR := arm-none-eabi-ar @@ -493,70 +503,85 @@ C_SRC_COMMON = tamp/_c_src/tamp/common.c C_SRC_COMP = tamp/_c_src/tamp/compressor.c C_SRC_DECOMP = tamp/_c_src/tamp/decompressor.c -# Build compressor-only library (without stream API) -build/arm/tamp_comp.a: $(C_SRC_COMMON) $(C_SRC_COMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common_c.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o - $(ARM_AR) rcs $@ build/arm/common_c.o build/arm/compressor.o - -# Build decompressor-only library (without stream API) -build/arm/tamp_decomp.a: $(C_SRC_COMMON) $(C_SRC_DECOMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common_d.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o - $(ARM_AR) rcs $@ build/arm/common_d.o build/arm/decompressor.o - -# Build full library (without stream API) -build/arm/tamp_full.a: $(C_SRC_COMMON) $(C_SRC_COMP) $(C_SRC_DECOMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common_f.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor_f.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor_f.o - $(ARM_AR) rcs $@ build/arm/common_f.o build/arm/compressor_f.o build/arm/decompressor_f.o - -# Build compressor-only library (with stream API, the default) -build/arm/tamp_comp_stream.a: $(C_SRC_COMMON) $(C_SRC_COMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) 
-DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common_cs.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor_s.o - $(ARM_AR) rcs $@ build/arm/common_cs.o build/arm/compressor_s.o - -# Build decompressor-only library (with stream API, the default) -build/arm/tamp_decomp_stream.a: $(C_SRC_COMMON) $(C_SRC_DECOMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common_ds.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor_s.o - $(ARM_AR) rcs $@ build/arm/common_ds.o build/arm/decompressor_s.o - -# Build full library (with stream API, the default) -build/arm/tamp_full_stream.a: $(C_SRC_COMMON) $(C_SRC_COMP) $(C_SRC_DECOMP) - @mkdir -p build/arm - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common_fs.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor_fs.o - $(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor_fs.o - $(ARM_AR) rcs $@ build/arm/common_fs.o build/arm/compressor_fs.o build/arm/decompressor_fs.o - -c-size: - @rm -rf build/arm - @$(MAKE) --no-print-directory build/arm/tamp_comp_stream.a build/arm/tamp_decomp_stream.a build/arm/tamp_full_stream.a build/arm/tamp_comp.a build/arm/tamp_decomp.a build/arm/tamp_full.a - @size_comp=$$($(ARM_SIZE) -B --totals build/arm/tamp_comp.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - size_decomp=$$($(ARM_SIZE) -B --totals build/arm/tamp_decomp.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - size_full=$$($(ARM_SIZE) -B --totals build/arm/tamp_full.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - printf 'Tamp (C, -DTAMP_STREAM=0) %d %d %d\n' $$size_comp $$size_decomp $$size_full - @size_comp=$$($(ARM_SIZE) -B 
--totals build/arm/tamp_comp_stream.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - size_decomp=$$($(ARM_SIZE) -B --totals build/arm/tamp_decomp_stream.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - size_full=$$($(ARM_SIZE) -B --totals build/arm/tamp_full_stream.a 2>/dev/null | grep TOTALS | awk '{print $$1+$$2}'); \ - printf 'Tamp (C) %d %d %d\n' $$size_comp $$size_decomp $$size_full +# Flags to disable extended format support +NO_EXTENDED_FLAGS = -DTAMP_EXTENDED=0 + +c-size-no-extended: + @rm -rf build/arm && mkdir -p build/arm + @# No-extended without stream API + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_AR) rcs build/arm/noext_comp.a build/arm/common.o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/noext_decomp.a build/arm/common.o build/arm/decompressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/noext_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o + @# No-extended with stream API + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) 
-DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_AR) rcs build/arm/noext_comp_s.a build/arm/common.o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/noext_decomp_s.a build/arm/common.o build/arm/decompressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) $(NO_EXTENDED_FLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/noext_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o + @size_comp=$$($(ARM_SIZE) -B --totals build/arm/noext_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_decomp=$$($(ARM_SIZE) -B --totals build/arm/noext_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_full=$$($(ARM_SIZE) -B --totals build/arm/noext_full.a | grep TOTALS | awk '{print $$1+$$2}'); \ + printf '%-34s %10d %12d %25d\n' "Tamp (C, no extended, no stream)" $$size_comp $$size_decomp $$size_full + @size_comp=$$($(ARM_SIZE) -B --totals build/arm/noext_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_decomp=$$($(ARM_SIZE) -B --totals build/arm/noext_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_full=$$($(ARM_SIZE) -B --totals build/arm/noext_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + printf '%-34s %10d %12d %25d\n' "Tamp (C, no extended)" $$size_comp $$size_decomp 
$$size_full + +c-size-extended: + @rm -rf build/arm && mkdir -p build/arm + @# Extended without stream API + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_AR) rcs build/arm/ext_comp.a build/arm/common.o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/ext_decomp.a build/arm/common.o build/arm/decompressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -DTAMP_STREAM=0 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/ext_full.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o + @# Extended with stream API + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=0 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_AR) rcs build/arm/ext_comp_s.a build/arm/common.o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=0 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/ext_decomp_s.a build/arm/common.o build/arm/decompressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c 
$(C_SRC_COMMON) -o build/arm/common.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_COMP) -o build/arm/compressor.o + @$(ARM_CC) $(ARM_CFLAGS) -DTAMP_COMPRESSOR=1 -DTAMP_DECOMPRESSOR=1 -c $(C_SRC_DECOMP) -o build/arm/decompressor.o + @$(ARM_AR) rcs build/arm/ext_full_s.a build/arm/common.o build/arm/compressor.o build/arm/decompressor.o + @size_comp=$$($(ARM_SIZE) -B --totals build/arm/ext_comp.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_decomp=$$($(ARM_SIZE) -B --totals build/arm/ext_decomp.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_full=$$($(ARM_SIZE) -B --totals build/arm/ext_full.a | grep TOTALS | awk '{print $$1+$$2}'); \ + printf '%-34s %10d %12d %25d\n' "Tamp (C, extended, no stream)" $$size_comp $$size_decomp $$size_full + @size_comp=$$($(ARM_SIZE) -B --totals build/arm/ext_comp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_decomp=$$($(ARM_SIZE) -B --totals build/arm/ext_decomp_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + size_full=$$($(ARM_SIZE) -B --totals build/arm/ext_full_s.a | grep TOTALS | awk '{print $$1+$$2}'); \ + printf '%-34s %10d %12d %25d\n' "Tamp (C, extended)" $$size_comp $$size_decomp $$size_full + +c-size: c-size-no-extended c-size-extended binary-size: @echo "Binary sizes for armv6m (bytes):" @echo "" - @printf '%-27s %-10s %-12s %s\n' "" "Compressor" "Decompressor" "Compressor + Decompressor" - @printf '%-27s %-10s %-12s %s\n' "---------------------------" "----------" "------------" "-------------------------" - @output=$$($(MAKE) -s mpy-viper-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Viper) (requires mpy-cross)" - @output=$$($(MAKE) -s mpy-native-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Native) (requires MPY_DIR)" - @output=$$($(MAKE) -s c-size 2>&1) && echo "$$output" || echo "Tamp (C) (requires arm-none-eabi-gcc)" + @printf '%-34s %10s %12s %25s\n' "" "Compressor" "Decompressor" "Compressor + Decompressor" + @printf '%-34s %10s %12s %25s\n' 
"----------------------------------" "----------" "------------" "-------------------------" + @output=$$($(MAKE) -s mpy-viper-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Viper) (requires mpy-cross)" + @output=$$($(MAKE) -s mpy-native-size 2>&1) && echo "$$output" || echo "Tamp (MicroPython Native) (requires MPY_DIR)" + @output=$$($(MAKE) -s c-size 2>&1) && echo "$$output" || echo "Tamp (C) (requires arm-none-eabi-gcc)" ########## diff --git a/README.md b/README.md index 73fdf5b..2f338fc 100644 --- a/README.md +++ b/README.md @@ -34,11 +34,8 @@ of RAM and firmware storage. - `pip install tamp` will use a python-bound C implementation optimized for speed. - Micropython: - - Native Module (suggested micropython implementation). + - Native Module. - `mpy_bindings/` - - Viper. - - `tamp/__init__.py`, `tamp/compressor_viper.py`, - `tamp/decompressor_viper.py` - C library: - `tamp/_c_src/` - Javascript/Typescript via Emscripten WASM. @@ -56,14 +53,12 @@ of RAM and firmware storage. # Installation -Tamp contains 4 implementations: +Tamp contains 3 implementations: 1. A reference desktop CPython implementation that is optimized for readability (and **not** speed). 2. A Micropython Native Module implementation (fast). -3. A Micropython Viper implementation (not recommended, please use Native - Module). -4. A C implementation (with python bindings) for accelerated desktop use and to +3. A C implementation (with python bindings) for accelerated desktop use and to be used in C projects (very fast). This section instructs how to install each implementation. @@ -98,42 +93,6 @@ following to `pyproject.toml`. tamp = "https://github.com/BrianPugh/tamp/releases/download/v1.7.0/tamp-1.7.0-mpy1.23-armv6m.mpy" ``` -### MicroPython Viper - -**NOT RECOMMENDED, PLEASE USE NATIVE MODULE** - -For micropython use, there are 3 main files: - -1. `tamp/__init__.py` - Always required. -2. `tamp/decompressor_viper.py` - Required for on-device decompression. -3. 
`tamp/compressor_viper.py` - Required for on-device compression. - -For example, if on-device decompression isn't used, then do not include -`decompressor_viper.py`. If manually installing, just copy these files to your -microcontroller's `/lib/tamp` folder. - -If using -[mip](https://docs.micropython.org/en/latest/reference/packages.html#installing-packages-with-mip), -tamp can be installed by specifying the appropriate `package-*.json` file. - -```bash -mip install github:brianpugh/tamp # Defaults to package.json: Compressor & Decompressor -mip install github:brianpugh/tamp/package-compressor.json # Compressor only -mip install github:brianpugh/tamp/package-decompressor.json # Decompressor only -``` - -If using [Belay](https://github.com/BrianPugh/belay), tamp can be installed by -adding the following to `pyproject.toml`. - -```toml -[tool.belay.dependencies] -tamp = [ - "https://github.com/BrianPugh/tamp/blob/main/tamp/__init__.py", - "https://github.com/BrianPugh/tamp/blob/main/tamp/compressor_viper.py", - "https://github.com/BrianPugh/tamp/blob/main/tamp/decompressor_viper.py", -] -``` - ## C Copy the `tamp/_c_src/tamp` folder into your project. For more information, see @@ -258,31 +217,30 @@ input data sourced from the [Enwik8](https://mattmahoney.net/dc/textdata.html). This should give a general idea of how these algorithms perform over a variety of input data types. 
-| dataset | raw | tamp | tamp (LazyMatching) | zlib | heatshrink | -| --------------- | ----------- | -------------- | ------------------- | -------------- | ---------- | -| enwik8 | 100,000,000 | **51,635,633** | 51,252,113 | 56,205,166 | 56,110,394 | -| silesia/dickens | 10,192,446 | **5,546,761** | 5,511,604 | 6,049,169 | 6,155,768 | -| silesia/mozilla | 51,220,480 | 25,121,385 | 24,936,067 | **25,104,966** | 25,435,908 | -| silesia/mr | 9,970,564 | 5,027,032 | 4,886,272 | **4,864,734** | 5,442,180 | -| silesia/nci | 33,553,445 | 8,643,610 | 8,645,299 | **5,765,521** | 8,247,487 | -| silesia/ooffice | 6,152,192 | **3,814,938** | 3,798,261 | 4,077,277 | 3,994,589 | -| silesia/osdb | 10,085,684 | **8,520,835** | 8,506,443 | 8,625,159 | 8,747,527 | -| silesia/reymont | 6,627,202 | **2,847,981** | 2,820,870 | 2,897,661 | 2,910,251 | -| silesia/samba | 21,606,400 | 9,102,594 | 9,060,692 | **8,862,423** | 9,223,827 | -| silesia/sao | 7,251,944 | **6,137,755** | 6,101,744 | 6,506,417 | 6,400,926 | -| silesia/webster | 41,458,703 | **18,694,172** | 18,567,288 | 20,212,235 | 19,942,817 | -| silesia/x-ray | 8,474,240 | 7,510,606 | 7,405,814 | **7,351,750** | 8,059,723 | -| silesia/xml | 5,345,280 | 1,681,687 | 1,672,660 | **1,586,985** | 1,665,179 | - -Tamp usually out-performs heatshrink, and is generally very competitive with -zlib. While trying to be an apples-to-apples comparison, zlib still uses -significantly more memory during both compression and decompression (see next -section). Tamp accomplishes competitive performance while using around 10x less -memory. 
+| dataset | raw | tamp | tamp (LazyMatching) | zlib | heatshrink | +| --------------- | ----------- | ----------- | ------------------- | ------------- | ---------- | +| enwik8 | 100,000,000 | 51,017,102 | **50,626,118** | 56,205,166 | 56,110,394 | +| RPI_PICO (.uf2) | 667,648 | **289,204** | 290,442 | 303,763 | - | +| silesia/dickens | 10,192,446 | 5,538,353 | **5,502,834** | 6,049,169 | 6,155,768 | +| silesia/mozilla | 51,220,480 | 24,412,662 | **24,228,654** | 25,104,966 | 25,435,908 | +| silesia/mr | 9,970,564 | 4,519,402 | **4,393,009** | 4,864,734 | 5,442,180 | +| silesia/nci | 33,553,445 | 6,824,403 | 6,772,379 | **5,765,521** | 8,247,487 | +| silesia/ooffice | 6,152,192 | 3,773,089 | **3,755,153** | 4,077,277 | 3,994,589 | +| silesia/osdb | 10,085,684 | 8,466,875 | **8,464,328** | 8,625,159 | 8,747,527 | +| silesia/reymont | 6,627,202 | 2,818,554 | **2,788,774** | 2,897,661 | 2,910,251 | +| silesia/samba | 21,606,400 | 8,384,183 | **8,345,616** | 8,862,423 | 9,223,827 | +| silesia/sao | 7,251,944 | 6,136,077 | **6,100,061** | 6,506,417 | 6,400,926 | +| silesia/webster | 41,458,703 | 18,146,649 | **18,010,980** | 20,212,235 | 19,942,817 | +| silesia/x-ray | 8,474,240 | 7,509,449 | 7,404,794 | **7,351,750** | 8,059,723 | +| silesia/xml | 5,345,280 | 1,473,463 | **1,455,877** | 1,586,985 | 1,665,179 | + +Tamp outperforms both heatshrink and zlib on most datasets, winning 12 out of 14 +benchmarks. This is while using around 10x less memory than zlib during both +compression and decompression (see next section). Lazy Matching is a simple technique to improve compression ratios at the expense of CPU while requiring very little code. One can expect **50-75%** more CPU -usage for modest compression gains (around 0.5 - 2.0%). Because of this poor +usage for modest compression gains (around 0.5 - 2.0%). 
Because of this trade-off, it is disabled by default; however, in applications where we want to compress once on a powerful machine (like a desktop/server) and decompress on an embedded device, it may be worth it to spend a bit more compute. Lazy matched @@ -305,6 +263,33 @@ repeating data more efficiently. Given Tamp's excellent performance in most of the other data compression benchmark files, this is a good tradeoff for most real-world scenarios. +### Ablation Study + +The following table shows the effect of the `extended` and `lazy_matching` +compression parameters across all benchmark datasets (`window=10`, `literal=8`). + +| dataset | raw | Baseline | +lazy | +extended | +lazy +extended | +| --------------- | ----------- | ---------- | ------------------ | ------------------ | ------------------ | +| enwik8 | 100,000,000 | 51,635,633 | 51,252,694 (−0.7%) | 51,017,102 (−1.2%) | 50,626,118 (−2.0%) | +| RPI_PICO (.uf2) | 667,648 | 331,310 | 329,893 (−0.4%) | 289,204 (−12.7%) | 290,442 (−12.3%) | +| silesia/dickens | 10,192,446 | 5,546,761 | 5,511,681 (−0.6%) | 5,538,353 (−0.2%) | 5,502,834 (−0.8%) | +| silesia/mozilla | 51,220,480 | 25,121,385 | 24,937,036 (−0.7%) | 24,412,662 (−2.8%) | 24,228,654 (−3.6%) | +| silesia/mr | 9,970,564 | 5,027,032 | 4,888,930 (−2.7%) | 4,519,402 (−10.1%) | 4,393,009 (−12.6%) | +| silesia/nci | 33,553,445 | 8,643,610 | 8,645,399 (+0.0%) | 6,824,403 (−21.0%) | 6,772,379 (−21.6%) | +| silesia/ooffice | 6,152,192 | 3,814,938 | 3,798,393 (−0.4%) | 3,773,089 (−1.1%) | 3,755,153 (−1.6%) | +| silesia/osdb | 10,085,684 | 8,520,835 | 8,518,502 (−0.0%) | 8,466,875 (−0.6%) | 8,464,328 (−0.7%) | +| silesia/reymont | 6,627,202 | 2,847,981 | 2,820,948 (−0.9%) | 2,818,554 (−1.0%) | 2,788,774 (−2.1%) | +| silesia/samba | 21,606,400 | 9,102,594 | 9,061,143 (−0.5%) | 8,384,183 (−7.9%) | 8,345,616 (−8.3%) | +| silesia/sao | 7,251,944 | 6,137,755 | 6,101,747 (−0.6%) | 6,136,077 (−0.0%) | 6,100,061 (−0.6%) | +| silesia/webster | 41,458,703 | 18,694,172 
| 18,567,618 (−0.7%) | 18,146,649 (−2.9%) | 18,010,980 (−3.7%) | +| silesia/x-ray | 8,474,240 | 7,510,606 | 7,406,001 (−1.4%) | 7,509,449 (−0.0%) | 7,404,794 (−1.4%) | +| silesia/xml | 5,345,280 | 1,681,687 | 1,672,827 (−0.5%) | 1,473,463 (−12.4%) | 1,455,877 (−13.4%) | + +The `extended` parameter enables additional Huffman codes for longer pattern +matches, which significantly improves compression on datasets with many long +repeating patterns (e.g., nci, samba, xml). Extended support was added in +v2.0.0. + ## Memory Usage The following table shows approximately how much memory each algorithm uses @@ -331,7 +316,7 @@ on an M3 Macbook Air. | | Compression (s) | Decompression (s) | | ---------------------------- | --------------- | ----------------- | | Tamp (Pure Python Reference) | 136.2 | 105.0 | -| Tamp (C bindings) | 5.56 | 0.544 | +| Tamp (C bindings) | 5.45 | 0.544 | | ZLib | 3.65 | 0.578 | | Heatshrink (with index) | 4.42 | 0.67 | | Heatshrink (without index) | 27.40 | 0.67 | @@ -350,8 +335,7 @@ speed Tamp can achieve. In all tests, a 1KB window (10 bit) was used. | | Compression (bytes/s) | Decompression (bytes/s) | | -------------------------------- | --------------------- | ----------------------- | -| Tamp (MicroPython Viper) | 4,300 | 42,000 | -| Tamp (Micropython Native Module) | 31,192 | 1,086,957 | +| Tamp (Micropython Native Module) | 31,949 | 1,086,957 | | Tamp (C) | 36,127 | 1,400,600 | | Deflate (micropython builtin) | 6,885 | 294,985 | @@ -365,19 +349,20 @@ compiled for the Pi Pico (`armv6m`). All libraries were compiled with `-O3`. Numbers reported in bytes. Tamp sizes were measured using `arm-none-eabi-gcc` 15.2.1 and MicroPython v1.27, and can be regenerated with `make binary-size`. 
-| | Compressor | Decompressor | Compressor + Decompressor | -| ------------------------- | ---------- | ------------ | ------------------------- | -| Tamp (MicroPython Viper) | 4676 | 4372 | 7917 | -| Tamp (MicroPython Native) | 3896 | 3559 | 6616 | -| Tamp (C, -DTAMP_STREAM=0) | 2028 | 1992 | 3900 | -| Tamp (C) | 2472 | 2444 | 4796 | -| Heatshrink (C) | 2956 | 3876 | 6832 | -| uzlib (C) | 2355 | 3963 | 6318 | - -Tamp C includes a high-level stream API by default. Even with `-DTAMP_STREAM=0`, -Tamp includes buffer-looping functions (like `tamp_compressor_compress`) that -Heatshrink lacks (Heatshrink only provides poll/sink primitives). In an -apples-to-apples comparison, Tamp would be even smaller. +| | Compressor | Decompressor | Compressor + Decompressor | +| -------------------------------- | ---------- | ------------ | ------------------------- | +| Tamp (MicroPython Native) | 4708 | 4339 | 8124 | +| Tamp (C, no extended, no stream) | 1466 | 1312 | 2592 | +| Tamp (C, no extended) | 1748 | 1550 | 3112 | +| Tamp (C, extended, no stream) | 2558 | 2072 | 4444 | +| Tamp (C, extended) | 2840 | 2310 | 4964 | +| Heatshrink (C) | 2956 | 3876 | 6832 | +| uzlib (C) | 2355 | 3963 | 6318 | + +Tamp C "extended" includes `tamp_compressor_compress_and_flush`. Tamp C includes +a high-level stream API by default. Even with `no stream`, Tamp includes +buffer-looping functions (like `tamp_compressor_compress`) that Heatshrink lacks +(Heatshrink only provides poll/sink primitives). 
## Acknowledgement diff --git a/datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp b/datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp new file mode 100644 index 0000000..1232a50 --- /dev/null +++ b/datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb7efbdc484cbf400817074910275c3f2a89aec0ab96c8984fd58423e5e9290 +size 291036 diff --git a/datasets/extended-compressed/dickens.tamp b/datasets/extended-compressed/dickens.tamp new file mode 100644 index 0000000..a1ed82b --- /dev/null +++ b/datasets/extended-compressed/dickens.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db20dcfa7e76829a809a4c9d253f1b4e53b294e86db789490bc4fadb19ab5dc0 +size 5538332 diff --git a/datasets/extended-compressed/enwik8.tamp b/datasets/extended-compressed/enwik8.tamp new file mode 100644 index 0000000..edf7491 --- /dev/null +++ b/datasets/extended-compressed/enwik8.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24f3e70e49e7344ebbe3ab23d274910f7aff5bb1fb1212658b1f136a99d244f4 +size 51019055 diff --git a/datasets/extended-compressed/mozilla.tamp b/datasets/extended-compressed/mozilla.tamp new file mode 100644 index 0000000..b1ee2c5 --- /dev/null +++ b/datasets/extended-compressed/mozilla.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cedba7ad7710757f3c5e7bf8176fd92eede9b8e5be2e8e697f9a6dc15d45718 +size 24415401 diff --git a/datasets/extended-compressed/mr.tamp b/datasets/extended-compressed/mr.tamp new file mode 100644 index 0000000..404f417 --- /dev/null +++ b/datasets/extended-compressed/mr.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a619cf3440c067f6bd5eebcf6b2145d583feca83067244c0c2585aecb4b3cae +size 4519929 diff --git a/datasets/extended-compressed/nci.tamp b/datasets/extended-compressed/nci.tamp new file mode 100644 index 0000000..085b3bf --- /dev/null 
+++ b/datasets/extended-compressed/nci.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adf1eac94e48d44092a9e64a7e8d14e1452b357f176e2aba7ed26eb2b7340946 +size 6855616 diff --git a/datasets/extended-compressed/ooffice.tamp b/datasets/extended-compressed/ooffice.tamp new file mode 100644 index 0000000..d824344 --- /dev/null +++ b/datasets/extended-compressed/ooffice.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9376fe3e9f0286c1edbfafeadb1a8d997dd8524a32dbce7438636f270d61789 +size 3773219 diff --git a/datasets/extended-compressed/osdb.tamp b/datasets/extended-compressed/osdb.tamp new file mode 100644 index 0000000..5dac28b --- /dev/null +++ b/datasets/extended-compressed/osdb.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b3188714fc3361691b684fecbff75b2d2cb9e6e690887aec456469d7505a586 +size 8466736 diff --git a/datasets/extended-compressed/reymont.tamp b/datasets/extended-compressed/reymont.tamp new file mode 100644 index 0000000..1407234 --- /dev/null +++ b/datasets/extended-compressed/reymont.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e45bf12f0ca5971b47cfca38b2cc47216c93e13915b9b3ac19aa4195b9e87d5 +size 2818601 diff --git a/datasets/extended-compressed/samba.tamp b/datasets/extended-compressed/samba.tamp new file mode 100644 index 0000000..259d6cc --- /dev/null +++ b/datasets/extended-compressed/samba.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac74af80542ad88dd617b95047e2a77e820cfcac3ef17abb8904949b87fd605f +size 8386303 diff --git a/datasets/extended-compressed/sao.tamp b/datasets/extended-compressed/sao.tamp new file mode 100644 index 0000000..46c39e2 --- /dev/null +++ b/datasets/extended-compressed/sao.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c05ac1c7d78b04874f07e10265cd254ecf9d6dcf1a3f0d1ea695815509ff0b1 +size 6136077 diff --git 
a/datasets/extended-compressed/webster.tamp b/datasets/extended-compressed/webster.tamp new file mode 100644 index 0000000..6c6835a --- /dev/null +++ b/datasets/extended-compressed/webster.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1227df26234f9c9cf53d9be82d6a38ecd344db70bd6b25109a6e5ae1d4c1673f +size 18146647 diff --git a/datasets/extended-compressed/x-ray.tamp b/datasets/extended-compressed/x-ray.tamp new file mode 100644 index 0000000..981eb1d --- /dev/null +++ b/datasets/extended-compressed/x-ray.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ba0c1fb79addae24888c12a466e84b73c32ca608836c458487226d224a63fc3 +size 7509449 diff --git a/datasets/extended-compressed/xml.tamp b/datasets/extended-compressed/xml.tamp new file mode 100644 index 0000000..67e6882 --- /dev/null +++ b/datasets/extended-compressed/xml.tamp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6adb3788d5644d28c56ea785ea4149a6f0b8ba5562c4f8dcc4a4ba8371920e97 +size 1473552 diff --git a/docs/source/c_library.rst b/docs/source/c_library.rst index dadf934..42e950a 100644 --- a/docs/source/c_library.rst +++ b/docs/source/c_library.rst @@ -5,6 +5,61 @@ C Library Tamp provides a C library optimized for low-memory-usage, fast runtime, and small binary footprint. This page describes how to use the provided library. +Compile-Time Flags +^^^^^^^^^^^^^^^^^^ +Tamp's C library can be customized via compile-time flags to control features, code size, and performance. +Pass these flags to your compiler (e.g., ``-DTAMP_STREAM=0``). 
+ ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| Flag | Default | Description | ++==================================+===================+==============================================================================+ +| ``TAMP_EXTENDED`` | ``1`` | Default value for extended format support (RLE, extended match encoding). | +| | | Set to ``0`` to disable extended support in both compressor and decompressor.| ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_EXTENDED_COMPRESS`` | ``TAMP_EXTENDED`` | Enable extended format compression. Defaults to ``TAMP_EXTENDED`` but can | +| | | be individually overridden for compressor-only or decompressor-only builds. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_EXTENDED_DECOMPRESS`` | ``TAMP_EXTENDED`` | Enable extended format decompression. Defaults to ``TAMP_EXTENDED`` but can | +| | | be individually overridden for compressor-only or decompressor-only builds. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_LAZY_MATCHING`` | ``0`` | Enable lazy matching support. When enabled, ``TampConf.lazy_matching`` | +| | | becomes available. Improves compression ratio by 0.5-2% at the cost of | +| | | 50-75% slower compression. Most embedded systems should leave disabled. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM`` | ``1`` | Include stream API (``tamp_compress_stream``, ``tamp_decompress_stream``). | +| | | Disable with ``-DTAMP_STREAM=0`` to save ~2.8KB if only using low-level API. 
| ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_WORK_BUFFER_SIZE`` | ``32`` | Stack-allocated work buffer size (bytes) for stream API. Split in half | +| | | for input/output. Larger values reduce I/O callback invocations, | +| | | improving decompression speed. 256+ bytes recommended when stack permits. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_STDIO`` | ``0`` | Enable stdio (``FILE*``) stream handlers. Works with standard C library, | +| | | ESP-IDF VFS, and POSIX-compatible systems. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_MEMORY`` | ``0`` | Enable memory buffer stream handlers (``TampMemReader``, ``TampMemWriter``). | +| | | Useful for file-to-memory or memory-to-file operations. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_LITTLEFS`` | ``0`` | Enable LittleFS stream handlers. Requires LittleFS headers. | ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_STREAM_FATFS`` | ``0`` | Enable FatFs (ChaN's FAT filesystem) stream handlers. Requires FatFs headers.| ++----------------------------------+-------------------+------------------------------------------------------------------------------+ +| ``TAMP_ESP32`` | ``0`` | Use ESP32-optimized variant. Avoids bitfields for speed at the cost of | +| | | slightly higher memory usage. Automatically enabled via Kconfig on ESP-IDF. 
| ++----------------------------------+-------------------+------------------------------------------------------------------------------+ + +**Example: Minimal decompressor-only build** + +.. code-block:: bash + + gcc -DTAMP_EXTENDED_COMPRESS=0 -DTAMP_STREAM=0 -c decompressor.c common.c + +**Example: Full-featured build with LittleFS support** + +.. code-block:: bash + + gcc -DTAMP_LAZY_MATCHING=1 -DTAMP_STREAM_LITTLEFS=1 -DTAMP_STREAM_WORK_BUFFER_SIZE=256 \ + -c compressor.c decompressor.c common.c + Overview ^^^^^^^^ To use Tamp in your C project, simply copy the contents of ``tamp/_c_src`` into your project. diff --git a/docs/source/javascript.rst b/docs/source/javascript.rst index 39bbe21..6972bef 100644 --- a/docs/source/javascript.rst +++ b/docs/source/javascript.rst @@ -51,6 +51,12 @@ Customize compression behavior with options: // For general use, 8 (the whole byte) is appropriate. literal: 7, + // Enable extended format (RLE, extended match) for better compression ratios. + // The extended format provides better compression for typical data at the + // cost of slightly more complex encoding. + // Default: true + extended: true, + // Enable lazy matching to slightly improve compression (0.5-2.0%) ratios // at the cost of 50-75% slower compression. // Most embedded systems will **not** want to use this feature and disable it. @@ -136,6 +142,7 @@ Configure compression parameters by passing in options: const options = { window: 12, // Larger window for (usually) better compression literal: 7, // ASCII text only requires 7 bits. + extended: true, // Enable extended format (RLE, extended match) lazy_matching: true // Better compression ratios; slower to compress }; diff --git a/docs/source/specification.rst b/docs/source/specification.rst index f612893..31bb102 100644 --- a/docs/source/specification.rst +++ b/docs/source/specification.rst @@ -26,7 +26,8 @@ The bit-location 0 is equivalent to typical MSb position 7 of the first byte. 
| [2] | custom_dictionary | A custom dictionary initialization method was used | | | | and must be provided at decompression. | +---------+-------------------+---------------------------------------------------------------------+ -| [1] | reserved | Reserved for future use. Must be 0. | +| [1] | extended | Enables extended format features (RLE, extended match encoding). | +| | | Generally improves compression, introduced in tamp v2.0.0. | +---------+-------------------+---------------------------------------------------------------------+ | [0] | more_header | If ``True``, then the next byte in the stream is more header data. | | | | Currently always ``False``, but allows for future expandability. | @@ -60,8 +61,9 @@ Modifications are made to make the implementation simpler/faster. and points at the offset from the beginning of the dictionary buffer to the pattern. The shortest pattern-length is either going to be 2 or 3 bytes, depending on ``window`` and ``literal`` parameters. The shortest pattern-length encoding must be shorter than - an equivalent stream of literals. The longest pattern-length will the minimum - pattern-length plus 13. + an equivalent stream of literals. In the basic (non-extended) format, the longest + pattern-length is the minimum pattern-length plus 13. When the ``extended`` flag + is set, longer matches are possible via extended match encoding. Classically, the ``offset`` is from the current position in the buffer. Doing so results in the ``offset`` distribution slightly favoring smaller numbers. Intuitively, it makes @@ -167,6 +169,87 @@ The maximum match-size is more likely than the second-highest match-size because For any given huffman coding schema, a equivalent coding can be obtained by inverting all the bits (reflecting the huffman tree). The single-bit, most common code ``0b0`` representing a pattern-size 2 is intentionally represented as ``0b0`` instead of ``0b1``. 
This makes the MSb of all other codes be 1, simplifying the decoding procedure because the number of bits read doesn't strictly have to be recorded. +Extended Format (v2.0.0+) +^^^^^^^^^^^^^^^^^^^^^^^^^ +When the ``extended`` header bit is set, two additional token types are available: +RLE (Run-Length Encoding) and Extended Match. These use Huffman symbols 12 and 13 +respectively, which in the basic format would represent match sizes ``min_pattern_size + 12`` +and ``min_pattern_size + 13``. + +Extended Huffman Encoding +------------------------- +Both RLE and Extended Match use a secondary Huffman encoding to represent their payload values. +This encoding combines a Huffman code (without the literal flag) with trailing bits: + +1. Read the Huffman symbol (12 for RLE, 13 for Extended Match) with the literal flag (``0b0``). +2. Decode an additional Huffman code (reusing the same table, but without the leading literal flag bit). +3. Read trailing bits (4 bits for RLE, 3 bits for Extended Match). +4. Combine: ``value = (huffman_index << num_trailing_bits) + trailing_bits_value``, where ``num_trailing_bits`` is 4 for RLE and 3 for Extended Match. + +RLE Token (Symbol 12) +--------------------- +RLE encodes runs of repeated bytes efficiently. The repeated byte is implicitly +the last byte written to the window buffer. If no bytes have been written yet +(i.e., ``window_pos == 0``), the byte at position ``window_size - 1`` of the +initial dictionary is used. + +Format: ``0b0 | huffman_code[12] | extended_huffman(count - 2, trailing=4)`` + +Where: + +- ``huffman_code[12]`` = ``0xAA`` (9 bits including literal flag) +- ``extended_huffman`` encodes ``count - 2`` with 4 trailing bits +- ``count`` ranges from 2 to 225: ``(13 << 4) + 15 + 2 = 225`` + +Window update: Only the first 8 bytes are written to the dictionary (no wrap-around). +If fewer than 8 bytes remain before the end of the window buffer, only those bytes +are written. This bounds the window update cost while still allowing the decompressor +to find subsequent pattern matches. + +.. 
code-block:: text + + RLE Token Structure: + +---+------------+-------------------+----------------+ + | 0 | huffman[12]| huffman(cnt>>4) | cnt & 0xF | + +---+------------+-------------------+----------------+ + |1b | 8 bits | 1-8 bits | 4 bits | + +---+------------+-------------------+----------------+ + + Where cnt = count - 2 + +Extended Match Token (Symbol 13) +-------------------------------- +Extended Match allows pattern matches longer than the basic format's maximum of +``min_pattern_size + 13``. It is used when a match exceeds ``min_pattern_size + 11``. + +Format: ``0b0 | huffman_code[13] | extended_huffman(size - min_pattern_size - 12, trailing=3) | offset`` + +Where: + +- ``huffman_code[13]`` = ``0x27`` (7 bits including literal flag) +- ``extended_huffman`` encodes ``size - min_pattern_size - 12`` with 3 trailing bits +- ``offset`` is ``window`` bits, pointing to the start of the pattern +- Maximum extra size: ``(13 << 3) + 7 + 1 = 112`` +- Maximum total match size: ``min_pattern_size + 11 + 112 = min_pattern_size + 123`` + +The ``-12`` offset ensures extended matches start at ``min_pattern_size + 12``, leaving +symbols 0-11 for basic matches (0-11 maps to ``min_pattern_size`` through ``min_pattern_size + 11``). + +Window constraints: The source pattern cannot span past the window buffer boundary; +the compressor terminates extended matches early if they would cross this boundary. +Similarly, destination writes do not wrap-around; only bytes up to the end of the +window buffer are written. This simplifies implementation while having minimal +impact on compression ratio (approximately 0.02% loss). + +.. 
code-block:: text + + Extended Match Token Structure: + +---+------------+-------------------+----------------+--------+ + | 0 | huffman[13]| huffman(sz>>3) | sz & 0x7 | offset | + +---+------------+-------------------+----------------+--------+ + |1b | 6 bits | 1-8 bits | 3 bits | window | + +---+------------+-------------------+----------------+--------+ + + Where sz = match_size - min_pattern_size - 12 + Flush Symbol ------------ A special FLUSH symbol is encoded as the least likely Huffman code. diff --git a/espidf/tamp/compressor_esp32.cpp b/espidf/tamp/compressor_esp32.cpp index afa0c9b..a17570c 100644 --- a/espidf/tamp/compressor_esp32.cpp +++ b/espidf/tamp/compressor_esp32.cpp @@ -27,7 +27,7 @@ typedef uint32_t u16; #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX_PATTERN_SIZE (compressor->min_pattern_size + 13) -#define WINDOW_SIZE (1 << compressor->conf_window) +#define WINDOW_SIZE (1 << compressor->conf.window) static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, u8 n_bits) { compressor->bit_buffer_pos += n_bits; diff --git a/mpy_bindings/bindings.c b/mpy_bindings/bindings.c index d75d559..0cdf9d0 100644 --- a/mpy_bindings/bindings.c +++ b/mpy_bindings/bindings.c @@ -9,13 +9,14 @@ **********/ #include "tamp/common.h" -#define CHUNK_SIZE 32 // Must be <= 65535 +#define CHUNK_SIZE 32 // Must be >= 32 and <= 65535 +_Static_assert(CHUNK_SIZE >= 32, "CHUNK_SIZE must be >= 32 to hold flush output"); #define mp_type_bytearray (*(mp_obj_type_t *)(mp_load_global(MP_QSTR_bytearray))) static void TAMP_CHECK(tamp_res res) { - if (res == TAMP_EXCESS_BITS) { + if (TAMP_UNLIKELY(res == TAMP_EXCESS_BITS)) { nlr_raise(mp_obj_new_exception(mp_load_global(MP_QSTR_ExcessBitsError))); - } else if (res < TAMP_OK) { + } else if (TAMP_UNLIKELY(res < TAMP_OK)) { mp_raise_ValueError(""); } } @@ -67,6 +68,7 @@ static mp_obj_t compressor_make_new(const mp_obj_type_t *type, size_t n_args, si .window = mp_obj_get_int(args_in[1]), .literal = 
mp_obj_get_int(args_in[2]), .use_custom_dictionary = mp_obj_get_int(args_in[4]), + .extended = mp_obj_get_int(args_in[5]), }; mp_obj_compressor_t *o = mp_obj_malloc(mp_obj_compressor_t, type); @@ -75,7 +77,7 @@ static mp_obj_t compressor_make_new(const mp_obj_type_t *type, size_t n_args, si mp_buffer_info_t dictionary_buffer_info; mp_get_buffer_raise(o->dictionary, &dictionary_buffer_info, MP_BUFFER_RW); - if (dictionary_buffer_info.len < (1 << conf.window)) { + if (TAMP_UNLIKELY(dictionary_buffer_info.len < (1 << conf.window))) { mp_raise_ValueError(""); } @@ -175,7 +177,7 @@ static mp_obj_t decompressor_make_new(const mp_obj_type_t *type, size_t n_args, const uint16_t window_size = 1 << conf.window; if (o->dictionary == mp_const_none) { - if (conf.use_custom_dictionary) { + if (TAMP_UNLIKELY(conf.use_custom_dictionary)) { mp_raise_ValueError(""); } o->dictionary = mp_obj_new_bytearray_by_ref(window_size, m_malloc(window_size)); @@ -184,7 +186,7 @@ static mp_obj_t decompressor_make_new(const mp_obj_type_t *type, size_t n_args, { mp_buffer_info_t dictionary_buffer_info; mp_get_buffer_raise(o->dictionary, &dictionary_buffer_info, MP_BUFFER_RW); - if (dictionary_buffer_info.len < window_size) { + if (TAMP_UNLIKELY(dictionary_buffer_info.len < window_size)) { mp_raise_ValueError(""); } diff --git a/mpy_bindings/bindings_compressor.py b/mpy_bindings/bindings_compressor.py index 414ae6a..9fcbb81 100644 --- a/mpy_bindings/bindings_compressor.py +++ b/mpy_bindings/bindings_compressor.py @@ -9,6 +9,7 @@ def __init__( window=10, literal=8, dictionary=None, + extended=True, ): self._cf = False # shorter name to save binary space if not hasattr(f, "write"): # It's probably a path-like object. 
@@ -18,7 +19,7 @@ def __init__( custom = dictionary is not None if not dictionary: dictionary = bytearray(1 << window) - self._c = _C(f, window, literal, dictionary, custom) + self._c = _C(f, window, literal, dictionary, custom, extended) self.write = self._c.write diff --git a/pyproject.toml b/pyproject.toml index cf5b40e..d2b2b71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,9 @@ unittest = [ ] uprofiler = "https://github.com/BrianPugh/micropython-libs/blob/main/lib/uprofiler.py" +[tool.pytest.ini_options] +testpaths = ["tests"] + [tool.coverage.run] branch = true omit = [ diff --git a/tamp/_c_compressor.pyx b/tamp/_c_compressor.pyx index a3e403d..9c19f0b 100644 --- a/tamp/_c_compressor.pyx +++ b/tamp/_c_compressor.pyx @@ -35,6 +35,7 @@ cdef class Compressor: int literal=8, dictionary=None, bool lazy_matching=False, + bool extended=True, ): cdef ctamp.TampConf conf @@ -55,6 +56,7 @@ cdef class Compressor: # Set lazy_matching - this field is conditionally compiled based on TAMP_LAZY_MATCHING # The build system defines this macro, so the field should be available conf.lazy_matching = lazy_matching + conf.extended = extended self._window_buffer = dictionary if dictionary else bytearray(1 << window) self._window_buffer_ptr = self._window_buffer @@ -102,7 +104,7 @@ cdef class Compressor: cpdef int flush(self, bool write_token = True) except -1: cdef ctamp.tamp_res res - cdef bytearray buffer = bytearray(24) + cdef bytearray buffer = bytearray(32) cdef size_t output_written_size = 0 res = ctamp.tamp_compressor_flush( diff --git a/tamp/_c_src/tamp/common.c b/tamp/_c_src/tamp/common.c index f88dd34..be0099a 100644 --- a/tamp/_c_src/tamp/common.c +++ b/tamp/_c_src/tamp/common.c @@ -24,7 +24,7 @@ static inline uint32_t xorshift32(uint32_t *state) { return x; } -void tamp_initialize_dictionary(unsigned char *buffer, size_t size) { +TAMP_OPTIMIZE_SIZE void tamp_initialize_dictionary(unsigned char *buffer, size_t size) { uint32_t seed = 3758097560; // This 
was experimentally discovered with tools/find_seed.py uint32_t randbuf = 0; for (size_t i = 0; i < size; i++) { @@ -34,10 +34,40 @@ void tamp_initialize_dictionary(unsigned char *buffer, size_t size) { } } -int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal) { +TAMP_OPTIMIZE_SIZE int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal) { return 2 + (window > (10 + ((literal - 5) << 1))); } +void tamp_window_copy(unsigned char *window, uint16_t *window_pos, uint16_t window_offset, uint8_t match_size, + uint16_t window_mask) { + /* Calculate distance from source to destination in circular buffer. + * src_to_dst = (dst - src) & mask gives the forward distance. */ + const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask; + + /* Critical overlap case: destination is AHEAD of source and they overlap. + * When dst > src by less than match_size, a forward copy corrupts data because + * we write to positions before reading from them. + * + * Example: src=100, dst=105, match_size=8 + * - Forward copy at i=5 would read window[105], but we already overwrote it at i=0! + * - Must copy in REVERSE order (end to start) to read source bytes before overwriting. + */ + if (TAMP_UNLIKELY(src_to_dst < match_size && src_to_dst > 0)) { + /* Copy in reverse order: start from last byte, work backwards to first byte. + * This ensures we read all overlapping source bytes before they're overwritten. + * Destination wraps via mask; source doesn't need wrapping (pre-validated bounds). 
*/ + for (uint8_t i = match_size; i-- > 0;) { + window[(*window_pos + i) & window_mask] = window[window_offset + i]; + } + *window_pos = (*window_pos + match_size) & window_mask; + } else { + for (uint8_t i = 0; i < match_size; i++) { + window[*window_pos] = window[window_offset + i]; + *window_pos = (*window_pos + 1) & window_mask; + } + } +} + /******************************************************************************* * Built-in I/O handler implementations ******************************************************************************/ diff --git a/tamp/_c_src/tamp/common.h b/tamp/_c_src/tamp/common.h index 0b2b8e4..11006f4 100644 --- a/tamp/_c_src/tamp/common.h +++ b/tamp/_c_src/tamp/common.h @@ -39,15 +39,53 @@ extern "C" { #define TAMP_UNLIKELY(c) (c) #endif +/* Per-function optimize attributes and #pragma GCC push/pop_options require + * GCC on a target that supports them. Xtensa GCC does not. */ +#if defined(__GNUC__) && !defined(__clang__) && !defined(__XTENSA__) +#define TAMP_HAS_GCC_OPTIMIZE 1 +#else +#define TAMP_HAS_GCC_OPTIMIZE 0 +#endif + #if defined(_MSC_VER) #define TAMP_ALWAYS_INLINE __forceinline #define TAMP_NOINLINE __declspec(noinline) -#elif defined(__GNUC__) || defined(__clang__) +#define TAMP_OPTIMIZE_SIZE /* not supported */ +#elif defined(__GNUC__) && !defined(__clang__) +#define TAMP_ALWAYS_INLINE inline __attribute__((always_inline)) +#define TAMP_NOINLINE __attribute__((noinline)) +#if TAMP_HAS_GCC_OPTIMIZE +#define TAMP_OPTIMIZE_SIZE __attribute__((optimize("Os"))) +#else +#define TAMP_OPTIMIZE_SIZE +#endif +#elif defined(__clang__) #define TAMP_ALWAYS_INLINE inline __attribute__((always_inline)) #define TAMP_NOINLINE __attribute__((noinline)) +#define TAMP_OPTIMIZE_SIZE /* clang doesn't support per-function optimize */ #else #define TAMP_ALWAYS_INLINE inline #define TAMP_NOINLINE +#define TAMP_OPTIMIZE_SIZE +#endif + +/* TAMP_USE_MEMSET: Use libc memset (default: 1). + * Set to 0 for environments without libc (e.g. 
MicroPython native modules). + * When disabled, uses a volatile loop that prevents GCC from emitting a + * memset call at the cost of inhibiting store coalescing. */ +#ifndef TAMP_USE_MEMSET +#define TAMP_USE_MEMSET 1 +#endif + +#if TAMP_USE_MEMSET +#include <string.h> +#define TAMP_MEMSET(dst, val, n) memset((dst), (val), (n)) +#else +#define TAMP_MEMSET(dst, val, n) \ + do { \ + volatile unsigned char *_tamp_p = (volatile unsigned char *)(dst); \ + for (size_t _tamp_i = 0; _tamp_i < (n); _tamp_i++) _tamp_p[_tamp_i] = (val); \ + } while (0) #endif /* Include stream API (tamp_compress_stream, tamp_decompress_stream). @@ -68,6 +106,32 @@ extern "C" { #define TAMP_STREAM_WORK_BUFFER_SIZE 32 #endif +/* Extended format support (RLE, extended match). + * Enabled by default. Disable to save code size on minimal builds. + * + * TAMP_EXTENDED is the master switch (default: 1). + * TAMP_EXTENDED_COMPRESS and TAMP_EXTENDED_DECOMPRESS default to TAMP_EXTENDED, + * but can be individually overridden for compressor-only or decompressor-only builds. + */ +#ifndef TAMP_EXTENDED +#define TAMP_EXTENDED 1 +#endif +#ifndef TAMP_EXTENDED_DECOMPRESS +#define TAMP_EXTENDED_DECOMPRESS TAMP_EXTENDED +#endif +#ifndef TAMP_EXTENDED_COMPRESS +#define TAMP_EXTENDED_COMPRESS TAMP_EXTENDED +#endif + +/* Extended encoding constants */ +#if TAMP_EXTENDED_DECOMPRESS || TAMP_EXTENDED_COMPRESS +#define TAMP_RLE_SYMBOL 12 +#define TAMP_EXTENDED_MATCH_SYMBOL 13 +#define TAMP_LEADING_EXTENDED_MATCH_BITS 3 +#define TAMP_LEADING_RLE_BITS 4 +#define TAMP_RLE_MAX_WINDOW 8 +#endif + enum { /* Normal/Recoverable status >= 0 */ TAMP_OK = 0, @@ -93,6 +157,7 @@ typedef struct TampConf { uint16_t window : 4; // number of window bits uint16_t literal : 4; // number of literal bits uint16_t use_custom_dictionary : 1; // Use a custom initialized dictionary. + uint16_t extended : 1; // Extended format (RLE, extended match). Read from header bit [1]. 
#if TAMP_LAZY_MATCHING uint16_t lazy_matching : 1; // use Lazy Matching (spend 50-75% more CPU for around 0.5-2.0% better compression.) // only effects compression operations. @@ -297,6 +362,26 @@ void tamp_initialize_dictionary(unsigned char *buffer, size_t size); */ int8_t tamp_compute_min_pattern_size(uint8_t window, uint8_t literal); +/** + * @brief Copy pattern from window to window, updating window_pos. + * + * Handles potential overlap between source and destination regions by + * copying backwards when the destination would "catch up" to the source. + * + * IMPORTANT: Caller must validate that (window_offset + match_size) does not + * exceed window bounds before calling this function. This function assumes + * window_offset and match_size are pre-validated and does not perform + * bounds checking on source reads. + * + * @param window Circular buffer (size must be power of 2) + * @param window_pos Current write position (updated by this function) + * @param window_offset Source position to copy from + * @param match_size Number of bytes to copy + * @param window_mask Bitmask for wrapping (window_size - 1) + */ +void tamp_window_copy(unsigned char *window, uint16_t *window_pos, uint16_t window_offset, uint8_t match_size, + uint16_t window_mask); + #ifdef __cplusplus } #endif diff --git a/tamp/_c_src/tamp/compressor.c b/tamp/_c_src/tamp/compressor.c index 0e06100..97760e2 100644 --- a/tamp/_c_src/tamp/compressor.c +++ b/tamp/_c_src/tamp/compressor.c @@ -9,21 +9,42 @@ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)])) +#if TAMP_EXTENDED_COMPRESS +// Extended max pattern: min_pattern_size + 11 + 112 = min_pattern_size + 123 +#define MAX_PATTERN_SIZE_EXTENDED (compressor->min_pattern_size + 123) +#define MAX_PATTERN_SIZE (compressor->conf.extended ? 
MAX_PATTERN_SIZE_EXTENDED : (compressor->min_pattern_size + 13)) +#else #define MAX_PATTERN_SIZE (compressor->min_pattern_size + 13) -#define WINDOW_SIZE (1 << compressor->conf_window) +#endif +#define WINDOW_SIZE (1 << compressor->conf.window) // 0xF because sizeof(TampCompressor.input) == 16; #define input_add(offset) ((compressor->input_pos + offset) & 0xF) #define read_input(offset) (compressor->input[input_add(offset)]) -#define IS_LITERAL_FLAG (1 << compressor->conf_literal) +#define IS_LITERAL_FLAG (1 << compressor->conf.literal) #define FLUSH_CODE (0xAB) +// Internal return value for poll_extended_handling: signals caller to +// proceed with normal pattern matching rather than returning immediately. +#define TAMP_POLL_CONTINUE ((tamp_res)127) + // encodes [min_pattern_bytes, min_pattern_bytes + 13] pattern lengths static const uint8_t huffman_codes[] = {0x0, 0x3, 0x8, 0xb, 0x14, 0x24, 0x26, 0x2b, 0x4b, 0x54, 0x94, 0x95, 0xaa, 0x27}; // These bit lengths pre-add the 1 bit for the 0-value is_literal flag. static const uint8_t huffman_bits[] = {0x2, 0x3, 0x5, 0x5, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0x7}; -static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits, uint8_t n_bits) { +#if TAMP_EXTENDED_COMPRESS +#define RLE_MAX_COUNT ((13 << 4) + 15 + 2) // 225 +#define EXTENDED_MATCH_MAX_EXTRA ((13 << 3) + 7 + 1) // 112 + +// Minimum output buffer space required for extended match token. +// Extended match: symbol (7 bits) + extended huffman (11 bits) + window pos (15 bits) = 33 bits. +// With 7 bits in bit buffer, need up to 40 bits = 5 bytes. Add 1 byte margin. +// Pre-checking prevents OUTPUT_FULL mid-token, which would corrupt bit_buffer on retry. 
+#define EXTENDED_MATCH_MIN_OUTPUT_BYTES 6 +#endif + +static TAMP_NOINLINE void write_to_bit_buffer(TampCompressor* compressor, uint32_t bits, uint8_t n_bits) { compressor->bit_buffer_pos += n_bits; compressor->bit_buffer |= bits << (32 - compressor->bit_buffer_pos); } @@ -31,18 +52,27 @@ static inline void write_to_bit_buffer(TampCompressor *compressor, uint32_t bits /** * @brief Partially flush the internal bit buffer. * - * Up to 7 bits may remain in the internal bit buffer. + * Flushes complete bytes from the bit buffer. Up to 7 bits may remain. + * + * @param[in,out] compressor Compressor state. + * @param[in,out] output Output buffer pointer (updated on return). + * @param[in,out] output_size Available space (updated on return). + * @param[in,out] output_written_size Bytes written (accumulated). + * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small. */ -static inline tamp_res partial_flush(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size) { - for (*output_written_size = output_size; compressor->bit_buffer_pos >= 8 && output_size; - output_size--, compressor->bit_buffer_pos -= 8, compressor->bit_buffer <<= 8) - *output++ = compressor->bit_buffer >> 24; - *output_written_size -= output_size; +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE tamp_res partial_flush(TampCompressor* compressor, unsigned char** output, + size_t* output_size, size_t* output_written_size) { + while (compressor->bit_buffer_pos >= 8 && *output_size) { + *(*output)++ = compressor->bit_buffer >> 24; + (*output_size)--; + (*output_written_size)++; + compressor->bit_buffer_pos -= 8; + compressor->bit_buffer <<= 8; + } return (compressor->bit_buffer_pos >= 8) ? 
TAMP_OUTPUT_FULL : TAMP_OK; } -inline bool tamp_compressor_full(const TampCompressor *compressor) { +inline bool tamp_compressor_full(const TampCompressor* compressor) { return compressor->input_size == sizeof(compressor->input); } @@ -63,7 +93,7 @@ inline bool tamp_compressor_full(const TampCompressor *compressor) { */ #if TAMP_ESP32 -extern void find_best_match(TampCompressor *compressor, uint16_t *match_index, uint8_t *match_size); +extern void find_best_match(TampCompressor* compressor, uint16_t* match_index, uint8_t* match_size); #elif (defined(__x86_64__) || defined(__aarch64__) || defined(_M_X64) || defined(_M_ARM64)) && !TAMP_USE_EMBEDDED_MATCH #include "compressor_find_match_desktop.c" @@ -78,7 +108,7 @@ extern void find_best_match(TampCompressor *compressor, uint16_t *match_index, u * @param[out] match_index If match_size is 0, this value is undefined. * @param[out] match_size Size of best found match. */ -static inline void find_best_match(TampCompressor *compressor, uint16_t *match_index, uint8_t *match_size) { +static TAMP_NOINLINE void find_best_match(TampCompressor* compressor, uint16_t* match_index, uint8_t* match_size) { *match_size = 0; if (TAMP_UNLIKELY(compressor->input_size < compressor->min_pattern_size)) return; @@ -87,7 +117,7 @@ static inline void find_best_match(TampCompressor *compressor, uint16_t *match_i const uint8_t second_byte = read_input(1); const uint32_t window_size_minus_1 = WINDOW_SIZE - 1; const uint8_t max_pattern_size = MIN(compressor->input_size, MAX_PATTERN_SIZE); - const unsigned char *window = compressor->window; + const unsigned char* window = compressor->window; for (uint32_t window_index = 0; window_index < window_size_minus_1; window_index++) { if (TAMP_LIKELY(window[window_index] != first_byte)) { @@ -135,29 +165,34 @@ static inline bool validate_no_match_overlap(uint16_t write_pos, uint16_t match_ } #endif -tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf, unsigned char *window) { 
+TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_init(TampCompressor* compressor, const TampConf* conf, + unsigned char* window) { const TampConf conf_default = { .window = 10, .literal = 8, .use_custom_dictionary = false, #if TAMP_LAZY_MATCHING .lazy_matching = false, +#endif +#if TAMP_EXTENDED_COMPRESS + .extended = true, // Default to extended format #endif }; if (!conf) conf = &conf_default; if (conf->window < 8 || conf->window > 15) return TAMP_INVALID_CONF; if (conf->literal < 5 || conf->literal > 8) return TAMP_INVALID_CONF; +#if !TAMP_EXTENDED_COMPRESS + if (conf->extended) return TAMP_INVALID_CONF; // Extended requested but not compiled in +#endif - for (uint8_t i = 0; i < sizeof(TampCompressor); i++) // Zero-out the struct - ((unsigned char *)compressor)[i] = 0; + TAMP_MEMSET(compressor, 0, sizeof(TampCompressor)); - compressor->conf_literal = conf->literal; - compressor->conf_window = conf->window; - compressor->conf_use_custom_dictionary = conf->use_custom_dictionary; -#if TAMP_LAZY_MATCHING - compressor->conf_lazy_matching = conf->lazy_matching; -#endif + // Build header directly from conf (8 bits total) + // Layout: [window:3][literal:2][use_custom_dictionary:1][extended:1][more_headers:1] + uint8_t header = ((conf->window - 8) << 5) | ((conf->literal - 5) << 3) | (conf->use_custom_dictionary << 2) | + (conf->extended << 1); + compressor->conf = *conf; // Single struct copy compressor->window = window; compressor->min_pattern_size = tamp_compute_min_pattern_size(conf->window, conf->literal); @@ -165,22 +200,274 @@ tamp_res tamp_compressor_init(TampCompressor *compressor, const TampConf *conf, compressor->cached_match_index = -1; // Initialize cache as invalid #endif - if (!compressor->conf_use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window)); + if (!conf->use_custom_dictionary) tamp_initialize_dictionary(window, (1 << conf->window)); - // Write header to bit buffer - write_to_bit_buffer(compressor, conf->window - 8, 3); - 
write_to_bit_buffer(compressor, conf->literal - 5, 2); - write_to_bit_buffer(compressor, conf->use_custom_dictionary, 1); - write_to_bit_buffer(compressor, 0, 1); // Reserved - write_to_bit_buffer(compressor, 0, 1); // No more header bytes + write_to_bit_buffer(compressor, header, 8); return TAMP_OK; } -TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size) { +#if TAMP_EXTENDED_COMPRESS +/** + * @brief Write extended huffman encoding (huffman + trailing bits). + * + * Used for both RLE count and extended match size encoding. + * + * @param[in,out] compressor Compressor with bit buffer. + * @param[in] value The value to encode. + * @param[in] trailing_bits Number of trailing bits (3 for extended match, 4 for RLE). + */ +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE void write_extended_huffman(TampCompressor* compressor, uint8_t value, + uint8_t trailing_bits) { + uint8_t code_index = value >> trailing_bits; + // Write huffman code (without literal flag) + trailing bits in one call + write_to_bit_buffer(compressor, (huffman_codes[code_index] << trailing_bits) | (value & ((1 << trailing_bits) - 1)), + (huffman_bits[code_index] - 1) + trailing_bits); +} + +/** + * @brief Get the last byte written to the window. + * + * NOINLINE: called from 3 sites; outlining saves ~44 bytes on armv6m. + */ +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE uint8_t get_last_window_byte(TampCompressor* compressor) { + uint16_t prev_pos = (compressor->window_pos - 1) & ((1 << compressor->conf.window) - 1); + return compressor->window[prev_pos]; +} + +/** + * @brief Search for extended match continuation using implicit pattern comparison. + * + * Searches for pattern: window[current_pos:current_pos+current_count] + input[0...] + * starting from current_pos. Returns the longest match found (which may be at + * current_pos itself if O(1) extension works, or at a different position). 
+ * + * NOINLINE + Os: Called only during extended match continuation (rare path). + * Outlining saves ~100 bytes in poll on armv6m. + * + * @param[in] compressor TampCompressor object + * @param[in] current_pos Current match position in window (also search start) + * @param[in] current_count Current match length + * @param[out] new_pos Position of found match (only valid if new_count > current_count) + * @param[out] new_count Length of found match + */ +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE void find_extended_match(TampCompressor* compressor, uint16_t current_pos, + uint8_t current_count, uint16_t* new_pos, + uint8_t* new_count) { + // Preconditions (guaranteed by caller): + // - input_size > 0 + // - current_pos + current_count < WINDOW_SIZE + // - current_count < MAX_PATTERN_SIZE + *new_count = 0; + const unsigned char* window = compressor->window; + const uint16_t window_size = WINDOW_SIZE; + const uint8_t max_pattern = MIN(current_count + compressor->input_size, MAX_PATTERN_SIZE); + const uint8_t extend_byte = read_input(0); + + for (uint16_t cand = current_pos; cand + current_count + 1 <= window_size; cand++) { + // Check extension byte first (most discriminating) + if (window[cand + current_count] != extend_byte) continue; + + // Check if current_count bytes match (at cand==current_pos, compares with self) + uint8_t i = 0; + while (i < current_count && window[cand + i] == window[current_pos + i]) i++; + if (i < current_count) continue; + + // Found a match - extend as far as possible + const uint8_t cand_max = MIN(max_pattern, window_size - cand); + uint8_t match_len = current_count + 1; + for (i = current_count + 1; i < cand_max; i++) { + if (window[cand + i] != read_input(i - current_count)) break; + match_len = i + 1; + } + + if (match_len > *new_count) { + *new_count = match_len; + *new_pos = cand; + if (match_len == max_pattern) return; + } + } +} + +/** + * @brief Write RLE token to bit buffer and update window. 
+ * + * @param[in,out] compressor Compressor state. + * @param[in] count Number of repeated bytes (must be >= 2). + */ +static TAMP_NOINLINE void write_rle_token(TampCompressor* compressor, uint8_t count) { + const uint16_t window_mask = (1 << compressor->conf.window) - 1; + uint8_t symbol = get_last_window_byte(compressor); + + // Write RLE symbol (12) with literal flag + // Note: symbols 12 and 13 are at indices 12 and 13 in huffman table (not offset by min_pattern_size) + write_to_bit_buffer(compressor, huffman_codes[TAMP_RLE_SYMBOL], huffman_bits[TAMP_RLE_SYMBOL]); + // Write extended huffman for count-2 + write_extended_huffman(compressor, count - 2, TAMP_LEADING_RLE_BITS); + + // Write up to TAMP_RLE_MAX_WINDOW bytes to window (or until buffer end, no wrap) + uint16_t remaining = WINDOW_SIZE - compressor->window_pos; + uint8_t window_write = MIN(MIN(count, TAMP_RLE_MAX_WINDOW), remaining); + for (uint8_t i = 0; i < window_write; i++) { + compressor->window[compressor->window_pos] = symbol; + compressor->window_pos = (compressor->window_pos + 1) & window_mask; + } +} + +/** + * @brief Write extended match token to bit buffer and update window. + * + * Token format: symbol (7 bits) + extended_huffman (up to 11 bits) + window_pos (up to 15 bits) + * Total: up to 33 bits. We flush after symbol+huffman (18 bits max) to ensure window_pos fits. + * + * @param[in,out] compressor Compressor state. + * @param[in,out] output Output buffer pointer (updated on return). + * @param[in,out] output_size Available space (updated on return). + * @param[in,out] output_written_size Bytes written (accumulated). + * @return TAMP_OK on success, TAMP_OUTPUT_FULL if output buffer is too small. 
+ */ +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize("-fno-reorder-blocks") +#endif +static TAMP_NOINLINE tamp_res write_extended_match_token(TampCompressor* compressor, unsigned char** output, + size_t* output_size, size_t* output_written_size) { + // Pre-check output space to prevent OUTPUT_FULL mid-token (would corrupt bit_buffer) + if (TAMP_UNLIKELY(*output_size < EXTENDED_MATCH_MIN_OUTPUT_BYTES)) return TAMP_OUTPUT_FULL; + + const uint16_t window_mask = (1 << compressor->conf.window) - 1; + const uint8_t count = compressor->extended_match_count; + const uint16_t position = compressor->extended_match_position; tamp_res res; - const uint16_t window_mask = (1 << compressor->conf_window) - 1; + + // Write symbol (7 bits) + extended huffman (up to 11 bits) = 18 bits max + // With ≤7 bits already in buffer, total ≤25 bits - fits in 32-bit buffer + write_to_bit_buffer(compressor, huffman_codes[TAMP_EXTENDED_MATCH_SYMBOL], + huffman_bits[TAMP_EXTENDED_MATCH_SYMBOL]); + write_extended_huffman(compressor, count - compressor->min_pattern_size - 11 - 1, TAMP_LEADING_EXTENDED_MATCH_BITS); + + // Flush to make room for window position (up to 15 bits) + res = partial_flush(compressor, output, output_size, output_written_size); + if (TAMP_UNLIKELY(res != TAMP_OK)) return res; + + // Write window position - with ≤7 bits remaining, up to 22 bits total - fits + write_to_bit_buffer(compressor, position, compressor->conf.window); + + // Final flush + res = partial_flush(compressor, output, output_size, output_written_size); + if (TAMP_UNLIKELY(res != TAMP_OK)) return res; + + // Write to window (up to end of buffer, no wrap) + uint16_t remaining = WINDOW_SIZE - compressor->window_pos; + uint8_t window_write = MIN(count, remaining); + tamp_window_copy(compressor->window, &compressor->window_pos, position, window_write, window_mask); + + compressor->extended_match_count = 0; // Position reset not needed - only read when count > 0 + + return TAMP_OK; +} 
+#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC pop_options +#endif + +/** + * @brief Handle all extended-specific logic in poll (match continuation + RLE). + * + * NOINLINE + Os: Extended paths are rarely executed. Outlining from poll saves + * significant code size on register-constrained Cortex-M0+ where the compiler + * otherwise spills heavily to stack (~48 bytes saved on armv6m). + * + * @return TAMP_OK if fully handled (caller should return TAMP_OK), + * TAMP_POLL_CONTINUE if caller should proceed to normal pattern matching, + * other tamp_res on error. + */ +static TAMP_NOINLINE TAMP_OPTIMIZE_SIZE tamp_res poll_extended_handling(TampCompressor* compressor, + unsigned char** output, size_t* output_size, + size_t* output_written_size) { + // Handle extended match continuation + if (compressor->extended_match_count) { + const uint8_t max_ext_match = compressor->min_pattern_size + 11 + EXTENDED_MATCH_MAX_EXTRA; + + while (compressor->input_size > 0) { + const uint16_t current_pos = compressor->extended_match_position; + const uint8_t current_count = compressor->extended_match_count; + + if (current_pos + current_count >= WINDOW_SIZE || current_count >= max_ext_match) { + return write_extended_match_token(compressor, output, output_size, output_written_size); + } + + uint16_t new_pos; + uint8_t new_count; + find_extended_match(compressor, current_pos, current_count, &new_pos, &new_count); + + if (new_count > current_count) { + uint8_t extra_bytes = new_count - current_count; + compressor->extended_match_position = new_pos; + compressor->extended_match_count = new_count; + compressor->input_pos = input_add(extra_bytes); + compressor->input_size -= extra_bytes; + continue; + } + + return write_extended_match_token(compressor, output, output_size, output_written_size); + } + return TAMP_OK; + } + + // Handle RLE accumulation + uint8_t last_byte = get_last_window_byte(compressor); + + uint8_t rle_available = 0; + while (rle_available < compressor->input_size && 
compressor->rle_count + rle_available < RLE_MAX_COUNT && + compressor->input[input_add(rle_available)] == last_byte) { + rle_available++; + } + + uint8_t total_rle = compressor->rle_count + rle_available; + bool rle_ended = (rle_available < compressor->input_size) || (total_rle >= RLE_MAX_COUNT); + + if (!rle_ended && total_rle > 0) { + compressor->rle_count = total_rle; + compressor->input_pos = input_add(rle_available); + compressor->input_size -= rle_available; + return TAMP_OK; + } + + if (total_rle >= 2) { + if (total_rle == rle_available && total_rle <= 6) { + uint16_t pattern_index; + uint8_t pattern_size; + find_best_match(compressor, &pattern_index, &pattern_size); + + if (pattern_size > total_rle) { + compressor->rle_count = 0; + return TAMP_POLL_CONTINUE; // Proceed to pattern matching + } + } + + compressor->input_pos = input_add(rle_available); + compressor->input_size -= rle_available; + write_rle_token(compressor, total_rle); + compressor->rle_count = 0; + return TAMP_OK; + } + + if (total_rle == 1) compressor->rle_count = 0; + return TAMP_POLL_CONTINUE; // Proceed to pattern matching +} +#endif // TAMP_EXTENDED_COMPRESS + +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize("-fno-schedule-insns2") +#endif +TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor* compressor, unsigned char* output, size_t output_size, + size_t* output_written_size) { + tamp_res res; + // Cache bitfield values for faster access in hot path + const uint8_t conf_window = compressor->conf.window; + const uint8_t conf_literal = compressor->conf.literal; + const uint16_t window_mask = (1 << conf_window) - 1; size_t output_written_size_proxy; if (!output_written_size) output_written_size = &output_written_size_proxy; @@ -188,23 +475,26 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned if (TAMP_UNLIKELY(compressor->input_size == 0)) return TAMP_OK; - { - // Make sure there's enough room in the bit buffer. 
- size_t flush_bytes_written; - res = partial_flush(compressor, output, output_size, &flush_bytes_written); - (*output_written_size) += flush_bytes_written; - if (TAMP_UNLIKELY(res != TAMP_OK)) return res; - output_size -= flush_bytes_written; - output += flush_bytes_written; // cppcheck-suppress unreadVariable - } + // Make sure there's enough room in the bit buffer. + res = partial_flush(compressor, &output, &output_size, output_written_size); + if (TAMP_UNLIKELY(res != TAMP_OK)) return res; if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL; uint8_t match_size = 0; uint16_t match_index = 0; +#if TAMP_EXTENDED_COMPRESS + if (TAMP_UNLIKELY(compressor->conf.extended)) { + // Handle extended match continuation + RLE (outlined for code size) + res = poll_extended_handling(compressor, &output, &output_size, output_written_size); + if (res != TAMP_POLL_CONTINUE) return res; + // TAMP_POLL_CONTINUE: proceed to pattern matching below + } +#endif // TAMP_EXTENDED_COMPRESS + #if TAMP_LAZY_MATCHING - if (compressor->conf_lazy_matching) { + if (compressor->conf.lazy_matching) { // Check if we have a cached match from lazy matching if (TAMP_UNLIKELY(compressor->cached_match_index >= 0)) { match_index = compressor->cached_match_index; @@ -213,15 +503,7 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned } else { find_best_match(compressor, &match_index, &match_size); } - } else { - find_best_match(compressor, &match_index, &match_size); - } -#else - find_best_match(compressor, &match_index, &match_size); -#endif -#if TAMP_LAZY_MATCHING - if (compressor->conf_lazy_matching) { // Lazy matching: if we have a good match, check if position i+1 has a better match if (match_size >= compressor->min_pattern_size && match_size <= 8 && compressor->input_size > match_size + 2) { // Temporarily advance input position to check next position @@ -240,54 +522,51 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned // 
literal and cache the next match if (next_match_size > match_size && validate_no_match_overlap(compressor->window_pos, next_match_index, next_match_size)) { - // Write LITERAL at current position - match_size = 1; - unsigned char c = read_input(0); - if (TAMP_UNLIKELY(c >> compressor->conf_literal)) { - return TAMP_EXCESS_BITS; - } - write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1); + // Force literal at current position, cache next match + compressor->cached_match_index = next_match_index; + compressor->cached_match_size = next_match_size; + match_size = 0; // Will trigger literal write below } else { - // Use current match, clear cache compressor->cached_match_index = -1; - uint8_t huffman_index = match_size - compressor->min_pattern_size; - write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]); - write_to_bit_buffer(compressor, match_index, compressor->conf_window); + // Note: No V2 extended match check here - we're in the match_size <= 8 branch, + // so extended matches (which require match_size > min_pattern_size + 11) are impossible. 
} - } else if (TAMP_UNLIKELY(match_size < compressor->min_pattern_size)) { - // Write LITERAL - compressor->cached_match_index = -1; // Clear cache - match_size = 1; - unsigned char c = read_input(0); - if (TAMP_UNLIKELY(c >> compressor->conf_literal)) { - return TAMP_EXCESS_BITS; - } - write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1); } else { - // Write TOKEN compressor->cached_match_index = -1; // Clear cache - uint8_t huffman_index = match_size - compressor->min_pattern_size; - write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]); - write_to_bit_buffer(compressor, match_index, compressor->conf_window); } - } else + } else { + find_best_match(compressor, &match_index, &match_size); + } +#else + find_best_match(compressor, &match_index, &match_size); #endif - { - // Non-lazy matching path - if (TAMP_UNLIKELY(match_size < compressor->min_pattern_size)) { - // Write LITERAL - match_size = 1; - unsigned char c = read_input(0); - if (TAMP_UNLIKELY(c >> compressor->conf_literal)) { - return TAMP_EXCESS_BITS; - } - write_to_bit_buffer(compressor, IS_LITERAL_FLAG | c, compressor->conf_literal + 1); - } else { - // Write TOKEN - uint8_t huffman_index = match_size - compressor->min_pattern_size; - write_to_bit_buffer(compressor, huffman_codes[huffman_index], huffman_bits[huffman_index]); - write_to_bit_buffer(compressor, match_index, compressor->conf_window); + + // Shared token/literal writing logic + if (TAMP_UNLIKELY(match_size < compressor->min_pattern_size)) { + // Write LITERAL + match_size = 1; + unsigned char c = read_input(0); + if (TAMP_UNLIKELY(c >> conf_literal)) { + return TAMP_EXCESS_BITS; } + write_to_bit_buffer(compressor, (1 << conf_literal) | c, conf_literal + 1); + } else { +#if TAMP_EXTENDED_COMPRESS + // Extended: Start extended match continuation + if (compressor->conf.extended && match_size > compressor->min_pattern_size + 11) { + compressor->extended_match_count = match_size; + 
compressor->extended_match_position = match_index; + // Consume matched bytes from input + compressor->input_pos = input_add(match_size); + compressor->input_size -= match_size; + // Return - continuation code at start of poll will try to extend or emit + return TAMP_OK; + } +#endif // TAMP_EXTENDED_COMPRESS + // Write TOKEN (huffman code + window position) + uint8_t huffman_index = match_size - compressor->min_pattern_size; + write_to_bit_buffer(compressor, (huffman_codes[huffman_index] << conf_window) | match_index, + huffman_bits[huffman_index] + conf_window); } // Populate Window for (uint8_t i = 0; i < match_size; i++) { @@ -299,9 +578,12 @@ TAMP_NOINLINE tamp_res tamp_compressor_poll(TampCompressor *compressor, unsigned return TAMP_OK; } +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC pop_options +#endif -void tamp_compressor_sink(TampCompressor *compressor, const unsigned char *input, size_t input_size, - size_t *consumed_size) { +void tamp_compressor_sink(TampCompressor* compressor, const unsigned char* input, size_t input_size, + size_t* consumed_size) { size_t consumed_size_proxy; if (TAMP_LIKELY(consumed_size)) *consumed_size = 0; @@ -316,9 +598,11 @@ void tamp_compressor_sink(TampCompressor *compressor, const unsigned char *input } } -tamp_res tamp_compressor_compress_cb(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size, const unsigned char *input, size_t input_size, - size_t *input_consumed_size, tamp_callback_t callback, void *user_data) { +TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_compress_cb(TampCompressor* compressor, unsigned char* output, + size_t output_size, size_t* output_written_size, + const unsigned char* input, size_t input_size, + size_t* input_consumed_size, tamp_callback_t callback, + void* user_data) { tamp_res res; size_t input_consumed_size_proxy = 0, output_written_size_proxy = 0; size_t total_input_size = input_size; @@ -357,8 +641,12 @@ tamp_res 
tamp_compressor_compress_cb(TampCompressor *compressor, unsigned char * return TAMP_OK; } -tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size, bool write_token) { +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize("-fno-tree-pre") +#endif +tamp_res tamp_compressor_flush(TampCompressor* compressor, unsigned char* output, size_t output_size, + size_t* output_written_size, bool write_token) { tamp_res res; size_t chunk_output_written_size; size_t output_written_size_proxy; @@ -366,51 +654,81 @@ tamp_res tamp_compressor_flush(TampCompressor *compressor, unsigned char *output if (!output_written_size) output_written_size = &output_written_size_proxy; *output_written_size = 0; - while (compressor->input_size) { - // Compress the remainder of the input buffer. +flush_check: + // Flush pending bits before checking for more work + chunk_output_written_size = 0; + res = partial_flush(compressor, &output, &output_size, output_written_size); + if (TAMP_UNLIKELY(res != TAMP_OK)) return res; + + if (TAMP_LIKELY(compressor->input_size)) { res = tamp_compressor_poll(compressor, output, output_size, &chunk_output_written_size); - (*output_written_size) += chunk_output_written_size; - if (TAMP_UNLIKELY(res != TAMP_OK)) return res; - output_size -= chunk_output_written_size; - output += chunk_output_written_size; } - - // Perform partial flush to see if we need a FLUSH token (check if output buffer in not empty), - // and to subsequently make room for the FLUSH token. 
- res = partial_flush(compressor, output, output_size, &chunk_output_written_size); - output_size -= chunk_output_written_size; +#if TAMP_EXTENDED_COMPRESS + else if (compressor->conf.extended && compressor->rle_count >= 1) { + if (compressor->rle_count == 1) { + // Single byte - write as literal (can't use RLE token for count < 2) + uint8_t literal = get_last_window_byte(compressor); + write_to_bit_buffer(compressor, IS_LITERAL_FLAG | literal, compressor->conf.literal + 1); + + // Write to window + const uint16_t window_mask = (1 << compressor->conf.window) - 1; + compressor->window[compressor->window_pos] = literal; + compressor->window_pos = (compressor->window_pos + 1) & window_mask; + } else { + // count >= 2: write as RLE token + write_rle_token(compressor, compressor->rle_count); + } + compressor->rle_count = 0; + } else if (compressor->conf.extended && compressor->extended_match_count) { + res = write_extended_match_token(compressor, &output, &output_size, output_written_size); + } +#endif // TAMP_EXTENDED_COMPRESS + else { + goto flush_done; + } (*output_written_size) += chunk_output_written_size; - output += chunk_output_written_size; if (TAMP_UNLIKELY(res != TAMP_OK)) return res; - - // Check if there's enough output buffer space - if (compressor->bit_buffer_pos) { - if (output_size == 0) { - return TAMP_OUTPUT_FULL; - } - if (write_token) { - if (output_size < 2) return TAMP_OUTPUT_FULL; - write_to_bit_buffer(compressor, FLUSH_CODE, 9); - } + output_size -= chunk_output_written_size; + output += chunk_output_written_size; + goto flush_check; + +flush_done: + // At this point, up to 7 bits may remain in the compressor->bit_buffer + // The output buffer may have 0 bytes remaining. + if (write_token && compressor->bit_buffer_pos) { + // We don't want to write the FLUSH token to the bit_buffer unless + // we are confident that it'll wind up in the output buffer + // in THIS function call. 
+ // Otherwise, if we wind up with a TAMP_OUTPUT_FULL result, we could + // end up accidentally writing multiple FLUSH tokens. + if (TAMP_UNLIKELY(output_size < 2)) return TAMP_OUTPUT_FULL; + write_to_bit_buffer(compressor, FLUSH_CODE, 9); } - // Flush the remainder of the output bit-buffer - while (compressor->bit_buffer_pos) { + // At this point, up to 16 bits may remain in the compressor->bit_buffer + // The output buffer may have 0 bytes remaining. + + // Flush whole bytes, then write trailing partial byte + res = partial_flush(compressor, &output, &output_size, output_written_size); + if (compressor->bit_buffer_pos) { + if (TAMP_UNLIKELY(output_size == 0)) return TAMP_OUTPUT_FULL; *output = compressor->bit_buffer >> 24; - output++; - compressor->bit_buffer <<= 8; - compressor->bit_buffer_pos -= MIN(compressor->bit_buffer_pos, 8); - output_size--; (*output_written_size)++; + compressor->bit_buffer_pos = 0; + compressor->bit_buffer = 0; } - return TAMP_OK; + return res; } +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC pop_options +#endif -tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor *compressor, unsigned char *output, size_t output_size, - size_t *output_written_size, const unsigned char *input, - size_t input_size, size_t *input_consumed_size, bool write_token, - tamp_callback_t callback, void *user_data) { +TAMP_OPTIMIZE_SIZE tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor* compressor, unsigned char* output, + size_t output_size, size_t* output_written_size, + const unsigned char* input, size_t input_size, + size_t* input_consumed_size, bool write_token, + tamp_callback_t callback, void* user_data) { tamp_res res; size_t flush_size; size_t output_written_size_proxy; @@ -433,9 +751,10 @@ tamp_res tamp_compressor_compress_and_flush_cb(TampCompressor *compressor, unsig #if TAMP_STREAM -tamp_res tamp_compress_stream(TampCompressor *compressor, tamp_read_t read_cb, void *read_handle, tamp_write_t write_cb, - void *write_handle, size_t 
*input_consumed_size, size_t *output_written_size, - tamp_callback_t callback, void *user_data) { +TAMP_OPTIMIZE_SIZE tamp_res tamp_compress_stream(TampCompressor* compressor, tamp_read_t read_cb, void* read_handle, + tamp_write_t write_cb, void* write_handle, size_t* input_consumed_size, + size_t* output_written_size, tamp_callback_t callback, + void* user_data) { size_t input_consumed_size_proxy, output_written_size_proxy; if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy; if (!output_written_size) output_written_size = &output_written_size_proxy; diff --git a/tamp/_c_src/tamp/compressor.h b/tamp/_c_src/tamp/compressor.h index cd6ca1f..577eac5 100644 --- a/tamp/_c_src/tamp/compressor.h +++ b/tamp/_c_src/tamp/compressor.h @@ -7,58 +7,52 @@ extern "C" { #include "common.h" -/* Externally, do not directly edit ANY of these attributes */ +/* Externally, do not directly edit ANY of these attributes. + * Fields are ordered by access frequency for cache efficiency. + */ typedef struct TampCompressor { - /* nicely aligned attributes */ - #if TAMP_ESP32 // Avoid bitfields for speed. 
- uint32_t window_pos; - uint32_t bit_buffer_pos; + /* HOT: accessed every iteration of the compression loop */ + unsigned char *window; // Pointer to window buffer + uint32_t bit_buffer; // Bit buffer for output (32 bits) + uint32_t window_pos; // Current position in window (15 bits used) + uint32_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits used) + uint32_t input_size; // Bytes in input buffer (5 bits used; 0-16) + uint32_t input_pos; // Current position in input buffer (4 bits used; 0-15) + unsigned char input[16]; // Input ring buffer - uint32_t input_size; - uint32_t input_pos; + /* WARM: read frequently, often cached in locals */ + uint8_t min_pattern_size; // Minimum pattern size (2 bits used; 2 or 3) + TampConf conf; +#else // Use bitfields for reduced memory-usage + /* HOT: accessed every iteration of the compression loop */ + unsigned char *window; // Pointer to window buffer + uint32_t bit_buffer; // Bit buffer for output (32 bits) + uint16_t window_pos; // Current position in window (15 bits used) + uint8_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits used) + uint8_t input_size; // Bytes in input buffer (5 bits used; 0-16) + uint8_t input_pos; // Current position in input buffer (4 bits used; 0-15) + unsigned char input[16]; // Input ring buffer - /* Conf attributes */ - uint8_t conf_window; // number of window bits - uint8_t conf_literal; // number of literal bits - uint8_t conf_use_custom_dictionary; // Use a custom initialized dictionary. 
-#if TAMP_LAZY_MATCHING - uint8_t conf_lazy_matching; // Use lazy matching for better compression -#endif - uint8_t min_pattern_size; + /* WARM: read frequently, often cached in locals */ + uint8_t min_pattern_size; // Minimum pattern size (2 or 3) + TampConf conf; +#endif // TAMP_ESP32 + /* Fields interleaved to avoid internal padding when both LAZY_MATCHING and EXTENDED_COMPRESS enabled */ #if TAMP_LAZY_MATCHING - /* Lazy matching cache */ - int16_t cached_match_index; - uint8_t cached_match_size; + int16_t cached_match_index; // Lazy matching cache #endif -#else // Use bitfields for reduced memory-usage - /* Conf attributes */ - uint32_t conf_window : 4; // number of window bits - uint32_t conf_literal : 4; // number of literal bits - uint32_t conf_use_custom_dictionary : 1; // Use a custom initialized dictionary. -#if TAMP_LAZY_MATCHING - uint32_t conf_lazy_matching : 1; // Use lazy matching for better compression +#if TAMP_EXTENDED_COMPRESS + uint16_t extended_match_position; // Window position for extended match #endif - - /* Other small attributes */ - uint32_t window_pos : 15; - uint32_t bit_buffer_pos : 6; - uint32_t min_pattern_size : 2; - - uint32_t input_size : 5; - uint32_t input_pos : 4; - #if TAMP_LAZY_MATCHING - /* Lazy matching cache */ - int16_t cached_match_index; uint8_t cached_match_size; #endif -#endif // TAMP_ESP32 - unsigned char input[16] /* __attribute__ ((aligned (16)))*/; - uint32_t bit_buffer; - - unsigned char *window; +#if TAMP_EXTENDED_COMPRESS + uint8_t rle_count; // Current RLE run length (max 225) + uint8_t extended_match_count; // Current extended match size (max ~126) +#endif } TampCompressor; /** diff --git a/tamp/_c_src/tamp/decompressor.c b/tamp/_c_src/tamp/decompressor.c index a7c0baa..92af94c 100644 --- a/tamp/_c_src/tamp/decompressor.c +++ b/tamp/_c_src/tamp/decompressor.c @@ -7,15 +7,24 @@ #define FLUSH 15 +#if TAMP_EXTENDED_DECOMPRESS +/* Token state for extended decode suspend/resume (2 bits). 
+ * TOKEN_RLE and TOKEN_EXT_MATCH_FRESH are arranged so that: + * token_state = match_size - (TAMP_RLE_SYMBOL - 1) + * maps TAMP_RLE_SYMBOL (12) -> 1 and TAMP_EXTENDED_MATCH_SYMBOL (13) -> 2. + */ +#define TOKEN_NONE 0 +#define TOKEN_RLE 1 +#define TOKEN_EXT_MATCH_FRESH 2 +#define TOKEN_EXT_MATCH 3 /* Resume: have match_size, need window_offset */ +#endif + /** - * This array was generated with tools/huffman_jump_table.py + * Huffman lookup table indexed by 7 bits (after first "1" bit consumed). + * Upper 4 bits = additional bits to consume, lower 4 bits = symbol (15 = FLUSH). * - * The idea is that the resulting code is smaller/faster as a lookup table than a bunch of if/else - * statements. - * - * Of each element: - * * The upper 4 bits express the number of bits to decode. - * * The lower 4 bits express the decoded value, with FLUSH being represented as 0b1111 + * Note: A 64-byte table with special-cased symbol 1 was tried but was ~10% slower + * and only saved 8 bytes in final firmware due to added branch logic. */ static const uint8_t HUFFMAN_TABLE[128] = { 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 85, 85, 85, 85, 122, 123, 104, 104, 86, 86, @@ -25,72 +34,235 @@ static const uint8_t HUFFMAN_TABLE[128] = { 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; /** - * @brief Decode a huffman match-size symbol from the decompressor's bit_buffer. - * - * Internally updates bit_buffer and bit_buffer_pos. + * @brief Decode huffman symbol + optional trailing bits from bit buffer. * - * bit_buffer MUST have at least 8 bits prior to calling. + * Modifies bit_buffer and bit_buffer_pos in place. Caller is responsible + * for committing to decompressor state if needed. 
* - * @returns Decoded match_size + * @param bit_buffer Pointer to bit buffer (modified in place) + * @param bit_buffer_pos Pointer to bit position (modified in place) + * @param trailing_bits Number of trailing bits to read (0, 3, or 4) + * @param result Output: (huffman << trailing_bits) + trailing (max 223 for trailing_bits=4) + * @return TAMP_OK on success, TAMP_INPUT_EXHAUSTED if more bits needed */ -static inline int8_t huffman_decode(uint32_t *bit_buffer, uint8_t *bit_buffer_pos) { - uint8_t code; - uint8_t bit_len; +static tamp_res decode_huffman(uint32_t* bit_buffer, uint8_t* bit_buffer_pos, uint8_t trailing_bits, uint8_t* result) { + /* Need at least 1 bit for huffman, plus trailing bits */ + if (TAMP_UNLIKELY(*bit_buffer_pos < 1 + trailing_bits)) return TAMP_INPUT_EXHAUSTED; + /* Decode huffman symbol */ + int8_t huffman_value; (*bit_buffer_pos)--; - code = *bit_buffer >> 31; - *bit_buffer <<= 1; - if (TAMP_LIKELY(code == 0)) return 0; + if (TAMP_LIKELY((*bit_buffer >> 31) == 0)) { + /* Symbol 0: code "0" */ + *bit_buffer <<= 1; + huffman_value = 0; + } else { + /* All other symbols: use 128-entry table indexed by next 7 bits */ + *bit_buffer <<= 1; + uint8_t code = HUFFMAN_TABLE[*bit_buffer >> (32 - 7)]; + uint8_t bit_len = code >> 4; + if (TAMP_UNLIKELY(*bit_buffer_pos < bit_len + trailing_bits)) return TAMP_INPUT_EXHAUSTED; + *bit_buffer <<= bit_len; + *bit_buffer_pos -= bit_len; + huffman_value = code & 0xF; + } - code = *bit_buffer >> (32 - 7); - code = HUFFMAN_TABLE[code]; - bit_len = code >> 4; - *bit_buffer <<= bit_len; - (*bit_buffer_pos) -= bit_len; + /* Read trailing bits (skip if trailing_bits==0 to avoid undefined shift) */ + if (trailing_bits) { + uint8_t trailing = *bit_buffer >> (32 - trailing_bits); + *bit_buffer <<= trailing_bits; + *bit_buffer_pos -= trailing_bits; + *result = (huffman_value << trailing_bits) + trailing; + } else { + *result = huffman_value; + } - return code & 0xF; + return TAMP_OK; } +#if TAMP_EXTENDED_DECOMPRESS + 
/** - * @brief Copy pattern from window to window, updating window_pos. + * @brief Decode RLE token and write repeated bytes to output. * - * Handles potential overlap between source and destination regions by - * using a temporary buffer when necessary. Overlap occurs when the - * destination would "catch up" to the source during copying. + * RLE format: huffman(count_high) + trailing_bits(count_low) + * rle_count = (count_high << 4) + count_low + 2 */ -static inline void window_copy(unsigned char *window, uint16_t *window_pos, uint16_t window_offset, uint8_t match_size, - uint16_t window_mask) { - const uint16_t src_to_dst = (*window_pos - window_offset) & window_mask; - const bool overlap = (src_to_dst < match_size) && (src_to_dst > 0); - - if (TAMP_UNLIKELY(overlap)) { - uint8_t tmp_buf[16]; - for (uint8_t i = 0; i < match_size; i++) { - tmp_buf[i] = window[window_offset + i]; - } - for (uint8_t i = 0; i < match_size; i++) { - window[*window_pos] = tmp_buf[i]; - *window_pos = (*window_pos + 1) & window_mask; +static tamp_res decode_rle(TampDecompressor* d, unsigned char** output, const unsigned char* output_end, + size_t* output_written_size) { + uint8_t rle_count; /* max 225: (13 << 4) + 15 + 2 */ + uint8_t skip = d->skip_bytes; + + if (skip > 0) { + /* Resume from output-full: rle_count saved in pending_window_offset */ + rle_count = d->pending_window_offset; + } else { + /* Fresh decode */ + uint32_t bit_buffer = d->bit_buffer; + uint8_t bit_buffer_pos = d->bit_buffer_pos; + uint8_t raw; + tamp_res res = decode_huffman(&bit_buffer, &bit_buffer_pos, TAMP_LEADING_RLE_BITS, &raw); + if (res != TAMP_OK) return res; + d->bit_buffer = bit_buffer; + d->bit_buffer_pos = bit_buffer_pos; + rle_count = raw + 2; + } + + /* Get the byte to repeat (last written byte) */ + uint16_t prev_pos = (d->window_pos - 1) & ((1u << d->conf_window) - 1); + uint8_t symbol = d->window[prev_pos]; + + /* Calculate how many to write this call */ + uint8_t remaining_count = rle_count - 
skip; + size_t output_space = output_end - *output; + uint8_t to_write; + + if (TAMP_UNLIKELY(remaining_count > output_space)) { + /* Partial write - save state for resume */ + to_write = output_space; + d->skip_bytes = skip + to_write; + d->token_state = TOKEN_RLE; + d->pending_window_offset = rle_count; + } else { + /* Complete write */ + to_write = remaining_count; + d->skip_bytes = 0; + d->token_state = TOKEN_NONE; + } + + /* Write repeated bytes to output */ + TAMP_MEMSET(*output, symbol, to_write); + *output += to_write; + *output_written_size += to_write; + + /* Update window only on first chunk (skip==0). + * Write up to TAMP_RLE_MAX_WINDOW or until end of buffer (no wrap). */ + if (skip == 0) { + const uint16_t window_size = 1u << d->conf_window; + uint16_t remaining = window_size - d->window_pos; + uint8_t window_write = MIN(MIN(rle_count, TAMP_RLE_MAX_WINDOW), remaining); /* max 8 */ + for (uint8_t i = 0; i < window_write; i++) { + d->window[d->window_pos++] = symbol; } + d->window_pos &= (window_size - 1); + } + + return (d->token_state == TOKEN_NONE) ? TAMP_OK : TAMP_OUTPUT_FULL; +} + +/** + * @brief Decode extended match token and copy from window to output. 
+ * + * NEW FORMAT: huffman(size_high) + trailing_bits(size_low) + window_offset + * match_size = (size_high << 3) + size_low + min_pattern_size + 12 + * + * State machine: + * - Fresh: decode huffman+trailing, then window_offset + * - TOKEN_EXT_MATCH: have match_size, need window_offset + * - Output-full resume (skip > 0): have both match_size and window_offset + */ +static tamp_res decode_extended_match(TampDecompressor* d, unsigned char** output, const unsigned char* output_end, + size_t* output_written_size) { + const uint8_t conf_window = d->conf_window; + uint16_t window_offset; + uint8_t match_size; /* max 126: (13<<3)+7 + 3 + 12 */ + uint8_t skip = d->skip_bytes; + + if (skip > 0) { + /* Resume from output-full: both values saved */ + window_offset = d->pending_window_offset; + match_size = d->pending_match_size; + } else if (d->token_state == TOKEN_EXT_MATCH) { + /* Resume: have match_size, need window_offset */ + match_size = d->pending_match_size; + + if (TAMP_UNLIKELY(d->bit_buffer_pos < conf_window)) return TAMP_INPUT_EXHAUSTED; + window_offset = d->bit_buffer >> (32 - conf_window); + d->bit_buffer <<= conf_window; + d->bit_buffer_pos -= conf_window; } else { - for (uint8_t i = 0; i < match_size; i++) { - window[*window_pos] = window[window_offset + i]; - *window_pos = (*window_pos + 1) & window_mask; + /* Fresh decode: huffman+trailing first, then window_offset */ + uint32_t bit_buffer = d->bit_buffer; + uint8_t bit_buffer_pos = d->bit_buffer_pos; + uint8_t raw; + tamp_res res = decode_huffman(&bit_buffer, &bit_buffer_pos, TAMP_LEADING_EXTENDED_MATCH_BITS, &raw); + if (res != TAMP_OK) return res; + match_size = raw + d->min_pattern_size + 12; + + /* Now decode window_offset */ + if (TAMP_UNLIKELY(bit_buffer_pos < conf_window)) { + /* Save match_size and return */ + d->bit_buffer = bit_buffer; + d->bit_buffer_pos = bit_buffer_pos; + d->token_state = TOKEN_EXT_MATCH; + d->pending_match_size = match_size; + return TAMP_INPUT_EXHAUSTED; } + window_offset 
= bit_buffer >> (32 - conf_window); + bit_buffer <<= conf_window; + bit_buffer_pos -= conf_window; + d->bit_buffer = bit_buffer; + d->bit_buffer_pos = bit_buffer_pos; } + + /* Security check: validate window bounds */ + const uint32_t window_size = (1u << conf_window); + if (TAMP_UNLIKELY((uint32_t)window_offset >= window_size || + (uint32_t)window_offset + (uint32_t)match_size > window_size)) { + return TAMP_OOB; + } + + /* Calculate how many to write this call */ + uint8_t remaining_count = match_size - skip; + size_t output_space = output_end - *output; + uint8_t to_write; + + if (TAMP_UNLIKELY(remaining_count > output_space)) { + /* Partial write - save state for resume */ + to_write = output_space; + d->skip_bytes = skip + output_space; + d->token_state = TOKEN_EXT_MATCH; /* Reuse for output-full */ + d->pending_window_offset = window_offset; + d->pending_match_size = match_size; + } else { + /* Complete write */ + to_write = remaining_count; + d->skip_bytes = 0; + d->token_state = TOKEN_NONE; + } + + /* Copy from window to output */ + uint16_t src_offset = window_offset + skip; + for (uint8_t i = 0; i < to_write; i++) { + *(*output)++ = d->window[src_offset + i]; + } + *output_written_size += to_write; + + /* Update window only on complete decode. + * Write up to end of buffer (no wrap), matching RLE behavior. */ + if (d->token_state == TOKEN_NONE) { + uint16_t wp = d->window_pos; + uint16_t remaining = window_size - wp; + uint8_t window_write = (match_size < remaining) ? match_size : remaining; + tamp_window_copy(d->window, &wp, window_offset, window_write, window_size - 1); + d->window_pos = wp; + } + + return (d->token_state == TOKEN_NONE) ? 
TAMP_OK : TAMP_OUTPUT_FULL; } +#endif /* TAMP_EXTENDED_DECOMPRESS */ -tamp_res tamp_decompressor_read_header(TampConf *conf, const unsigned char *input, size_t input_size, - size_t *input_consumed_size) { +tamp_res tamp_decompressor_read_header(TampConf* conf, const unsigned char* input, size_t input_size, + size_t* input_consumed_size) { if (input_consumed_size) (*input_consumed_size) = 0; if (input_size == 0) return TAMP_INPUT_EXHAUSTED; - if (input[0] & 0x2) return TAMP_INVALID_CONF; // Reserved if (input[0] & 0x1) return TAMP_INVALID_CONF; // Currently only a single header byte is supported. if (input_consumed_size) (*input_consumed_size)++; conf->window = ((input[0] >> 5) & 0x7) + 8; conf->literal = ((input[0] >> 3) & 0x3) + 5; conf->use_custom_dictionary = ((input[0] >> 2) & 0x1); + conf->extended = ((input[0] >> 1) & 0x1); return TAMP_OK; } @@ -100,8 +272,10 @@ tamp_res tamp_decompressor_read_header(TampConf *conf, const unsigned char *inpu * * window * * window_bits_max */ -static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor *decompressor, uint8_t conf_window, - uint8_t conf_literal, uint8_t conf_use_custom_dictionary) { +static TAMP_OPTIMIZE_SIZE tamp_res tamp_decompressor_populate_from_conf(TampDecompressor* decompressor, + uint8_t conf_window, uint8_t conf_literal, + uint8_t conf_use_custom_dictionary, + uint8_t conf_extended) { if (conf_window < 8 || conf_window > 15) return TAMP_INVALID_CONF; if (conf_literal < 5 || conf_literal > 8) return TAMP_INVALID_CONF; if (conf_window > decompressor->window_bits_max) return TAMP_INVALID_CONF; @@ -111,37 +285,62 @@ static tamp_res tamp_decompressor_populate_from_conf(TampDecompressor *decompres decompressor->conf_literal = conf_literal; decompressor->min_pattern_size = tamp_compute_min_pattern_size(conf_window, conf_literal); decompressor->configured = true; + decompressor->conf_extended = conf_extended; +#if !TAMP_EXTENDED_DECOMPRESS + if (conf_extended) return TAMP_INVALID_CONF; // Extended 
stream but extended support not compiled in +#endif return TAMP_OK; } -tamp_res tamp_decompressor_init(TampDecompressor *decompressor, const TampConf *conf, unsigned char *window, +tamp_res tamp_decompressor_init(TampDecompressor* decompressor, const TampConf* conf, unsigned char* window, uint8_t window_bits) { tamp_res res = TAMP_OK; // Validate window_bits parameter if (window_bits < 8 || window_bits > 15) return TAMP_INVALID_CONF; - for (uint8_t i = 0; i < sizeof(TampDecompressor); i++) // Zero-out the struct - ((unsigned char *)decompressor)[i] = 0; + TAMP_MEMSET(decompressor, 0, sizeof(TampDecompressor)); decompressor->window = window; decompressor->window_bits_max = window_bits; if (conf) { res = tamp_decompressor_populate_from_conf(decompressor, conf->window, conf->literal, - conf->use_custom_dictionary); + conf->use_custom_dictionary, conf->extended); } return res; } -tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigned char *output, size_t output_size, - size_t *output_written_size, const unsigned char *input, size_t input_size, - size_t *input_consumed_size, tamp_callback_t callback, void *user_data) { +/** + * @brief Refill bit buffer from input stream. + * + * Consumes bytes from input until bit_buffer has at least 25 bits or input is exhausted. + * + * NOTE: NOINLINE saves ~192 bytes on armv6m but causes ~10% decompression + * speed regression. Keep this inlined for performance. 
+ */ +static inline void refill_bit_buffer(TampDecompressor* d, const unsigned char** input, const unsigned char* input_end, + size_t* input_consumed_size) { + while (*input != input_end && d->bit_buffer_pos <= 24) { + d->bit_buffer_pos += 8; + d->bit_buffer |= (uint32_t) * (*input) << (32 - d->bit_buffer_pos); + (*input)++; + (*input_consumed_size)++; + } +} + +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize("-fno-tree-pre") +#endif +tamp_res tamp_decompressor_decompress_cb(TampDecompressor* decompressor, unsigned char* output, size_t output_size, + size_t* output_written_size, const unsigned char* input, size_t input_size, + size_t* input_consumed_size, tamp_callback_t callback, void* user_data) { size_t input_consumed_size_proxy; size_t output_written_size_proxy; tamp_res res; - const unsigned char *input_end = input + input_size; - const unsigned char *output_end = output + output_size; + const unsigned char* input_end = input + input_size; + const unsigned char* output_end = output + output_size; if (!output_written_size) output_written_size = &output_written_size_proxy; if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy; @@ -156,7 +355,8 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne res = tamp_decompressor_read_header(&conf, input, input_end - input, &header_consumed_size); if (res != TAMP_OK) return res; - res = tamp_decompressor_populate_from_conf(decompressor, conf.window, conf.literal, conf.use_custom_dictionary); + res = tamp_decompressor_populate_from_conf(decompressor, conf.window, conf.literal, conf.use_custom_dictionary, + conf.extended); if (res != TAMP_OK) return res; input += header_consumed_size; @@ -169,20 +369,42 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne const uint8_t min_pattern_size = decompressor->min_pattern_size; const uint16_t window_mask = (1 << conf_window) - 1; - while (input != input_end || 
decompressor->bit_buffer_pos) { +#if TAMP_EXTENDED_DECOMPRESS + const bool extended_enabled = decompressor->conf_extended; +#endif + + while (input != input_end || decompressor->pos_and_state) { + if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL; + // Populate the bit buffer - while (input != input_end && decompressor->bit_buffer_pos <= 24) { - uint32_t t = *input; - decompressor->bit_buffer_pos += 8; - decompressor->bit_buffer |= t << (32 - decompressor->bit_buffer_pos); - input++; - (*input_consumed_size)++; + refill_bit_buffer(decompressor, &input, input_end, input_consumed_size); + +#if TAMP_EXTENDED_DECOMPRESS + /* Handle extended tokens - either resuming or fresh from match_size detection below. */ + if (TAMP_UNLIKELY(decompressor->token_state)) { + extended_dispatch: + if (decompressor->token_state == TOKEN_RLE) { + res = decode_rle(decompressor, &output, output_end, output_written_size); + } else { + res = decode_extended_match(decompressor, &output, output_end, output_written_size); + } + if (res == TAMP_INPUT_EXHAUSTED) { + uint8_t old_bit_pos = decompressor->bit_buffer_pos; + refill_bit_buffer(decompressor, &input, input_end, input_consumed_size); + /* If we couldn't get more bits and input is exhausted, stop. + * Otherwise the loop would run forever with token_state set. 
*/ + if (decompressor->bit_buffer_pos == old_bit_pos && input == input_end) { + return TAMP_INPUT_EXHAUSTED; + } + continue; + } + if (res != TAMP_OK) return res; + continue; } +#endif // TAMP_EXTENDED_DECOMPRESS if (TAMP_UNLIKELY(decompressor->bit_buffer_pos == 0)) return TAMP_INPUT_EXHAUSTED; - if (TAMP_UNLIKELY(output == output_end)) return TAMP_OUTPUT_FULL; - // Hint that patterns are more likely than literals if (TAMP_UNLIKELY(decompressor->bit_buffer >> 31)) { // is literal @@ -214,10 +436,10 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne bit_buffer <<= 1; bit_buffer_pos--; - // There must be at least 8 bits, otherwise no possible decoding. - if (TAMP_UNLIKELY(bit_buffer_pos < 8)) return TAMP_INPUT_EXHAUSTED; + uint8_t match_size_u8; + if (decode_huffman(&bit_buffer, &bit_buffer_pos, 0, &match_size_u8) != TAMP_OK) return TAMP_INPUT_EXHAUSTED; + match_size = match_size_u8; - match_size = huffman_decode(&bit_buffer, &bit_buffer_pos); if (TAMP_UNLIKELY(match_size == FLUSH)) { // flush bit_buffer to the nearest byte and skip the remainder of decoding decompressor->bit_buffer = bit_buffer << (bit_buffer_pos & 7); @@ -225,6 +447,18 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne bit_buffer_pos & ~7; // Round bit_buffer_pos down to nearest multiple of 8. continue; } + +#if TAMP_EXTENDED_DECOMPRESS + /* Check for extended symbols (RLE=12, extended match=13). + * Convert match_size to token_state via subtraction (see TOKEN_* defines). 
*/ + if (TAMP_UNLIKELY(extended_enabled && match_size >= TAMP_RLE_SYMBOL)) { + decompressor->bit_buffer = bit_buffer; + decompressor->bit_buffer_pos = bit_buffer_pos; + decompressor->token_state = match_size - (TAMP_RLE_SYMBOL - 1); + goto extended_dispatch; + } +#endif // TAMP_EXTENDED_DECOMPRESS + if (TAMP_UNLIKELY(bit_buffer_pos < conf_window)) { // There are not enough bits to decode window offset return TAMP_INPUT_EXHAUSTED; @@ -268,7 +502,7 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne if (TAMP_LIKELY(decompressor->skip_bytes == 0)) { uint16_t wp = decompressor->window_pos; - window_copy(decompressor->window, &wp, window_offset, match_size, window_mask); + tamp_window_copy(decompressor->window, &wp, window_offset, match_size, window_mask); decompressor->window_pos = wp; } } @@ -277,12 +511,16 @@ tamp_res tamp_decompressor_decompress_cb(TampDecompressor *decompressor, unsigne } return TAMP_INPUT_EXHAUSTED; } +#if TAMP_HAS_GCC_OPTIMIZE +#pragma GCC pop_options +#endif #if TAMP_STREAM -tamp_res tamp_decompress_stream(TampDecompressor *decompressor, tamp_read_t read_cb, void *read_handle, - tamp_write_t write_cb, void *write_handle, size_t *input_consumed_size, - size_t *output_written_size, tamp_callback_t callback, void *user_data) { +TAMP_OPTIMIZE_SIZE tamp_res tamp_decompress_stream(TampDecompressor* decompressor, tamp_read_t read_cb, + void* read_handle, tamp_write_t write_cb, void* write_handle, + size_t* input_consumed_size, size_t* output_written_size, + tamp_callback_t callback, void* user_data) { size_t input_consumed_size_proxy, output_written_size_proxy; if (!input_consumed_size) input_consumed_size = &input_consumed_size_proxy; if (!output_written_size) output_written_size = &output_written_size_proxy; diff --git a/tamp/_c_src/tamp/decompressor.h b/tamp/_c_src/tamp/decompressor.h index 1608a6d..9659817 100644 --- a/tamp/_c_src/tamp/decompressor.h +++ b/tamp/_c_src/tamp/decompressor.h @@ -11,21 +11,40 @@ extern 
"C" { * Fields are ordered by access frequency for cache efficiency. */ typedef struct { - /* HOT: accessed every iteration of the decompression loop. - * Full-width types avoid bitfield access overhead. */ - unsigned char *window; // Pointer to window buffer - uint32_t bit_buffer; // Bit buffer for reading compressed data (32 bits) - uint16_t window_pos; // Current position in window (15 bits) - uint8_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits) + /* HOT: accessed every iteration of the decompression loop. */ + unsigned char *window; // Pointer to window buffer + uint32_t bit_buffer; // Bit buffer for reading compressed data (32 bits) + uint16_t window_pos; // Current position in window (15 bits) + + /* Union allows single zero-check in main loop instead of two separate checks. */ +#if TAMP_EXTENDED_DECOMPRESS + union { + struct { + uint8_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits needed) + uint8_t token_state; // 0=none, 1=RLE, 2=ext match, 3=ext match fresh (2 bits used) + }; + uint16_t pos_and_state; // Combined for fast 16-bit zero-check + }; +#else + union { + uint8_t bit_buffer_pos; // Bits currently in bit_buffer (6 bits needed) + uint8_t pos_and_state; // Alias for consistent access in main loop + }; +#endif +#if TAMP_EXTENDED_DECOMPRESS + uint16_t pending_window_offset; // Saved window_offset for extended match output-full resume + uint16_t pending_match_size; // Saved match_size for extended match resume +#endif /* WARM: read once at start of decompress, cached in locals */ uint8_t conf_window : 4; // Window bits from config uint8_t conf_literal : 4; // Literal bits from config uint8_t min_pattern_size : 2; // Minimum pattern size, 2 or 3 + uint8_t conf_extended : 1; // Extended format enabled (from header) /* COLD: rarely accessed (init or edge cases). * Bitfields save space; add new cold fields here. 
*/ - uint8_t skip_bytes : 4; // For output-buffer-limited resumption + uint8_t skip_bytes; // For output-buffer-limited resumption (v2 needs >4 bits) uint8_t window_bits_max : 4; // Max window bits buffer can hold uint8_t configured : 1; // Whether config has been set } TampDecompressor; diff --git a/tamp/cli/main.py b/tamp/cli/main.py index 27275c2..11f7062 100644 --- a/tamp/cli/main.py +++ b/tamp/cli/main.py @@ -119,6 +119,7 @@ def compress( ), ] = 8, lazy_matching: bool = False, + extended: bool = True, implementation: ImplementationType = None, ): """Compress an input file or stream. @@ -135,6 +136,8 @@ def compress( Number of bits used to represent a literal. lazy_matching: bool Use roughly 50% more cpu to get 0~2% better compression. + extended: bool + Use extended compression format (RLE, extended match encoding). implementation: Optional[Literal["c", "python"]] Explicitly specify which implementation to use (c or python). Defaults to auto-detection. """ @@ -145,6 +148,7 @@ def compress( window=window, literal=literal, lazy_matching=lazy_matching, + extended=extended, ) write(output, output_bytes) diff --git a/tamp/compressor.py b/tamp/compressor.py index 91c5516..f59b2e3 100644 --- a/tamp/compressor.py +++ b/tamp/compressor.py @@ -1,3 +1,11 @@ +"""Pure Python Tamp Compressor Reference Implementation. + +The goal of this module is for clarity and to be able to easily test new ideas. +Do not optimize this file for speed, unless it still maintains clarity. + +Some speed architectural optimizations might be tested here before implementing in other languages. +""" + from collections import deque from io import BytesIO @@ -13,26 +21,33 @@ from . 
import ExcessBitsError, bit_size, compute_min_pattern_size, initialize_dictionary -# encodes [min_pattern_bytes, min_pattern_bytes + 13] pattern lengths -_huffman_codes = b"\x00\x03\x08\x0b\x14$&+KT\x94\x95\xaa'" +# encodes [0, 14] pattern lengths +_huffman_codes = b"\x00\x03\x08\x0b\x14$&+KT\x94\x95\xaa'\xab" # These bit lengths pre-add the 1 bit for the 0-value is_literal flag. -_huffman_bits = b"\x02\x03\x05\x05\x06\x07\x07\x07\x08\x08\x09\x09\x09\x07" +_huffman_bits = b"\x02\x03\x05\x05\x06\x07\x07\x07\x08\x08\x09\x09\x09\x07\x09" _FLUSH_CODE = 0xAB # 8 bits +_RLE_SYMBOL = 12 +_RLE_MAX_WINDOW = 8 # Maximum number of RLE bytes to write to the window. +_EXTENDED_MATCH_SYMBOL = 13 +_LEADING_EXTENDED_MATCH_HUFFMAN_BITS = 3 +_LEADING_RLE_HUFFMAN_BITS = 4 class _BitWriter: """Writes bits to a stream.""" - def __init__(self, f, close_f_on_close=False): + def __init__(self, f, *, close_f_on_close: bool = False): self.close_f_on_close = close_f_on_close self.f = f - self.buffer = 0 # Basically a uint24 + self.buffer = 0 # Basically a uint32 self.bit_pos = 0 - def write_huffman(self, pattern_size): + def write_huffman_and_literal_flag(self, pattern_size): + # pattern_size in range [0, 14] return self.write(_huffman_codes[pattern_size], _huffman_bits[pattern_size]) def write(self, bits, num_bits, flush=True): + bits = int(bits) bits &= (1 << num_bits) - 1 self.bit_pos += num_bits self.buffer |= bits << (32 - self.bit_pos) @@ -75,7 +90,7 @@ def __init__(self, buffer): self.size = len(buffer) self.pos = 0 # Always pointing to the byte-to-be-overwritten - def write_byte(self, byte): # ~10% of time + def write_byte(self, byte): self.buffer[self.pos] = byte self.pos = (self.pos + 1) % self.size @@ -90,6 +105,32 @@ def index(self, pattern, start): raise ValueError("substring not found") return result + def write_from_self(self, position, size): + # Write up to end of buffer (no wrap) + remaining = self.size - self.pos + window_write = min(size, remaining) + # Read source data 
first to avoid overlap when source and destination ranges overlap + data = self.get(position, window_write) + for byte in data: + self.buffer[self.pos] = byte + self.pos += 1 + if self.pos == self.size: + self.pos = 0 + + def get(self, index, size): + out = bytearray(size) + for i in range(size): + pos = (index + i) % self.size + out[i] = self.buffer[pos] + return bytes(out) + + @property + def last_written_byte(self) -> int: + pos = self.pos - 1 + if pos < 0: + pos = self.size - 1 + return self.buffer[pos] # TODO: unit-test this thoroughly on initial start! + class Compressor: """Compresses data to a file or stream.""" @@ -102,6 +143,7 @@ def __init__( literal: int = 8, dictionary: Optional[bytearray] = None, lazy_matching: bool = False, + extended: bool = True, ): """ Parameters @@ -129,11 +171,24 @@ def __init__( lazy_matching: bool Use roughly 50% more cpu to get 0~2% better compression. """ - if lazy_matching: - raise NotImplementedError("lazy matching not implemented in pure python implementation.") + self.window_bits = window + self.literal_bits = literal + self.min_pattern_size = compute_min_pattern_size(window, literal) + self.extended: bool = extended + + self._rle_count = 0 + + # "+1" Because a RLE of 1 is not valid. + self._rle_max_size = (13 << _LEADING_RLE_HUFFMAN_BITS) + (1 << _LEADING_RLE_HUFFMAN_BITS) + 1 + + self._extended_match_count = 0 + self._extended_match_position = 0 + + self.lazy_matching = lazy_matching + self._cached_match_index = -1 + self._cached_match_size = 0 if not hasattr(f, "write"): # It's probably a path-like object. 
- # TODO: then close it on close f = open(str(f), "wb") close_f_on_close = True else: @@ -143,11 +198,15 @@ def __init__( if dictionary and bit_size(len(dictionary) - 1) != window: raise ValueError("Dictionary-window size mismatch.") - self.window_bits = window - self.literal_bits = literal - - self.min_pattern_size = compute_min_pattern_size(window, literal) - self.max_pattern_size = self.min_pattern_size + 13 + if self.extended: + self.max_pattern_size = ( + self.min_pattern_size + + 11 + + (13 << _LEADING_EXTENDED_MATCH_HUFFMAN_BITS) + + (1 << _LEADING_EXTENDED_MATCH_HUFFMAN_BITS) + ) + else: + self.max_pattern_size = self.min_pattern_size + 13 self.literal_flag = 1 << self.literal_bits @@ -155,26 +214,170 @@ def __init__( buffer=dictionary if dictionary else initialize_dictionary(1 << window), ) - self._input_buffer = deque(maxlen=self.max_pattern_size) + self._input_buffer = deque(maxlen=16) # matching the C implementation # Callbacks for debugging/metric collection; can be externally set. - self.token_cb = None + self.match_cb = None + self.extended_match_cb = None self.literal_cb = None self.flush_cb = None + self.rle_cb = None + + # For debugging: how many uncompressed bytes have we consumed so far. 
+ self.input_index = 0 # Write header self._bit_writer.write(window - 8, 3, flush=False) self._bit_writer.write(literal - 5, 2, flush=False) self._bit_writer.write(bool(dictionary), 1, flush=False) - self._bit_writer.write(0, 1, flush=False) # Reserved + self._bit_writer.write(self.extended, 1, flush=False) self._bit_writer.write(0, 1, flush=False) # No other header bytes + def _validate_no_match_overlap(self, write_pos, match_index, match_size): + """Check if writing a single byte will overlap with a future match section.""" + return write_pos < match_index or write_pos >= match_index + match_size + def _compress_input_buffer_single(self) -> int: - target = bytes(self._input_buffer) bytes_written = 0 - search_i = 0 - match_size = 1 - for match_size in range(self.min_pattern_size, len(target) + 1): + + if not self._input_buffer: + return bytes_written + + if self._extended_match_count: + while self._input_buffer: + if (self._extended_match_position + self._extended_match_count) >= self._window_buffer.size: + # Reached window boundary - emit match (no wrap-around, only 0.02% compression loss) + bytes_written += self._write_extended_match() + return bytes_written + + # Search the remainder of the window buffer for a longer match. 
+ target = self._window_buffer.get(self._extended_match_position, self._extended_match_count) + target += bytes([self._input_buffer[0]]) + search_i, match = self._search(target, start=self._extended_match_position) + match_size = len(match) + if match_size > self._extended_match_count: + self._input_buffer.popleft() + self._extended_match_count = match_size + self._extended_match_position = search_i + if self._extended_match_count == self.max_pattern_size: + bytes_written += self._write_extended_match() + return bytes_written + continue + else: + # We've found the end of the match + bytes_written += self._write_extended_match() + return bytes_written + + # We ran out of input_buffer, return so caller can re-populate the input_buffer + return bytes_written + + # RLE handling with persistent state (v2 only) + # Accumulate RLE count across compression cycles for better compression of long runs + if self.extended: + last_byte = self._window_buffer.last_written_byte + + # Count RLE bytes in current buffer WITHOUT consuming yet + rle_available = 0 + for byte in self._input_buffer: + if byte == last_byte and self._rle_count + rle_available < self._rle_max_size: + rle_available += 1 + else: + break + + total_rle = self._rle_count + rle_available + rle_ended = (rle_available < len(self._input_buffer)) or (total_rle >= self._rle_max_size) + + # If RLE hasn't ended and we haven't hit max, consume and wait for more + if not rle_ended and total_rle > 0: + self._rle_count = total_rle + for _ in range(rle_available): + self._input_buffer.popleft() + return bytes_written + + # RLE run has ended - decide between RLE and pattern match + if total_rle >= 2: + use_pattern = False + + # For short RLE runs (all from this call), check if pattern match is better + if total_rle == rle_available and total_rle <= 6: + target = bytes(self._input_buffer) + search_i, match = self._search(target, start=0) + match_size = len(match) + + if match_size > total_rle: + use_pattern = True + # Don't 
consume RLE bytes - fall through to pattern matching + + if not use_pattern: + # Use RLE - consume bytes and write token + for _ in range(rle_available): + self._input_buffer.popleft() + self._rle_count = total_rle + bytes_written += self._write_rle() + return bytes_written + self._rle_count = 0 + elif total_rle == 1: + # Single byte - not worth RLE, will be handled as literal/pattern + self._rle_count = 0 + + # Normal pattern matching + target = bytes(self._input_buffer) + + if self.lazy_matching and self._cached_match_index >= 0: + search_i = self._cached_match_index + match_size = self._cached_match_size + match = self._window_buffer.get(search_i, match_size) + self._cached_match_index = -1 + else: + search_i, match = self._search(target, start=0) + match_size = len(match) + + # Lazy matching logic + if ( + self.lazy_matching + and match_size >= self.min_pattern_size + and match_size <= 8 + and len(self._input_buffer) > match_size + 2 + ): + # Check if next position has a better match + next_target = bytes(list(self._input_buffer)[1:]) # Skip first byte + next_search_i, next_match = self._search(next_target, start=0) + next_match_size = len(next_match) + + # If next position has a better match, and the match doesn't overlap with the literal we are writing + if next_match_size > match_size and self._validate_no_match_overlap( + self._window_buffer.pos, next_search_i, next_match_size + ): + # Write literal at current position and cache the next match + literal = self._input_buffer.popleft() + bytes_written += self._write_literal(literal) + self._cached_match_index = next_search_i + self._cached_match_size = next_match_size + return bytes_written + + if match_size >= self.min_pattern_size: + if self.extended and match_size > (self.min_pattern_size + 11): + # Protects +12 to be RLE symbol, and +13 to be extended match symbol + self._extended_match_position = search_i + self._extended_match_count = match_size + else: + bytes_written += self._write_match(search_i, 
match) + + for _ in range(match_size): + self._input_buffer.popleft() + else: + literal = self._input_buffer.popleft() + bytes_written += self._write_literal(literal) + + return bytes_written + + def _search(self, target: bytes, start=0): + match_size = 0 + search_i = start + for match_size in range( + self.min_pattern_size, + min(len(target), self.max_pattern_size) + 1, + ): match = target[:match_size] try: search_i = self._window_buffer.index(match, search_i) @@ -183,30 +386,92 @@ def _compress_input_buffer_single(self) -> int: match_size -= 1 break match = target[:match_size] + return search_i, match - if match_size >= self.min_pattern_size: - if self.token_cb: - self.token_cb( - search_i, - match_size, - match, - ) - bytes_written += self._bit_writer.write_huffman(match_size - self.min_pattern_size) - bytes_written += self._bit_writer.write(search_i, self.window_bits) - self._window_buffer.write_bytes(match) + def _write_extended_huffman(self, value, leading_bits): + bytes_written = 0 + # the upper bits can have values [0, 13] + mask = (1 << leading_bits) - 1 + if value > ((13 << leading_bits) + mask) or value < 0: + raise ValueError + code_index = value >> leading_bits + # Don't use write_huffman_and_literal_flag since we don't want to write a flag. 
+ bytes_written += self._bit_writer.write(_huffman_codes[code_index], _huffman_bits[code_index] - 1) + bytes_written += self._bit_writer.write(value & mask, leading_bits) + return bytes_written - for _ in range(match_size): - self._input_buffer.popleft() + def _write_extended_match(self): + bytes_written = 0 + if self.extended_match_cb: + string = self._window_buffer.get(self._extended_match_position, self._extended_match_count) + self.extended_match_cb( + self._window_buffer.pos, self._extended_match_position, self._extended_match_count, string + ) + # Format: symbol, size (huffman+trailing), position + bytes_written += self._bit_writer.write_huffman_and_literal_flag(_EXTENDED_MATCH_SYMBOL) + bytes_written += self._write_extended_huffman( + self._extended_match_count - self.min_pattern_size - 11 - 1, + _LEADING_EXTENDED_MATCH_HUFFMAN_BITS, + ) + bytes_written += self._bit_writer.write(self._extended_match_position, self.window_bits) + + self._window_buffer.write_from_self(self._extended_match_position, self._extended_match_count) + + # Reset state + self._extended_match_count = 0 + self._extended_match_position = 0 # Technically not necessary. 
+ + return bytes_written + + def _write_literal(self, literal) -> int: + bytes_written = 0 + if self.literal_cb: + self.literal_cb(literal) + if literal >> self.literal_bits: + raise ExcessBitsError + + bytes_written += self._bit_writer.write(literal | self.literal_flag, self.literal_bits + 1) + self._window_buffer.write_byte(literal) + return bytes_written + + def _write_match(self, search_i, match) -> int: + match_size = len(match) + + if self.match_cb: + self.match_cb( + self._window_buffer.pos, + search_i, + match_size, + match, + ) + + bytes_written = 0 + bytes_written += self._bit_writer.write_huffman_and_literal_flag(match_size - self.min_pattern_size) + bytes_written += self._bit_writer.write(search_i, self.window_bits) + self._window_buffer.write_bytes(match) + return bytes_written + + def _write_rle(self) -> int: + bytes_written = 0 + last_written_byte = self._window_buffer.last_written_byte + + if self._rle_count == 0: + raise ValueError("No RLE to write.") + elif self._rle_count == 1: + # Just write a literal + bytes_written += self._write_literal(last_written_byte) else: - char = self._input_buffer.popleft() - if self.literal_cb: - self.literal_cb(char) - if char >> self.literal_bits: - raise ExcessBitsError + if self.rle_cb: + self.rle_cb(self._rle_count, last_written_byte) + bytes_written += self._bit_writer.write_huffman_and_literal_flag(_RLE_SYMBOL) + bytes_written += self._write_extended_huffman(self._rle_count - 2, _LEADING_RLE_HUFFMAN_BITS) - bytes_written += self._bit_writer.write(char | self.literal_flag, self.literal_bits + 1) - self._window_buffer.write_byte(char) + # Write up to 8 bytes to the window (up to end of buffer, no wrap). 
+ remaining = self._window_buffer.size - self._window_buffer.pos + window_write = min(self._rle_count, _RLE_MAX_WINDOW, remaining) + self._window_buffer.write_bytes(bytes([last_written_byte]) * window_write) + self._rle_count = 0 return bytes_written def write(self, data: Union[bytes, bytearray]) -> int: @@ -225,8 +490,12 @@ def write(self, data: Union[bytes, bytearray]) -> int: """ bytes_written = 0 - for char in data: - self._input_buffer.append(char) + self.input_index = 0 + while self.input_index < len(data): + if len(self._input_buffer) != self._input_buffer.maxlen: + self._input_buffer.append(data[self.input_index]) + self.input_index += 1 + if len(self._input_buffer) == self._input_buffer.maxlen: bytes_written += self._compress_input_buffer_single() @@ -255,7 +524,18 @@ def flush(self, write_token: bool = True) -> int: self.flush_cb() while self._input_buffer: bytes_written += self._compress_input_buffer_single() - bytes_written += self._bit_writer.flush(write_token=write_token) + if self.extended and self._rle_count: + bytes_written += self._write_rle() + if self.extended and self._extended_match_count: + bytes_written += self._write_extended_match() + + # Clear any cached lazy matching state + if self.lazy_matching: + self._cached_match_index = -1 + self._cached_match_size = 0 + + bytes_written_flush = self._bit_writer.flush(write_token=write_token) + bytes_written += bytes_written_flush return bytes_written def close(self) -> int: @@ -300,6 +580,7 @@ def compress( literal: int = 8, dictionary: Optional[bytearray] = None, lazy_matching: bool = False, + extended: bool = True, ) -> bytes: """Single-call to compress data. @@ -326,6 +607,8 @@ def compress( first be initialized with :func:`~tamp.initialize_dictionary` lazy_matching: bool Use roughly 50% more cpu to get 0~2% better compression. + extended: bool + Use extended compression format. Defaults to True. 
Returns ------- @@ -340,6 +623,7 @@ def compress( literal=literal, dictionary=dictionary, lazy_matching=lazy_matching, + extended=extended, ) c.write(data) else: @@ -349,6 +633,7 @@ def compress( literal=literal, dictionary=dictionary, lazy_matching=lazy_matching, + extended=extended, ) c.write(data) c.flush(write_token=False) diff --git a/tamp/ctamp.pxd b/tamp/ctamp.pxd index 087ff62..3a70308 100644 --- a/tamp/ctamp.pxd +++ b/tamp/ctamp.pxd @@ -6,6 +6,7 @@ cdef extern from "tamp/common.h": int window int literal bool use_custom_dictionary + bool extended # Extended format (RLE, extended match). Read from header bit [1]. # The lazy_matching field is conditionally compiled based on TAMP_LAZY_MATCHING # We declare it here, but accessing it when the macro is disabled will cause compile errors # This is handled in the Cython code by always setting it when the struct is initialized diff --git a/tamp/decompressor.py b/tamp/decompressor.py index fb2fec7..6216b3e 100644 --- a/tamp/decompressor.py +++ b/tamp/decompressor.py @@ -10,6 +10,13 @@ _CHUNK_SIZE = 1 << 20 _FLUSH = object() +# These variables must match compressor.py +_RLE_SYMBOL = 12 +_EXTENDED_MATCH_SYMBOL = 13 +_RLE_MAX_WINDOW = 8 # Maximum number of RLE bytes to write to the window. +_LEADING_EXTENDED_MATCH_HUFFMAN_BITS = 3 +_LEADING_RLE_HUFFMAN_BITS = 4 + # Each key here are the huffman codes or'd with 0x80 # This is so that each lookup is easy/quick. 
_huffman_lookup = { @@ -57,15 +64,15 @@ def read(self, num_bits): if not byte: raise EOFError byte_value = int.from_bytes(byte, "little") - self.buffer |= byte_value << (24 - self.bit_pos) + self.buffer |= byte_value << (56 - self.bit_pos) self.bit_pos += 8 if self.backup_buffer is not None and self.backup_bit_pos is not None: - self.backup_buffer |= byte_value << (24 - self.backup_bit_pos) + self.backup_buffer |= byte_value << (56 - self.backup_bit_pos) self.backup_bit_pos += 8 - result = self.buffer >> (32 - num_bits) - mask = (1 << (32 - num_bits)) - 1 + result = self.buffer >> (64 - num_bits) + mask = (1 << (64 - num_bits)) - 1 self.buffer = (self.buffer & mask) << num_bits self.bit_pos -= num_bits @@ -120,6 +127,20 @@ def write_bytes(self, data): for byte in data: self.write_byte(byte) + def get(self, index, size): + out = bytearray(size) + for i in range(size): + pos = (index + i) % self.size + out[i] = self.buffer[pos] + return bytes(out) + + @property + def last_written_byte(self) -> int: + pos = self.pos - 1 + if pos < 0: + pos = self.size - 1 + return self.buffer[pos] # TODO: unit-test this thoroughly on initial start! + class Decompressor: """Decompresses a file or stream of tamp-compressed data. @@ -158,12 +179,9 @@ def __init__(self, f, *, dictionary: Optional[bytearray] = None): self.window_bits = self._bit_reader.read(3) + 8 self.literal_bits = self._bit_reader.read(2) + 5 uses_custom_dictionary = self._bit_reader.read(1) - reserved = self._bit_reader.read(1) + self.extended = self._bit_reader.read(1) more_header_bytes = self._bit_reader.read(1) - if reserved: - raise NotImplementedError - if more_header_bytes: raise NotImplementedError @@ -176,6 +194,7 @@ def __init__(self, f, *, dictionary: Optional[bytearray] = None): self.min_pattern_size = compute_min_pattern_size(self.window_bits, self.literal_bits) + # Used to store decoded bytes that do not currently fit in the output buffer. 
self.overflow = bytearray() def readinto(self, buf: bytearray) -> int: @@ -191,49 +210,82 @@ def readinto(self, buf: bytearray) -> int: int Number of bytes decompressed into buffer. """ + bytes_written = 0 + if len(self.overflow) > len(buf): buf[:] = self.overflow[: len(buf)] - written = len(buf) + bytes_written += len(buf) self.overflow = self.overflow[len(buf) :] - return written + return bytes_written elif self.overflow: buf[: len(self.overflow)] = self.overflow - written = len(self.overflow) + bytes_written += len(self.overflow) self.overflow = bytearray() - else: - written = 0 - while written < len(buf): + def write_to_output(string): + nonlocal bytes_written + match_size = len(string) + to_buf = min(len(buf) - bytes_written, match_size) + buf[bytes_written : bytes_written + to_buf] = string[:to_buf] + bytes_written += to_buf + if to_buf < match_size: + self.overflow[:] = string[to_buf:] + return False # stop decoding + return True + + while bytes_written < len(buf): try: with self._bit_reader: is_literal = self._bit_reader.read(1) if is_literal: - c = self._bit_reader.read(self.literal_bits) - self._window_buffer.write_byte(c) - buf[written] = c - written += 1 + string = bytes([self._bit_reader.read(self.literal_bits)]) + self._window_buffer.write_bytes(string) else: match_size = self._bit_reader.read_huffman() if match_size is _FLUSH: self._bit_reader.clear() continue - match_size += self.min_pattern_size - index = self._bit_reader.read(self.window_bits) - - string = self._window_buffer.buffer[index : index + match_size] - self._window_buffer.write_bytes(string) - - to_buf = min(len(buf) - written, match_size) - buf[written : written + to_buf] = string[:to_buf] - written += to_buf - if to_buf < match_size: - self.overflow[:] = string[to_buf:] - break + if self.extended and match_size > 11: + if match_size == _RLE_SYMBOL: + rle_count = self._bit_reader.read_huffman() + rle_count <<= _LEADING_RLE_HUFFMAN_BITS + rle_count += 
self._bit_reader.read(_LEADING_RLE_HUFFMAN_BITS) + rle_count += 1 + 1 + symbol = self._window_buffer.last_written_byte + string = bytes([symbol]) * rle_count + remaining = self._window_buffer.size - self._window_buffer.pos + window_write = min(rle_count, _RLE_MAX_WINDOW, remaining) + self._window_buffer.write_bytes(string[:window_write]) + elif match_size == _EXTENDED_MATCH_SYMBOL: + # Format: size (huffman+trailing), then position + match_size = self._bit_reader.read_huffman() + match_size <<= _LEADING_EXTENDED_MATCH_HUFFMAN_BITS + match_size += self._bit_reader.read(_LEADING_EXTENDED_MATCH_HUFFMAN_BITS) + match_size += self.min_pattern_size + 11 + 1 + index = self._bit_reader.read(self.window_bits) + + string = self._window_buffer.get(index, match_size) + + # Write up to end of buffer (no wrap) + remaining = self._window_buffer.size - self._window_buffer.pos + window_write = min(match_size, remaining) + self._window_buffer.write_bytes(string[:window_write]) + else: + raise ValueError("unreachable") + else: + match_size += self.min_pattern_size + index = self._bit_reader.read(self.window_bits) + + string = self._window_buffer.get(index, match_size) + self._window_buffer.write_bytes(string) + + if not write_to_output(string): + break except EOFError: break - return written + return bytes_written def read(self, size: int = -1) -> bytearray: """Decompresses data to bytes. 
diff --git a/tests/test_cli.py b/tests/test_cli.py index c23917d..b6b79ba 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -51,7 +51,7 @@ def test_compress_file_to_stdout(self): test_file.write_bytes(b"foo foo foo") with patch("sys.stdout.buffer.write") as mock_stdout: - app(["compress", str(test_file)], **_app_kwargs) + app(["compress", "--no-extended", str(test_file)], **_app_kwargs) mock_stdout.assert_called_once_with(compressed_foo_foo_foo) def test_compress_stdin_to_stdout(self): @@ -59,7 +59,7 @@ def test_compress_stdin_to_stdout(self): patch("sys.stdout.buffer.write") as mock_stdout, patch("sys.stdin.buffer.read", return_value="foo foo foo"), ): - app("compress", **_app_kwargs) + app(["compress", "--no-extended"], **_app_kwargs) mock_stdout.assert_called_once_with(compressed_foo_foo_foo) def test_decompress_file_to_stdout(self): diff --git a/tests/test_compressor.py b/tests/test_compressor.py index 188447c..309dcd7 100644 --- a/tests/test_compressor.py +++ b/tests/test_compressor.py @@ -36,12 +36,6 @@ NativeExcessBitsError = ExcessBitsError if micropython: - from tamp.compressor_viper import Compressor as ViperCompressor - from tamp.compressor_viper import compress as viper_compress - - Compressors.append(ViperCompressor) - compresses.append(viper_compress) - try: from tamp_native import Compressor as NativeCompressor from tamp_native import ExcessBitsError as NativeExcessBitsError @@ -94,7 +88,7 @@ def test_compressor_default(self): bytes_written = 0 with io.BytesIO() as f: - compressor = Compressor(f) + compressor = Compressor(f, extended=False) bytes_written += compressor.write(test_string) bytes_written += compressor.flush(write_token=False) @@ -106,7 +100,7 @@ def test_compressor_default(self): # Test Context Manager bytes_written = 0 - with io.BytesIO() as f, Compressor(f) as compressor: + with io.BytesIO() as f, Compressor(f, extended=False) as compressor: bytes_written += compressor.write(test_string) bytes_written += 
compressor.flush(write_token=False) @@ -137,7 +131,7 @@ def test_compressor_input_buffer(self): ) with io.BytesIO() as f: - compressor = Compressor(f) + compressor = Compressor(f, extended=False) compressor.write(b"f") compressor.write(b"oo") compressor.write(b" fo") @@ -171,7 +165,7 @@ def test_compressor_7bit(self): # fmt: on ) with io.BytesIO() as f: - compressor = Compressor(f, literal=7) + compressor = Compressor(f, literal=7, extended=False) compressor.write(test_string) compressor.flush(write_token=False) @@ -200,7 +194,7 @@ def test_compressor_predefined_dictionary(self): ) with io.BytesIO() as f: - compressor = Compressor(f, window=8, literal=7, dictionary=dictionary) + compressor = Compressor(f, window=8, literal=7, dictionary=dictionary, extended=False) compressor.write(test_string) compressor.flush(write_token=False) @@ -223,7 +217,7 @@ def test_oob_2_byte_pattern(self): test_string = memoryview(test_string_extended)[:3] # b"Q\x00Q" with io.BytesIO() as f: - compressor = Compressor(f) + compressor = Compressor(f, extended=False) compressor.write(test_string) compressor.flush(write_token=False) @@ -245,7 +239,7 @@ def test_oob_2_byte_pattern(self): def test_excess_bits(self): for Compressor in Compressors: with self.subTest(Compressor=Compressor), io.BytesIO() as f: - compressor = Compressor(f, literal=7) + compressor = Compressor(f, literal=7, extended=False) with self.assertRaises((ExcessBitsError, NativeExcessBitsError)): compressor.write(b"\xff") @@ -271,7 +265,7 @@ def test_single_shot_compress_text(self): ] # fmt: on ) - self.assertEqual(compress("foo foo foo"), expected) + self.assertEqual(compress("foo foo foo", extended=False), expected) def test_single_shot_compress_binary(self): for compress in compresses: @@ -293,7 +287,7 @@ def test_single_shot_compress_binary(self): ] # fmt: on ) - self.assertEqual(compress(b"foo foo foo"), expected) + self.assertEqual(compress(b"foo foo foo", extended=False), expected) def test_invalid_conf(self): for 
Compressor in Compressors: diff --git a/tests/test_compressor_decompressor.py b/tests/test_compressor_decompressor.py index 22b964a..aa0a8e1 100644 --- a/tests/test_compressor_decompressor.py +++ b/tests/test_compressor_decompressor.py @@ -19,21 +19,16 @@ CCompressor = None CDecompressor = None - ViperCompressor = None - ViperDecompressor = None NativeCompressor = None NativeDecompressor = None else: - # MicroPython: only test Viper and Native implementations + # MicroPython: only test Native implementation # Pure Python and Cython implementations use CPython-specific features PyCompressor = None PyDecompressor = None CCompressor = None CDecompressor = None - from tamp.compressor_viper import Compressor as ViperCompressor - from tamp.decompressor_viper import Decompressor as ViperDecompressor - try: from tamp_native import Compressor as NativeCompressor from tamp_native import Decompressor as NativeDecompressor @@ -43,8 +38,8 @@ NativeDecompressor = None -Compressors = (PyCompressor, CCompressor, ViperCompressor, NativeCompressor) -Decompressors = (PyDecompressor, CDecompressor, ViperDecompressor, NativeDecompressor) +Compressors = (PyCompressor, CCompressor, NativeCompressor) +Decompressors = (PyDecompressor, CDecompressor, NativeDecompressor) def walk_compressors_decompressors(): diff --git a/tests/test_dataset_regression.py b/tests/test_dataset_regression.py index 734401e..a6716cf 100644 --- a/tests/test_dataset_regression.py +++ b/tests/test_dataset_regression.py @@ -73,6 +73,66 @@ ), ] +# Extended format datasets (uses RLE and Extended Match encoding) +EXTENDED_DATASETS = [ + ( + "datasets/extended-compressed/RPI_PICO-20250415-v1.25.0.uf2.tamp", + "e0c40eacf1afc550a6add74888c48bb981b28788a6d75a62a0e2444e997b9864", + ), + ( + "datasets/extended-compressed/dickens.tamp", + "b24c37886142e11d0ee687db6ab06f936207aa7f2ea1fd1d9a36763c7a507e6a", + ), + ( + "datasets/extended-compressed/mr.tamp", + "68637ed52e3e4860174ed2dc0840ac77d5f1a60abbcb13770d5754e3774d53e6", + 
), + ( + "datasets/extended-compressed/ooffice.tamp", + "e7ee013880d34dd5208283d0d3d91b07f442e067454276095ded14f322a656eb", + ), + ( + "datasets/extended-compressed/osdb.tamp", + "60f027179302ca3ad87c58ac90b6be72ec23588aaa7a3b7fe8ecc0f11def3fa3", + ), + ( + "datasets/extended-compressed/reymont.tamp", + "0eac0114a3dfe6e2ee1f345a0f79d653cb26c3bc9f0ed79238af4933422b7578", + ), + ( + "datasets/extended-compressed/sao.tamp", + "c2d0ea2cc59d4c21b7fe43a71499342a00cbe530a1d5548770e91ecd6214adcc", + ), + ( + "datasets/extended-compressed/x-ray.tamp", + "7de9fce1405dc44ae5e6813ed21cd5751e761bd4265655a005d39b9685d1c9ad", + ), + ( + "datasets/extended-compressed/xml.tamp", + "0e82e54e695c1938e4193448022543845b33020c8be6bf3bf3ead2224903e08c", + ), + ( + "datasets/extended-compressed/samba.tamp", + "93ba07bc44d8267789c1d911992f40b089ffa2140b4a160fac11ccae9a40e7b2", + ), + ( + "datasets/extended-compressed/nci.tamp", + "fc63a31770947b8c2062d3b19ca94c00485a232bb91b502021948fee983e1635", + ), + ( + "datasets/extended-compressed/webster.tamp", + "6a68f69b26daf09f9dd84f7470368553194a0b294fcfa80f1604efb11143a383", + ), + ( + "datasets/extended-compressed/mozilla.tamp", + "657fc3764b0c75ac9de9623125705831ebbfbe08fed248df73bc2dc66e2a963b", + ), + ( + "datasets/extended-compressed/enwik8.tamp", + "2b49720ec4d78c3c9fabaee6e4179a5e997302b3a70029f30f2d582218c024a8", + ), +] + class TestV1Decompression(unittest.TestCase): @pytest.mark.dataset @@ -90,5 +150,21 @@ def test_v1_decompress(self): self.assertEqual(actual, expected_sha256, f"SHA256 mismatch for {rel_path} using {impl_name}") +class TestExtendedDecompression(unittest.TestCase): + @pytest.mark.dataset + def test_extended_decompress(self): + for impl_name, decompress_func in DECOMPRESSOR_IMPLEMENTATIONS: + for rel_path, expected_sha256 in EXTENDED_DATASETS: + with self.subTest(implementation=impl_name, dataset=rel_path): + path = PROJECT_DIR / rel_path + + with open(path, "rb") as f: + data = f.read() + + decompressed = 
decompress_func(data) + actual = hashlib.sha256(decompressed).hexdigest() + self.assertEqual(actual, expected_sha256, f"SHA256 mismatch for {rel_path} using {impl_name}") + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_decompressor.py b/tests/test_decompressor.py index 26e9d2f..df4cc0b 100644 --- a/tests/test_decompressor.py +++ b/tests/test_decompressor.py @@ -26,11 +26,14 @@ pass else: - from tamp.decompressor_viper import Decompressor as ViperDecompressor - from tamp.decompressor_viper import decompress as viper_decompress + try: + from tamp_native import Decompressor as NativeDecompressor + from tamp_native import decompress as native_decompress - Decompressors.append(ViperDecompressor) - decompresses.append(viper_decompress) + Decompressors.append(NativeDecompressor) + decompresses.append(native_decompress) + except ImportError: + pass class TestDecompressor(unittest.TestCase): diff --git a/tests/test_pseudorandom.py b/tests/test_pseudorandom.py index f2ca6f5..62d0e75 100644 --- a/tests/test_pseudorandom.py +++ b/tests/test_pseudorandom.py @@ -12,12 +12,6 @@ micropython = None if micropython: - import tamp.compressor_viper - import tamp.decompressor_viper - - modules.append(tamp.compressor_viper) - modules.append(tamp.decompressor_viper) - try: import tamp_native diff --git a/tools/print_compressed_sizes.py b/tools/print_compressed_sizes.py new file mode 100644 index 0000000..7cb809e --- /dev/null +++ b/tools/print_compressed_sizes.py @@ -0,0 +1,47 @@ +""" +Print compressed sizes for test files used in optimize-extended-huffman.py. + +This script compresses the same files that optimize-extended-huffman.py uses +and prints the compressed size for each file with thousands separators. 
+""" + +from pathlib import Path + +import tamp.compressor + + +def main(): + # Define test files (same as optimize-extended-huffman.py) + datasets_dir = Path(__file__).parent.parent / "datasets" + test_files = [ + datasets_dir / "enwik8", + datasets_dir / "RPI_PICO-20250415-v1.25.0.uf2", + *(datasets_dir / "silesia").iterdir(), + ] + test_files.sort() + + ratios = [] + for file_path in test_files: + # Read and compress the file + data = file_path.read_bytes() + if len(data) == 0: + print(f"{file_path.name}: Empty file") + continue + + compressed_data = tamp.compressor.compress(data) + + original_size = len(data) + compressed_size = len(compressed_data) + + ratio = original_size / compressed_size + ratios.append(ratio) + + # Print with thousands separators + print(f"{file_path.name}: {compressed_size:,} (**{ratio:.3f}**)") + + avg = sum(ratios) / len(ratios) + print(f"Average Ratio: {avg}") + + +if __name__ == "__main__": + main() diff --git a/wasm/src/streams.js b/wasm/src/streams.js index d342002..2a6c350 100644 --- a/wasm/src/streams.js +++ b/wasm/src/streams.js @@ -41,9 +41,17 @@ export class TampCompressionStream extends TransformStream { } finally { if (compressor) { compressor.destroy(); + compressor = null; } } }, + + cancel(_reason) { + if (compressor) { + compressor.destroy(); + compressor = null; + } + }, }); } } @@ -88,9 +96,17 @@ export class TampDecompressionStream extends TransformStream { } finally { if (decompressor) { decompressor.destroy(); + decompressor = null; } } }, + + cancel(_reason) { + if (decompressor) { + decompressor.destroy(); + decompressor = null; + } + }, }); } } diff --git a/wasm/src/tamp.d.ts b/wasm/src/tamp.d.ts index 584e2c4..8f19674 100644 --- a/wasm/src/tamp.d.ts +++ b/wasm/src/tamp.d.ts @@ -43,6 +43,8 @@ export interface TampOptions { literal?: number; /** Custom dictionary data. If null, no custom dictionary is used. If Uint8Array, uses the provided dictionary. 
Default: null */ dictionary?: Uint8Array | null; + /** Enable extended format (RLE, extended match) for better compression ratios. Default: true */ + extended?: boolean; /** Enable lazy matching for better compression ratios. Default: false */ lazy_matching?: boolean; } @@ -68,6 +70,7 @@ export interface TampDefaults { readonly window: 10; readonly literal: 8; readonly dictionary: null; + readonly extended: true; readonly lazy_matching: false; } diff --git a/wasm/src/tamp.js b/wasm/src/tamp.js index f52788d..df82ba7 100644 --- a/wasm/src/tamp.js +++ b/wasm/src/tamp.js @@ -122,6 +122,7 @@ export class TampCompressor { window: 10, literal: 8, dictionary: null, + extended: true, lazy_matching: false, ...options, }; @@ -183,7 +184,8 @@ export class TampCompressor { (this.options.window & 0xf) | ((this.options.literal & 0xf) << 4) | ((this.options.dictionary ? 1 : 0) << 8) | - ((this.options.lazy_matching ? 1 : 0) << 9); + ((this.options.extended ? 1 : 0) << 9) | + ((this.options.lazy_matching ? 
1 : 0) << 10); this.module.setValue(confPtr, confValue, 'i32'); // Initialize compressor @@ -790,10 +792,12 @@ export async function compress(data, options = {}) { const callbackOptions = {}; // Extract compression-specific options - const { window, literal, dictionary, lazy_matching, onPoll, signal, pollIntervalMs, pollIntervalBytes } = options; + const { window, literal, dictionary, extended, lazy_matching, onPoll, signal, pollIntervalMs, pollIntervalBytes } = + options; if (window !== undefined) compressionOptions.window = window; if (literal !== undefined) compressionOptions.literal = literal; if (dictionary !== undefined) compressionOptions.dictionary = dictionary; + if (extended !== undefined) compressionOptions.extended = extended; if (lazy_matching !== undefined) compressionOptions.lazy_matching = lazy_matching; // Extract callback options diff --git a/website/index.html b/website/index.html index 8847451..50ec9a6 100644 --- a/website/index.html +++ b/website/index.html @@ -144,6 +144,16 @@

Configuration +
+ + +
+ ? +
+ Enables extended compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases. +
+
+
@@ -182,6 +192,16 @@

Configuration

+
+ + +
+ ? +
+ Enables extended compression format with RLE and extended match encoding for better compression ratios. Recommended for most use cases. +
+
+
@@ -194,7 +214,7 @@

Configuration

- +