From f315820880085c9cd99261232306aa185f383484 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 16:58:41 -0400 Subject: [PATCH 01/26] reset to main with changes --- .github/workflows/ci.yml | 79 +- Cargo.toml | 4 +- README.md | 244 +++-- benches/bench_utils.rs | 215 ++-- benches/fastpfor_benchmark.rs | 290 ++++-- cpp | 2 +- fuzz/Cargo.toml | 34 +- fuzz/README.md | 19 + fuzz/clippy.toml | 1 + fuzz/fuzz_targets/common.rs | 260 ++--- fuzz/fuzz_targets/compare_fastpfor_128.rs | 64 ++ fuzz/fuzz_targets/cpp_roundtrip.rs | 40 +- fuzz/fuzz_targets/decode_arbitrary.rs | 61 ++ fuzz/fuzz_targets/decode_oracle.rs | 65 ++ fuzz/fuzz_targets/encode_compare.rs | 62 ++ fuzz/fuzz_targets/encode_oracle.rs | 35 + fuzz/fuzz_targets/rust_compress_oracle.rs | 79 -- .../fuzz_targets/rust_decompress_arbitrary.rs | 81 -- fuzz/fuzz_targets/rust_decompress_oracle.rs | 73 -- fuzz/justfile | 22 +- justfile | 6 +- src/codec.rs | 3 + src/cpp/codecs.rs | 6 +- src/cpp/tests.rs | 80 +- src/error.rs | 12 +- src/helpers.rs | 2 - src/lib.rs | 56 +- src/rust/composite.rs | 258 +++++ src/rust/integer_compression/codec.rs | 153 --- src/rust/integer_compression/composition.rs | 100 -- .../integer_compression/differential/mod.rs | 61 -- src/rust/integer_compression/fastpfor.rs | 986 +++++++++++------- src/rust/integer_compression/integer_codec.rs | 39 - src/rust/integer_compression/just_copy.rs | 136 +-- src/rust/integer_compression/mod.rs | 5 - .../integer_compression/skippable_codec.rs | 101 -- src/rust/integer_compression/variable_byte.rs | 314 ++++-- src/rust/mod.rs | 23 +- tests/basic_tests.rs | 513 ++------- tests/benchmark_smoke.rs | 126 +-- tests/common.rs | 95 +- tests/cpp_compat_tests.rs | 160 ++- tests/decode_error_paths.rs | 493 --------- tests/encode_paths.rs | 175 ++-- 44 files changed, 2558 insertions(+), 3075 deletions(-) create mode 120000 fuzz/clippy.toml create mode 100644 fuzz/fuzz_targets/compare_fastpfor_128.rs create mode 100644 fuzz/fuzz_targets/decode_arbitrary.rs 
create mode 100644 fuzz/fuzz_targets/decode_oracle.rs create mode 100644 fuzz/fuzz_targets/encode_compare.rs create mode 100644 fuzz/fuzz_targets/encode_oracle.rs delete mode 100644 fuzz/fuzz_targets/rust_compress_oracle.rs delete mode 100644 fuzz/fuzz_targets/rust_decompress_arbitrary.rs delete mode 100644 fuzz/fuzz_targets/rust_decompress_oracle.rs create mode 100644 src/rust/composite.rs delete mode 100644 src/rust/integer_compression/codec.rs delete mode 100644 src/rust/integer_compression/composition.rs delete mode 100644 src/rust/integer_compression/differential/mod.rs delete mode 100644 src/rust/integer_compression/integer_codec.rs delete mode 100644 src/rust/integer_compression/skippable_codec.rs delete mode 100644 tests/decode_error_paths.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 994260b..0794361 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,78 @@ jobs: prefix-key: "v0-${{ matrix.simd_mode }}" - uses: taiki-e/install-action@v2 with: { tool: 'just,cargo-binstall' } - - run: just ci-test + # Enable core dumps so SIGSEGV crashes produce a dump for post-mortem analysis. + # ulimit is per-process, so it must be set in the same shell that runs the tests. + - name: Run tests (with core dumps enabled) + run: | + if [[ "$RUNNER_OS" == "macOS" ]]; then + sudo sysctl -w kern.coredump=1 + sudo sysctl -w kern.corefile='/cores/core.%N.%P' + sudo mkdir -p /cores && sudo chmod 1777 /cores + else + sudo sysctl -w kernel.core_pattern='/tmp/cores/core.%e.%p' + sudo mkdir -p /tmp/cores && sudo chmod 1777 /tmp/cores + fi + ulimit -c unlimited + just ci-test + # On failure, extract a minimal text report from any core dump (backtrace, file info). + # We deliberately avoid uploading the raw core dump (can be ~1.5 GB). 
+ - name: Collect crash report + if: failure() + run: | + mkdir -p crash-report + CORE_DIR=$([[ "$RUNNER_OS" == "macOS" ]] && echo /cores || echo /tmp/cores) + for core in "$CORE_DIR"/core.* ; do + [[ -f "$core" ]] || continue + report="crash-report/$(basename "$core").txt" + { + echo "=== Core dump ===" + echo "Path: $core" + echo "File: $(file "$core")" + echo "" + if [[ "$RUNNER_OS" == "macOS" ]]; then + bin_name=$(basename "$core" | sed -E 's/core\.([^.]+)\.[0-9]+/\1/') + bin=$(find target -path '*/deps/*' -name "${bin_name}*" -type f -perm /111 ! -name '*.d' 2>/dev/null | head -1) + [[ -z "$bin" ]] && bin=$(find target/debug -maxdepth 2 -name "${bin_name}*" -type f -perm /111 2>/dev/null | head -1) + echo "Binary: ${bin:-not found}" + echo "" + echo "=== Backtrace ===" + if [[ -n "$bin" && -x "$bin" ]]; then + lldb --no-lldbinit --batch \ + -o "target create --core '$core' '$bin'" \ + -o "thread backtrace all" \ + -o quit 2>&1 + else + echo "(trying lldb with core only)" + lldb --no-lldbinit --batch \ + -o "target create --core '$core'" \ + -o "thread backtrace all" \ + -o quit 2>&1 + fi + else + bin=$(file "$core" | sed -n "s/.*from '\([^']*\)'.*/\1/p") + [[ -z "$bin" ]] && bin=$(file "$core" | grep -oE "execfn: '[^']+'" | cut -d"'" -f2) + echo "Binary: ${bin:-not found}" + echo "" + echo "=== Backtrace ===" + if [[ -n "$bin" && -x "$bin" ]]; then + gdb -batch -ex "thread apply all bt full" "$bin" "$core" 2>&1 + else + echo "(trying gdb with core only)" + gdb -batch -ex "core-file $core" -ex "thread apply all bt" 2>&1 + fi + fi + } > "$report" 2>&1 + done + echo "=== crash-report ===" && ls -lh crash-report/ || true + shopt -s nullglob; reports=(crash-report/*.txt) + if [[ ${#reports[@]} -gt 0 ]]; then cat "${reports[@]}"; else echo "(no core dumps found)"; fi + - uses: actions/upload-artifact@v7 + if: failure() + with: + name: crash-report-${{ matrix.os }}-${{ matrix.simd_mode }}-${{ github.sha }} + path: crash-report/ + if-no-files-found: warn test-nightly: 
name: Nightly-specific tests @@ -75,9 +146,9 @@ jobs: matrix: include: - fuzz_target: cpp_roundtrip - - fuzz_target: rust_compress_oracle - - fuzz_target: rust_decompress_oracle - - fuzz_target: rust_decompress_arbitrary + - fuzz_target: encode_oracle + - fuzz_target: decode_oracle + - fuzz_target: decode_arbitrary steps: - uses: actions/checkout@v6 with: {persist-credentials: false, submodules: recursive} diff --git a/Cargo.toml b/Cargo.toml index fc171e5..ce87f1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,9 +28,7 @@ harness = false bench = false [features] -# Eventually we may want to build without the C++ bindings by default. -# Keeping it on for now to simplify development. -default = ["cpp", "rust"] +default = ["rust"] # Used internally for testing and benchmarking. Not intended for public use. _all_compatible = ["cpp_portable", "rust"] # Use portable C++ code that will not rely on the latest CPU features. This is the default for the C++ bindings. diff --git a/README.md b/README.md index 24ab790..11abee1 100644 --- a/README.md +++ b/README.md @@ -8,48 +8,140 @@ [![CI build status](https://github.com/fast-pack/FastPFOR-rs/actions/workflows/ci.yml/badge.svg)](https://github.com/fast-pack/FastPFOR-rs/actions) [![Codecov](https://img.shields.io/codecov/c/github/fast-pack/FastPFOR-rs)](https://app.codecov.io/gh/fast-pack/FastPFOR-rs) -This is a Rust wrapper for the [C++ FastPFor library](https://github.com/fast-pack/FastPFor), as well as a pure Rust re-implementation. Supports 32-bit and 64-bit integers, and SIMD-optimized codecs for 128-bit and 256-bit vectors. Based on the [Decoding billions of integers per second through vectorization, 2012](https://arxiv.org/abs/1209.2137) paper. +Fast integer compression for Rust — both a pure-Rust implementation and a wrapper around the [C++ FastPFor library](https://github.com/fast-pack/FastPFor). +Supports 32-bit (and for some codecs 64-bit) integers. 
+Based on the [Decoding billions of integers per second through vectorization, 2012](https://arxiv.org/abs/1209.2137) paper. The Rust **decoder** is about 29% faster than the C++ version. The Rust implementation contains no `unsafe` code, and when built without the `cpp` feature this crate has `#![forbid(unsafe_code)]`. -### Supported algorithms -Unless otherwise specified, all codecs support `&[u32]` only. - -```text -* BP32 -* Copy -* FastBinaryPacking8 -* FastPFor128 (both `&[u32]` and `&[u64]`) -* FastPFor256 (both `&[u32]` and `&[u64]`) -* FastBinaryPacking16 -* FastBinaryPacking32 -* MaskedVByte -* NewPFor -* OptPFor -* PFor2008 -* PFor -* SimdBinaryPacking -* SimdFastPFor128 -* SimdFastPFor256 -* SimdGroupSimple -* SimdGroupSimpleRingBuf -* SimdNewPFor -* SimdOptPFor -* SimdPFor -* SimdSimplePFor -* Simple16 -* Simple8b -* Simple8bRle -* Simple9 -* Simple9Rle -* SimplePFor -* StreamVByte -* VByte -* VarInt (both `&[u32]` and `&[u64]`) -* VarIntGb +## Usage + +### Rust Implementation (default) + +The simplest way is `FastPFor256` — a composite codec that handles any input +length by compressing aligned 256-element blocks with `FastPForBlock256` and encoding any +leftover values with `VariableByte`. 
+ +```rust +use fastpfor::{AnyLenCodec, FastPFor256}; + +let mut codec = FastPFor256::default(); +let input: Vec = (0..1000).collect(); + +let mut encoded = Vec::new(); +codec.encode(&input, &mut encoded).unwrap(); + +let mut decoded = Vec::new(); +codec.decode(&encoded, &mut decoded, None).unwrap(); + +assert_eq!(decoded, input); ``` +For block-aligned inputs you can use the lower-level `BlockCodec` API: + +```rust +use fastpfor::{BlockCodec, FastPForBlock256, slice_to_blocks}; + +let mut codec = FastPForBlock256::default(); +let input: Vec = (0..512).collect(); // exactly 2 blocks of 256 + +let (blocks, remainder) = slice_to_blocks::(&input); +assert_eq!(blocks.len(), 2); +assert!(remainder.is_empty()); + +let mut encoded = Vec::new(); +codec.encode_blocks(blocks, &mut encoded).unwrap(); + +let mut decoded = Vec::new(); +codec.decode_blocks(&encoded, Some(u32::try_from(blocks.len() * 256).expect("block count fits in u32")), &mut decoded).unwrap(); + +assert_eq!(decoded, input); +``` + +### C++ Wrapper (`cpp` feature) + +Enable the `cpp` feature in `Cargo.toml`: + +```toml +fastpfor = { version = "0.1", features = ["cpp"] } +``` + +All C++ codecs implement the same `AnyLenCodec` trait (`encode` / `decode`), so +the usage pattern is identical to the Rust examples above — just swap the codec type, +e.g. `cpp::CppFastPFor128::new()`. + +**Thread safety:** C++ codec instances have internal state and are **not thread-safe**. +Create one instance per thread or synchronize access externally. 
+ +## Crate Features + +| Feature | Default | Description | +|---------|---------|-------------| +| `rust` | **yes** | Pure-Rust implementation — no `unsafe`, no build dependencies | +| `cpp` | no | C++ wrapper via CXX — requires a C++14 compiler with SIMD support | +| `cpp_portable` | no | Enables `cpp`, compiles C++ with SSE4.2 baseline (runs on any x86-64 from ~2008+) | +| `cpp_native` | no | Enables `cpp`, compiles C++ with `-march=native` for maximum throughput on the build machine | + +The `FASTPFOR_SIMD_MODE` environment variable (`portable` or `native`) can override the SIMD mode at build time. + +**Recommendation:** Use `cpp_portable` (not `cpp_native`) for distributable binaries. + +## Supported Algorithms + +### Rust (`rust` feature) + +Rust block codecs require block-aligned input. `CompositeCodec` chains a block codec with a tail codec (e.g. `VariableByte`) to handle arbitrary-length input. `FastPFor256` and `FastPFor128` are type aliases for such composites. + +| Codec | Description | +|----------------------------|-------------------------------------------------------------------------------------------| +| `FastPFor256` | `CompositeCodec` of `FastPForBlock256` + `VariableByte`. **Recommended for general use.** | +| `FastPFor128` | `CompositeCodec` of `FastPForBlock128` + `VariableByte` | +| `VariableByte` | Variable-byte encoding only; good for small integers | +| `JustCopy` | No compression; useful as a baseline | +| `FastPForBlock256` (block) | `FastPFor` with 256-element blocks; block-aligned input only | +| `FastPForBlock128` (block) | `FastPFor` with 128-element blocks; block-aligned input only | + +### C++ (`cpp` feature) + +All C++ codecs are composite (any-length) and implement `AnyLenCodec` only. +`u64`-capable codecs (`CppFastPFor128`, `CppFastPFor256`, `CppVarInt`) also implement `BlockCodec64` with `encode64` / `decode64`. 
+ +| Codec | Notes | +|-----------------------------|------------------------------------------------------------------------| +| `CppFastPFor128` | `FastPFor + VByte` composite, 128-element blocks. Also supports `u64`. | +| `CppFastPFor256` | `FastPFor + VByte` composite, 256-element blocks. Also supports `u64`. | +| `CppSimdFastPFor128` | SIMD-optimized 128-element variant | +| `CppSimdFastPFor256` | SIMD-optimized 256-element variant | +| `CppBP32` | Binary packing, 32-bit blocks | +| `CppFastBinaryPacking8` | Binary packing, 8-bit groups | +| `CppFastBinaryPacking16` | Binary packing, 16-bit groups | +| `CppFastBinaryPacking32` | Binary packing, 32-bit groups | +| `CppSimdBinaryPacking` | SIMD-optimized binary packing | +| `CppPFor` | Patched frame-of-reference | +| `CppSimplePFor` | Simplified `PFor` variant | +| `CppNewPFor` | `PFor` with improved exception handling | +| `CppOptPFor` | Optimized `PFor` | +| `CppPFor2008` | Reference implementation from original paper | +| `CppSimdPFor` | SIMD `PFor` | +| `CppSimdSimplePFor` | SIMD `SimplePFor` | +| `CppSimdNewPFor` | SIMD `NewPFor` | +| `CppSimdOptPFor` | SIMD `OptPFor` | +| `CppSimple16` | 16 packing modes in 32-bit words | +| `CppSimple9` | 9 packing modes | +| `CppSimple9Rle` | Simple9 with run-length encoding | +| `CppSimple8b` | 8 packing modes in 64-bit words | +| `CppSimple8bRle` | Simple8b with run-length encoding | +| `CppSimdGroupSimple` | SIMD group-simple encoding | +| `CppSimdGroupSimpleRingBuf` | SIMD group-simple with ring buffer | +| `CppVByte` | Standard variable-byte encoding | +| `CppMaskedVByte` | SIMD masked variable-byte | +| `CppStreamVByte` | SIMD stream variable-byte | +| `CppVarInt` | Standard varint. Also supports `u64`. | +| `CppVarIntGb` | Group varint | +| `CppCopy` | No compression (baseline) | + ## Benchmarks + ### Decoding Using Linux x86-64 running `just bench::cpp-vs-rust-decode native`. The values below are time measurements; smaller values indicate faster decoding. 
@@ -67,92 +159,49 @@ Using Linux x86-64 running `just bench::cpp-vs-rust-decode native`. The values b | `uniform_small_value_distribution/1024` | 606.4 | 405.44 | 33.14% | | `uniform_small_value_distribution/4096` | 2017.3 | 1403.7 | 30.42% | -Rust Encoding has not yet been either optimized or even fully verified. - -## Usage - -### Crate Features -* `cpp` - C++ implementation (uses portable SIMD mode) -* `rust` - Rust implementation (safe Rust code, no `unsafe` blocks) - -#### SIMD Mode Configuration - -The C++ backend can be compiled with different SIMD instruction sets. Control this by enabling one of these features: -| Mode | Description | -|------|-------------| -| `cpp_portable` | **Default.** Uses SSE4.2 baseline only. Binaries run on any x86-64 CPU from ~2008+. Best for distributable libraries. | -| `cpp_native` | Uses `-march=native` to enable all SIMD instructions supported by the build machine (AVX, AVX2, etc.). Maximum performance but may crash on CPUs lacking those instructions. | - -Feature selection can be overridden with the `FASTPFOR_SIMD_MODE` environment variable set to "portable" or "native". - -**Recommendation:** Use `portable` (default) for libraries and distributed binaries. Use `native` only when building for a specific machine where you need maximum performance. - -### Using C++ Wrapper - -```rust -use fastpfor::AnyLenCodec as _; -use fastpfor::cpp::CppSimdFastPFor128; - -fn main() { - let mut codec = CppSimdFastPFor128::new(); - - let input = vec![1u32, 2, 3, 4, 5]; - let mut compressed = Vec::new(); - codec.encode(&input, &mut compressed).unwrap(); - - let mut decoded = Vec::new(); - codec - .decode(&compressed, &mut decoded, None) - .unwrap(); - - assert_eq!(input, decoded); -} -``` +Rust encoding has not yet been fully optimized or verified. ## Build Requirements -- When using the **Rust implementation**: - no additional dependencies are required. 
-- When using the **C++ implementation**: - you need to have a C++ compiler that supports C++14 and SIMD intrinsics. +- **Rust feature** (`rust`, the default): no additional dependencies. +- **C++ feature** (`cpp`): requires a C++14-capable compiler with SIMD intrinsics. See [FastPFor C++ requirements](https://github.com/fast-pack/FastPFor?tab=readme-ov-file#software-requirements). ### Linux -The default GitHub action runner for Linux has all the needed dependencies. +The default GitHub Actions runner has all needed dependencies. -For local development, you may need to install the following packages: +For local development: ```bash # This list may be incomplete sudo apt-get install build-essential ``` -`libsimde-dev` is optional. On ARM/aarch64, the C++ build fetches `SIMDe` via `CMake`, -and the Rust CXX bridge now reuses that fetched include path automatically. -Install `libsimde-dev` only if you prefer a system package fallback. +`libsimde-dev` is optional. On ARM/aarch64, the C++ build fetches `SIMDe` via `CMake` +and the CXX bridge reuses that include path automatically. ### macOS -On Apple Silicon, manual `SIMDe` installation is usually not required. -The C++ build fetches `SIMDe` via `CMake`, and the Rust CXX bridge reuses that path. -If you prefer a system package fallback, install `SIMDe` with Homebrew and set include flags. +On Apple Silicon, `SIMDe` installation is usually not required — the C++ build fetches it via `CMake`. + +If you prefer a Homebrew fallback: ```bash -# optional: install SIMDe via Homebrew brew install simde - -# optional fallback: ensure the compiler can find Homebrew headers export CXXFLAGS="-I/opt/homebrew/include" export CFLAGS="-I/opt/homebrew/include" ``` ## Development -* This project is easier to develop with [just](https://github.com/casey/just#readme), a modern alternative to `make`. - Install it with `cargo install just`. -* To get a list of available commands, run `just`. -* To run tests, use `just test`. 
+This project uses [just](https://github.com/casey/just#readme) as a task runner: + +```bash +cargo install just # install once +just # list available commands +just test # run all tests +``` ## License @@ -160,7 +209,8 @@ Licensed under either of * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or ) * MIT license ([LICENSE-MIT](LICENSE-MIT) or ) - at your option. + +at your option. ### Contribution diff --git a/benches/bench_utils.rs b/benches/bench_utils.rs index 089eeec..a60a251 100644 --- a/benches/bench_utils.rs +++ b/benches/bench_utils.rs @@ -10,14 +10,9 @@ #![allow(missing_docs)] use core::ops::Range; -pub use std::io::Cursor; -use std::num::NonZeroU32; -#[cfg(feature = "cpp")] -use fastpfor::AnyLenCodec as _; -#[cfg(feature = "cpp")] -use fastpfor::cpp::CppFastPFor128; -pub use fastpfor::rust::{BLOCK_SIZE_128, BLOCK_SIZE_256, DEFAULT_PAGE_SIZE, FastPFOR, Integer}; +#[allow(unused_imports)] +use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng}; @@ -115,79 +110,57 @@ const ALL_PATTERNS: &[(&str, DataGeneratorFn)] = &[ ]; // --------------------------------------------------------------------------- -// Codec helpers +// Generic codec helpers // --------------------------------------------------------------------------- -/// Compress `data` and return the compressed words. -pub fn compress_data(codec: &mut FastPFOR, data: &[u32]) -> Vec { - let mut compressed = vec![0u32; data.len() * 2 + 1024]; - let mut input_offset = Cursor::new(0); - let mut output_offset = Cursor::new(0); - codec - .compress( - data, - data.len() as u32, - &mut input_offset, - &mut compressed, - &mut output_offset, - ) - .unwrap(); - let len = output_offset.position() as usize; - compressed.truncate(len); - compressed -} - -/// Decompress `compressed` into the caller-provided `decompressed` buffer and -/// return the number of elements written. 
+/// Compress `data` with codec `C`, appending to `out` (which is cleared first). /// -/// The buffer must be allocated outside the timed loop so that allocation cost -/// is not measured. -pub fn decompress_data( - codec: &mut FastPFOR, +/// Only the block-aligned prefix of `data` is compressed; any sub-block +/// remainder is silently dropped, matching what the benchmarks measure. +pub fn compress(data: &[u32], out: &mut Vec) { + let mut codec = C::default(); + let (blocks, _remainder) = slice_to_blocks::(data); + out.clear(); + codec.encode_blocks(blocks, out).unwrap(); +} + +/// Decompress `n_blocks` blocks of codec `C` from `compressed` into `out` +/// (cleared first), returning the number of elements written. +#[allow(dead_code)] // used by smoke tests; benches use codec directly +pub fn decompress( compressed: &[u32], - decompressed: &mut [u32], + n_blocks: usize, + out: &mut Vec, ) -> usize { - let mut input_offset = Cursor::new(0); - let mut output_offset = Cursor::new(0); + let mut codec = C::default(); + out.clear(); + let expected_values = n_blocks * C::size(); codec - .uncompress( + .decode_blocks( compressed, - compressed.len() as u32, - &mut input_offset, - decompressed, - &mut output_offset, + Some(u32::try_from(expected_values).expect("expected_values fits in u32")), + out, ) .unwrap(); - output_offset.position() as usize -} - -/// Pre-compress `data` with a specific `block_size` and return the compressed buffer. 
-fn prepare_compressed_data(data: &[u32], block_size: NonZeroU32) -> Vec { - compress_data(&mut FastPFOR::new(DEFAULT_PAGE_SIZE, block_size), data) -} - -// --------------------------------------------------------------------------- -// C++ helpers (compiled only when the `cpp` feature is active) -// --------------------------------------------------------------------------- - -#[cfg(feature = "cpp")] -pub fn cpp_encode(codec: &mut CppFastPFor128, data: &[u32]) -> Vec { - let mut out = Vec::new(); - codec.encode(data, &mut out).unwrap(); - out + out.len() } -#[cfg(feature = "cpp")] -pub fn cpp_decode( - codec: &mut CppFastPFor128, +/// Decompress with any-length codec `C`, using `expected_len` for validation/pre-allocation. +#[allow(dead_code)] // used by smoke_cpp_vs_rust +pub fn decompress_anylen( compressed: &[u32], - decompressed: &mut [u32], + expected_len: usize, + out: &mut Vec, ) -> usize { - let mut out = Vec::new(); + let mut codec = C::default(); + out.clear(); codec - .decode(compressed, &mut out, Some(decompressed.len() as u32)) + .decode( + compressed, + out, + Some(u32::try_from(expected_len).expect("expected_len fits in u32")), + ) .unwrap(); - decompressed.copy_from_slice(&out); out.len() } @@ -196,106 +169,78 @@ pub fn cpp_decode( // --------------------------------------------------------------------------- /// One row of pre-computed data for compression / decompression benchmarks. -pub struct CompressFixture { +/// +/// Parameterised by `C: BlockCodec` so the same struct works for both 128- +/// and 256-element block codecs. +pub struct CompressFixture { pub name: &'static str, + /// Block-aligned uncompressed data (exactly `n_blocks * C::elements_per_block()` elements). pub data: Vec, - /// Rust-compressed form (`BLOCK_SIZE_128`), ready for decompression benchmarks. - pub rust_compressed: Vec, + /// Pre-compressed form, ready for decompression benchmarks. + pub compressed: Vec, + /// Number of blocks in `data`. 
+ pub n_blocks: usize, + _codec: std::marker::PhantomData, } -impl CompressFixture { - fn new(name: &'static str, generator: DataGeneratorFn, size: usize) -> Self { - let data = generator(size); - let rust_compressed = prepare_compressed_data(&data, BLOCK_SIZE_128); +impl CompressFixture { + fn new(name: &'static str, generator: DataGeneratorFn, block_count: usize) -> Self { + let data = generator(block_count * C::size()); + // Data is already exactly block_count * blen elements; no trimming needed. + let mut compressed = Vec::new(); + compress::(&data, &mut compressed); Self { name, data, - rust_compressed, + compressed, + n_blocks: block_count, + _codec: std::marker::PhantomData, } } } -/// Build fixtures for every `COMPRESS_PATTERNS × sizes` combination. -pub fn compress_fixtures(sizes: &[usize]) -> Vec<(usize, CompressFixture)> { - sizes +/// Build fixtures for every `COMPRESS_PATTERNS × block_counts` combination. +pub fn compress_fixtures( + block_counts: &[usize], +) -> Vec<(usize, CompressFixture)> { + block_counts .iter() - .flat_map(|&size| { + .flat_map(|&bc| { COMPRESS_PATTERNS .iter() - .map(move |&(name, generator)| (size, CompressFixture::new(name, generator, size))) + .map(move |&(name, generator)| (bc, CompressFixture::::new(name, generator, bc))) }) .collect() } -/// Build fixtures for every `ALL_PATTERNS` at a single size. -pub fn ratio_fixtures(size: usize) -> Vec { +/// Build fixtures for every `ALL_PATTERNS` at a single block count. +pub fn ratio_fixtures(block_count: usize) -> Vec> { ALL_PATTERNS .iter() - .map(|&(name, generator)| CompressFixture::new(name, generator, size)) + .map(|&(name, generator)| CompressFixture::::new(name, generator, block_count)) .collect() } -/// One row for the block-size benchmark. -pub struct BlockSizeFixture { - pub block_size: NonZeroU32, +/// One row for the block-size comparison benchmark. +/// +/// Parameterised by `C: BlockCodec` — create one per codec to compare. 
+pub struct BlockSizeFixture { pub data: Vec, pub compressed: Vec, + pub n_blocks: usize, + _codec: std::marker::PhantomData, } -impl BlockSizeFixture { - fn new(block_size: NonZeroU32, size: usize) -> Self { - let data = generate_uniform_data_small_value_distribution(size); - let compressed = prepare_compressed_data(&data, block_size); +impl BlockSizeFixture { + pub fn new(block_count: usize) -> Self { + let data = generate_uniform_data_small_value_distribution(block_count * C::size()); + let mut compressed = Vec::new(); + compress::(&data, &mut compressed); Self { - block_size, data, compressed, + n_blocks: block_count, + _codec: std::marker::PhantomData, } } } - -/// Build fixtures for both block sizes at a given `size`. -pub fn block_size_fixtures(size: usize) -> Vec { - [BLOCK_SIZE_128, BLOCK_SIZE_256] - .iter() - .map(|&bs| BlockSizeFixture::new(bs, size)) - .collect() -} - -/// One row for the C++ vs Rust decode benchmark. -#[cfg(feature = "cpp")] -pub struct CppDecodeFixture { - pub name: &'static str, - pub cpp_compressed: Vec, - pub rust_compressed: Vec, - pub original_len: usize, -} - -#[cfg(feature = "cpp")] -impl CppDecodeFixture { - fn new(name: &'static str, generator: DataGeneratorFn, size: usize) -> Self { - let data = generator(size); - let mut codec = CppFastPFor128::new(); - let cpp_compressed = cpp_encode(&mut codec, &data); - let rust_compressed = prepare_compressed_data(&data, BLOCK_SIZE_128); - Self { - name, - cpp_compressed, - rust_compressed, - original_len: size, - } - } -} - -/// Build C++ vs Rust decode fixtures for every `COMPRESS_PATTERNS × sizes` combination. 
-#[cfg(feature = "cpp")] -pub fn cpp_decode_fixtures(sizes: &[usize]) -> Vec<(usize, CppDecodeFixture)> { - sizes - .iter() - .flat_map(|&size| { - COMPRESS_PATTERNS - .iter() - .map(move |&(name, generator)| (size, CppDecodeFixture::new(name, generator, size))) - }) - .collect() -} diff --git a/benches/fastpfor_benchmark.rs b/benches/fastpfor_benchmark.rs index 212e3a5..2e8f341 100644 --- a/benches/fastpfor_benchmark.rs +++ b/benches/fastpfor_benchmark.rs @@ -3,28 +3,37 @@ use std::hint::black_box; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +#[cfg(feature = "cpp")] +use fastpfor::AnyLenCodec; +use fastpfor::{BlockCodec as _, FastPForBlock128, FastPForBlock256, slice_to_blocks}; #[path = "bench_utils.rs"] mod bench_utils; use bench_utils::{ - BLOCK_SIZE_128, Cursor, DEFAULT_PAGE_SIZE, FastPFOR, Integer, block_size_fixtures, - compress_data, compress_fixtures, decompress_data, - generate_uniform_data_small_value_distribution, ratio_fixtures, + BlockSizeFixture, compress_fixtures, generate_uniform_data_small_value_distribution, + ratio_fixtures, }; #[cfg(feature = "cpp")] -use bench_utils::{cpp_decode, cpp_decode_fixtures, cpp_encode}; -#[cfg(feature = "cpp")] use fastpfor::cpp::CppFastPFor128; -const SIZES: &[usize] = &[1024, 4096]; +/// Number of blocks per benchmark run. The element count per run is +/// `BLOCK_COUNTS[i] * C::elements_per_block()`, e.g. 8 × 128 = 1,024 or 32 × 128 = 4,096. 
+const BLOCK_COUNTS: &[usize] = &[8, 32]; fn benchmark_compression(c: &mut Criterion) { let mut group = c.benchmark_group("compression"); - for (size, fix) in compress_fixtures(SIZES) { - group.throughput(Throughput::Elements(size as u64)); - group.bench_with_input(BenchmarkId::new(fix.name, size), &fix.data, |b, data| { - let mut codec = FastPFOR::default(); - b.iter(|| black_box(compress_data(&mut codec, black_box(data)))); + for (bc, fix) in compress_fixtures::(BLOCK_COUNTS) { + let n_elem = fix.data.len(); + group.throughput(Throughput::Elements(n_elem as u64)); + group.bench_with_input(BenchmarkId::new(fix.name, bc), &fix.data, |b, data| { + let mut codec = FastPForBlock128::default(); + let (blocks, _) = slice_to_blocks::(data); + let mut out = Vec::new(); + b.iter(|| { + out.clear(); + codec.encode_blocks(black_box(blocks), &mut out).unwrap(); + black_box(out.len()) + }); }); } group.finish(); @@ -32,65 +41,61 @@ fn benchmark_compression(c: &mut Criterion) { fn benchmark_decompression(c: &mut Criterion) { let mut group = c.benchmark_group("decompression"); - for (size, fix) in compress_fixtures(SIZES) { - group.throughput(Throughput::Elements(size as u64)); - group.bench_with_input( - BenchmarkId::new(fix.name, size), - &fix.rust_compressed, - |b, compressed| { - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let mut decompressed = vec![0u32; size]; - b.iter(|| { - black_box(decompress_data( - &mut codec, - black_box(compressed), - &mut decompressed, - )) - }); - }, - ); + for (bc, fix) in compress_fixtures::(BLOCK_COUNTS) { + let n_elem = fix.data.len(); + group.throughput(Throughput::Elements(n_elem as u64)); + group.bench_with_input(BenchmarkId::new(fix.name, bc), &fix, |b, fix| { + let mut codec = FastPForBlock128::default(); + let mut out = Vec::new(); + b.iter(|| { + out.clear(); + codec + .decode_blocks( + black_box(&fix.compressed), + Some( + u32::try_from(fix.n_blocks * FastPForBlock128::size()) + .expect("expected_values fits 
in u32"), + ), + &mut out, + ) + .unwrap(); + black_box(out.len()) + }); + }); } group.finish(); } fn benchmark_roundtrip(c: &mut Criterion) { let mut group = c.benchmark_group("roundtrip"); - for &size in SIZES { - let data = generate_uniform_data_small_value_distribution(size); - group.throughput(Throughput::Elements(size as u64)); + for &bc in BLOCK_COUNTS { + let data = generate_uniform_data_small_value_distribution(bc * FastPForBlock128::size()); + group.throughput(Throughput::Elements(data.len() as u64)); group.bench_with_input( - BenchmarkId::new("compress_decompress", size), + BenchmarkId::new("compress_decompress", bc), &data, |b, data| { - let mut encoder = FastPFOR::default(); - let mut decoder = FastPFOR::default(); - let mut compressed = vec![0u32; data.len() * 2 + 1024]; - let mut decompressed = vec![0u32; data.len()]; + let mut codec = FastPForBlock128::default(); + let (blocks, _) = slice_to_blocks::(data); + let mut compressed = Vec::new(); + let mut decompressed = Vec::new(); b.iter(|| { - let mut input_offset = Cursor::new(0); - let mut output_offset = Cursor::new(0); - encoder - .compress( - black_box(data), - data.len() as u32, - &mut input_offset, - &mut compressed, - &mut output_offset, - ) + compressed.clear(); + codec + .encode_blocks(black_box(blocks), &mut compressed) .unwrap(); - input_offset.set_position(0); - let compressed_len = output_offset.position(); - output_offset.set_position(0); - decoder - .uncompress( + decompressed.clear(); + codec + .decode_blocks( &compressed, - data.len() as u32, - &mut input_offset, + Some( + u32::try_from(bc * FastPForBlock128::size()) + .expect("expected_values fits in u32"), + ), &mut decompressed, - &mut output_offset, ) .unwrap(); - black_box((compressed_len, output_offset.position())) + black_box(decompressed.len()) }); }, ); @@ -100,23 +105,74 @@ fn benchmark_roundtrip(c: &mut Criterion) { fn benchmark_block_sizes(c: &mut Criterion) { let mut group = c.benchmark_group("block_sizes"); - let size = 
*SIZES.last().unwrap(); - for fix in block_size_fixtures(size) { - group.throughput(Throughput::Elements(size as u64)); - group.bench_function(format!("compress_{}", fix.block_size), |b| { - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, fix.block_size); - b.iter(|| black_box(compress_data(&mut codec, black_box(&fix.data)))); + let bc = *BLOCK_COUNTS.last().unwrap(); + + let fix128 = BlockSizeFixture::::new(bc); + let fix256 = BlockSizeFixture::::new(bc); + + for (label, data, compressed, n_blocks, is_256) in [ + ( + "128", + &fix128.data, + &fix128.compressed, + fix128.n_blocks, + false, + ), + ( + "256", + &fix256.data, + &fix256.compressed, + fix256.n_blocks, + true, + ), + ] { + group.throughput(Throughput::Elements(data.len() as u64)); + group.bench_function(format!("compress_{label}"), |b| { + if is_256 { + let mut codec = FastPForBlock256::default(); + let (blocks, _) = slice_to_blocks::(data); + let mut out = Vec::new(); + b.iter(|| { + out.clear(); + codec.encode_blocks(black_box(blocks), &mut out).unwrap(); + black_box(out.len()) + }); + } else { + let mut codec = FastPForBlock128::default(); + let (blocks, _) = slice_to_blocks::(data); + let mut out = Vec::new(); + b.iter(|| { + out.clear(); + codec.encode_blocks(black_box(blocks), &mut out).unwrap(); + black_box(out.len()) + }); + } }); - group.bench_function(format!("decompress_{}", fix.block_size), |b| { - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, fix.block_size); - let mut decompressed = vec![0u32; size]; - b.iter(|| { - black_box(decompress_data( - &mut codec, - black_box(&fix.compressed), - &mut decompressed, - )) - }); + group.bench_function(format!("decompress_{label}"), |b| { + if is_256 { + let mut codec = FastPForBlock256::default(); + let mut out = Vec::new(); + let expected = u32::try_from(n_blocks * FastPForBlock256::size()) + .expect("expected_values fits in u32"); + b.iter(|| { + out.clear(); + codec + .decode_blocks(black_box(compressed), Some(expected), &mut out) + .unwrap(); + 
black_box(out.len()) + }); + } else { + let mut codec = FastPForBlock128::default(); + let mut out = Vec::new(); + let expected = (n_blocks * FastPForBlock128::size()) as u32; + b.iter(|| { + out.clear(); + codec + .decode_blocks(black_box(compressed), Some(expected), &mut out) + .unwrap(); + black_box(out.len()) + }); + } }); } group.finish(); @@ -125,73 +181,103 @@ fn benchmark_block_sizes(c: &mut Criterion) { fn benchmark_compression_ratio(c: &mut Criterion) { let mut group = c.benchmark_group("compression_ratio"); group.sample_size(20); - let size = *SIZES.last().unwrap(); - for fix in ratio_fixtures(size) { + let bc = *BLOCK_COUNTS.last().unwrap(); + for fix in ratio_fixtures::(bc) { group.bench_function(fix.name, |b| { - let mut codec = FastPFOR::default(); + let mut codec = FastPForBlock128::default(); + let (blocks, _) = slice_to_blocks::(&fix.data); + let mut out = Vec::new(); b.iter(|| { - let compressed = compress_data(&mut codec, black_box(&fix.data)); + out.clear(); + codec.encode_blocks(black_box(blocks), &mut out).unwrap(); #[expect( clippy::cast_precision_loss, reason = "Loss of precision is acceptable for compression ratio calculation" )] - black_box(fix.data.len() as f64 / compressed.len() as f64) + black_box(fix.data.len() as f64 / out.len() as f64) }); }); } group.finish(); } -/// Compare encoding and decoding speed of the C++ `FastPFor128` codec against -/// the pure-Rust `FastPFOR` codec with `BLOCK_SIZE_128`. +/// Compare encoding and decoding speed of the C++ `CppFastPFor128` (`AnyLenCodec`) against +/// the pure-Rust `FastPForBlock128` (`BlockCodec`). Same wire format for block-aligned data. 
#[cfg(feature = "cpp")] fn benchmark_cpp_vs_rust(c: &mut Criterion) { let mut group = c.benchmark_group("cpp_vs_rust/encode"); - for (size, fix) in compress_fixtures(SIZES) { - group.throughput(Throughput::Elements(size as u64)); + for (bc, fix) in compress_fixtures::(BLOCK_COUNTS) { + let n_elem = fix.data.len(); + group.throughput(Throughput::Elements(n_elem as u64)); group.bench_with_input( - BenchmarkId::new(format!("cpp/{}", fix.name), size), + BenchmarkId::new(format!("cpp/{}", fix.name), bc), &fix.data, |b, data| { - let mut codec = CppFastPFor128::new(); - b.iter(|| black_box(cpp_encode(&mut codec, black_box(data)))); + let mut codec = CppFastPFor128::default(); + let mut out = Vec::new(); + b.iter(|| { + out.clear(); + codec.encode(black_box(data), &mut out).unwrap(); + black_box(out.len()) + }); }, ); group.bench_with_input( - BenchmarkId::new(format!("rust/{}", fix.name), size), + BenchmarkId::new(format!("rust/{}", fix.name), bc), &fix.data, |b, data| { - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - b.iter(|| black_box(compress_data(&mut codec, black_box(data)))); + let mut codec = FastPForBlock128::default(); + let (blocks, _) = slice_to_blocks::(data); + let mut out = Vec::new(); + b.iter(|| { + out.clear(); + codec.encode_blocks(black_box(blocks), &mut out).unwrap(); + black_box(out.len()) + }); }, ); } group.finish(); let mut group = c.benchmark_group("cpp_vs_rust/decode"); - for (size, fix) in cpp_decode_fixtures(SIZES) { - group.throughput(Throughput::Elements(size as u64)); + for (bc, fix) in compress_fixtures::(BLOCK_COUNTS) { + let n_elem = fix.n_blocks * FastPForBlock128::size(); + let expected_len = u32::try_from(n_elem).expect("n_elem fits in u32"); + group.throughput(Throughput::Elements(n_elem as u64)); group.bench_with_input( - BenchmarkId::new(format!("cpp/{}", fix.name), size), - &fix.cpp_compressed, + BenchmarkId::new(format!("cpp/{}", fix.name), bc), + &fix.compressed, |b, compressed| { - let mut codec = 
CppFastPFor128::new(); - let mut out = vec![0u32; fix.original_len]; - b.iter(|| black_box(cpp_decode(&mut codec, black_box(compressed), &mut out))); + let mut codec = CppFastPFor128::default(); + let mut out = Vec::new(); + b.iter(|| { + out.clear(); + codec + .decode(black_box(compressed), &mut out, Some(expected_len)) + .unwrap(); + black_box(out.len()) + }); }, ); group.bench_with_input( - BenchmarkId::new(format!("rust/{}", fix.name), size), - &fix.rust_compressed, + BenchmarkId::new(format!("rust/{}", fix.name), bc), + &fix.compressed, |b, compressed| { - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let mut decompressed = vec![0u32; fix.original_len]; + let mut codec = FastPForBlock128::default(); + let mut out = Vec::new(); b.iter(|| { - black_box(decompress_data( - &mut codec, - black_box(compressed), - &mut decompressed, - )) + out.clear(); + codec + .decode_blocks( + black_box(compressed), + Some( + u32::try_from(fix.n_blocks * FastPForBlock128::size()) + .expect("expected_values fits in u32"), + ), + &mut out, + ) + .unwrap(); + black_box(out.len()) }); }, ); diff --git a/cpp b/cpp index 2be1f97..aa1a6c3 160000 --- a/cpp +++ b/cpp @@ -1 +1 @@ -Subproject commit 2be1f976935b8ff9296b029f574d7f964be9d35d +Subproject commit aa1a6c36efbdee8ef4c3ff71ea455bba0117ac41 diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 9b0082f..cec0a38 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -16,6 +16,16 @@ fastpfor = { path = "..", features = ["cpp", "rust"] } [workspace] members = ["."] +[lints.rust] +unused_qualifications = "warn" + +[lints.clippy] +cargo = { level = "warn", priority = -1 } +pedantic = { level = "warn", priority = -1 } +# Allow certain lints that are common in fuzz targets and not worth fixing. 
+doc_markdown = "allow" +wildcard_imports = "allow" + [[bin]] name = "cpp_roundtrip" path = "fuzz_targets/cpp_roundtrip.rs" @@ -23,19 +33,31 @@ test = false doc = false [[bin]] -name = "rust_compress_oracle" -path = "fuzz_targets/rust_compress_oracle.rs" +name = "encode_oracle" +path = "fuzz_targets/encode_oracle.rs" +test = false +doc = false + +[[bin]] +name = "decode_oracle" +path = "fuzz_targets/decode_oracle.rs" +test = false +doc = false + +[[bin]] +name = "decode_arbitrary" +path = "fuzz_targets/decode_arbitrary.rs" test = false doc = false [[bin]] -name = "rust_decompress_oracle" -path = "fuzz_targets/rust_decompress_oracle.rs" +name = "encode_compare" +path = "fuzz_targets/encode_compare.rs" test = false doc = false [[bin]] -name = "rust_decompress_arbitrary" -path = "fuzz_targets/rust_decompress_arbitrary.rs" +name = "compare_fastpfor_128" +path = "fuzz_targets/compare_fastpfor_128.rs" test = false doc = false diff --git a/fuzz/README.md b/fuzz/README.md index 4233efd..64f2e7f 100644 --- a/fuzz/README.md +++ b/fuzz/README.md @@ -32,6 +32,25 @@ cargo +nightly fuzz run rust_compress_oracle cargo +nightly fuzz run rust_decompress_oracle # or cargo +nightly fuzz run cpp_roundtrip +# or encode_compare (Rust vs C++ bit-identical output) +cargo +nightly fuzz run encode_compare +``` + +### encode_compare: Bit-identical encode test + +The `encode_compare` target encodes the same input with Rust and C++ implementations and asserts the compressed output is bit-identical. Use it to find discrepancies between implementations. + +Codec pairs: FastPFor128/CppFastPFor128, FastPFor256/CppFastPFor256, VariableByte/CppVarInt, JustCopy/CppCopy. + +**Environment variables:** + +| Variable | Description | +|-----------------------------------------------------|-------------------------------------------| +| `FUZZ_PAIR=NAME` or `FUZZ_ENCODE_COMPARE_PAIR=NAME` | Restrict to one pair (e.g. 
`FastPFor128`) | + +Example: +```bash +FUZZ_PAIR=FastPFor128 cargo +nightly fuzz run encode_compare ``` Run for a specific duration (e.g., 60 seconds): diff --git a/fuzz/clippy.toml b/fuzz/clippy.toml new file mode 120000 index 0000000..85f6167 --- /dev/null +++ b/fuzz/clippy.toml @@ -0,0 +1 @@ +../clippy.toml \ No newline at end of file diff --git a/fuzz/fuzz_targets/common.rs b/fuzz/fuzz_targets/common.rs index c50f080..9d76b9d 100644 --- a/fuzz/fuzz_targets/common.rs +++ b/fuzz/fuzz_targets/common.rs @@ -1,141 +1,167 @@ +// The fuzz crate always enables both "rust" and "cpp" features of fastpfor. +// Items here may only be used by some binaries; suppress dead_code lint. +#![allow(dead_code)] + use fastpfor::cpp::*; -use fastpfor::{AnyLenCodec, rust}; +use fastpfor::{AnyLenCodec, FastPFor128, FastPFor256, JustCopy, VariableByte}; +// ── Debug helper ───────────────────────────────────────────────────────────── -pub type BoxedCppCodec = Box; +pub struct HexSlice<'a>(pub &'a [u32]); -#[derive(arbitrary::Arbitrary)] +impl std::fmt::Debug for HexSlice<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + const MAX: usize = 20; + let total = self.0.len(); + let shown = total.min(MAX); + let mut list = f.debug_list(); + for v in &self.0[..shown] { + list.entry(&format_args!("{v:#010x}")); + } + if total > MAX { + list.entry(&format_args!(".. out of {total} total")); + } + list.finish() + } +} + +/// A fuzz input pairing arbitrary data with a codec selector. 
+#[derive(arbitrary::Arbitrary, Debug)] pub struct FuzzInput { pub data: Vec, pub codec: C, } -impl std::fmt::Debug for FuzzInput { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("FuzzInput") - .field("codec", &self.codec) - .field("data", &HexSlice(&self.data)) - .finish() - } -} +pub type AnyLen = Box; -#[derive(arbitrary::Arbitrary, Clone, Copy, PartialEq, Eq, Debug)] -pub enum RustCodec { - FastPFOR256, - FastPFOR128, - VariableByte, - JustCopy, -} +// ── List entry type ─────────────────────────────────────────────────────────── -impl From for rust::Codec { - fn from(codec: RustCodec) -> Self { - use rust::*; - match codec { - RustCodec::FastPFOR256 => Codec::from(FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_256)), - RustCodec::FastPFOR128 => Codec::from(FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128)), - RustCodec::VariableByte => Codec::from(VariableByte::new()), - RustCodec::JustCopy => Codec::from(JustCopy::new()), - } - } +pub type CodecEntry = (&'static str, fn() -> AnyLen); + +// ── Two codec lists ────────────────────────────────────────────────────────── + +/// Generates `(name, || Box::new(T::default()))` entries from a list of types. +macro_rules! codec_list { + ($($t:ty),* $(,)?) 
=> { + &[ + $( (stringify!($t), || Box::new(<$t>::default())) ),* + ] + }; } -#[derive(Clone, Copy, Eq, PartialEq, arbitrary::Arbitrary, Debug)] -pub enum CppCodec { - BP32, - Copy, - FastBinaryPacking8, - FastPFor128, - FastPFor256, - FastBinaryPacking16, - FastBinaryPacking32, - MaskedVByte, - NewPFor, - OptPFor, - PFor2008, - PFor, - SimdBinaryPacking, - SimdFastPFor128, - SimdFastPFor256, - SimdGroupSimple, - SimdGroupSimpleRingBuf, - SimdNewPFor, - SimdOptPFor, - SimdPFor, - SimdSimplePFor, - // Simple16, // cannot encode arbitrary bytes - // Simple8b, // cannot encode arbitrary bytes - // Simple8bRle, // cannot encode arbitrary bytes - // Simple9, // cannot encode arbitrary bytes - // Simple9Rle, // cannot encode arbitrary bytes - // SimplePFor, // cannot encode arbitrary bytes - // Snappy, // Conditional with #ifdef - StreamVByte, - VByte, - VarInt, - // VarIntG8iu, // Conditional with #ifdef - VarIntGb, - // VsEncoding, // This is leaking memory +/// Rust codecs. Block codecs are wrapped in `CompositeCodec<_, VariableByte>`. +pub static RUST: &[CodecEntry] = codec_list!(FastPFor256, FastPFor128, VariableByte, JustCopy,); + +/// C++ codecs (any-length; block codecs are already composites in the C++ library). +pub static CPP: &[CodecEntry] = codec_list!( + CppBP32, + CppCopy, + CppFastBinaryPacking8, + CppFastPFor128, + CppFastPFor256, + CppFastBinaryPacking16, + CppFastBinaryPacking32, + CppMaskedVByte, + CppNewPFor, + CppOptPFor, + CppPFor2008, + CppPFor, + CppSimdBinaryPacking, + CppSimdFastPFor128, + CppSimdFastPFor256, + CppSimdGroupSimple, + CppSimdGroupSimpleRingBuf, + CppSimdNewPFor, + CppSimdOptPFor, + CppSimdPFor, + CppSimdSimplePFor, + // Simple16 / Simple8b / Simple8bRle / Simple9 / Simple9Rle / SimplePFor: + // cannot encode arbitrary u32 values. + // Snappy / VarIntG8iu: conditional #ifdef in C++. + // VsEncoding: leaks memory. 
+ CppStreamVByte, + CppVByte, + CppVarInt, + CppVarIntGb, +); + +// ── Codec selector (Arbitrary) ───────────────────────────────────────────────── + +/// Selects a codec by index. `idx` is wrapped modulo the list length. +/// `use_cpp` switches between [`RUST`] and [`CPP`]. +#[derive(arbitrary::Arbitrary, Clone, Copy, Debug)] +pub struct AnyLenSelector { + pub idx: u8, + pub use_cpp: bool, } -impl From for BoxedCppCodec { - fn from(codec: CppCodec) -> Self { - match codec { - CppCodec::BP32 => Box::new(CppBP32::default()), - CppCodec::Copy => Box::new(CppCopy::default()), - CppCodec::FastBinaryPacking8 => Box::new(CppFastBinaryPacking8::default()), - CppCodec::FastPFor128 => Box::new(CppFastPFor128::default()), - CppCodec::FastPFor256 => Box::new(CppFastPFor256::default()), - CppCodec::FastBinaryPacking16 => Box::new(CppFastBinaryPacking16::default()), - CppCodec::FastBinaryPacking32 => Box::new(CppFastBinaryPacking32::default()), - CppCodec::MaskedVByte => Box::new(CppMaskedVByte::default()), - CppCodec::NewPFor => Box::new(CppNewPFor::default()), - CppCodec::OptPFor => Box::new(CppOptPFor::default()), - CppCodec::PFor2008 => Box::new(CppPFor2008::default()), - CppCodec::PFor => Box::new(CppPFor::default()), - CppCodec::SimdBinaryPacking => Box::new(CppSimdBinaryPacking::default()), - CppCodec::SimdFastPFor128 => Box::new(CppSimdFastPFor128::default()), - CppCodec::SimdFastPFor256 => Box::new(CppSimdFastPFor256::default()), - CppCodec::SimdGroupSimple => Box::new(CppSimdGroupSimple::default()), - CppCodec::SimdGroupSimpleRingBuf => Box::new(CppSimdGroupSimpleRingBuf::default()), - CppCodec::SimdNewPFor => Box::new(CppSimdNewPFor::default()), - CppCodec::SimdOptPFor => Box::new(CppSimdOptPFor::default()), - CppCodec::SimdPFor => Box::new(CppSimdPFor::default()), - CppCodec::SimdSimplePFor => Box::new(CppSimdSimplePFor::default()), - // CppCodec::Simple16 => Box::new(CppSimple16::default()), - // CppCodec::Simple8b => Box::new(CppSimple8b::default()), - // 
CppCodec::Simple8bRle => Box::new(CppSimple8bRle::default()), - // CppCodec::Simple9 => Box::new(CppSimple9::default()), - // CppCodec::Simple9Rle => Box::new(CppSimple9Rle::default()), - // CppCodec::SimplePFor => Box::new(CppSimplePFor::default()), - // CppCodec::Snappy => Box::new(CppSnappy::default()), - CppCodec::StreamVByte => Box::new(CppStreamVByte::default()), - CppCodec::VByte => Box::new(CppVByte::default()), - CppCodec::VarInt => Box::new(CppVarInt::default()), - // CppCodec::VarIntG8iu => Box::new(CppVarIntG8iu::default()), - CppCodec::VarIntGb => Box::new(CppVarIntGb::default()), - // CppCodec::VsEncoding => Box::new(CppVsEncoding::default()), - } - } +/// Instantiate a codec, returning `(name, codec)`. +pub fn instantiate_anylen_codec(sel: AnyLenSelector) -> (&'static str, AnyLen) { + let list = if sel.use_cpp { CPP } else { RUST }; + let (name, make) = list[sel.idx as usize % list.len()]; + (name, make()) } -pub struct HexSlice<'a>(pub &'a [u32]); +// ── Encode compare: Rust vs C++ bit-identical pairs ──────────────────────────── -impl<'a> std::fmt::Debug for HexSlice<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - const MAX: usize = 20; +/// A pair of codecs that should produce bit-identical compressed output. +/// `rust` = Rust implementation, `cpp` = C++ implementation +#[derive(Clone, Copy)] +pub struct CodecPair { + pub name: &'static str, + pub make_rust: fn() -> AnyLen, + pub make_cpp: fn() -> AnyLen, +} - let total = self.0.len(); - let shown = total.min(MAX); +macro_rules! 
codec_pair { + ($name:expr, $rust:ty, $cpp:ty) => { + CodecPair { + name: $name, + make_rust: || Box::new(<$rust>::default()), + make_cpp: || Box::new(<$cpp>::default()), + } + }; + ($name:expr, $rust:ty, $cpp:ty, $cpp_alt:ty) => { + CodecPair { + name: $name, + make_rust: || Box::new(<$rust>::default()), + make_cpp: || Box::new(<$cpp>::default()), + } + }; +} - let mut list = f.debug_list(); +/// Pairs of Rust and C++ codecs expected to produce bit-identical output. +pub static ENCODE_COMPARE_PAIRS: &[CodecPair] = &[ + codec_pair!("FastPFor128", FastPFor128, CppFastPFor128), + codec_pair!("FastPFor256", FastPFor256, CppFastPFor256), + codec_pair!("VariableByte", VariableByte, CppVarInt), + codec_pair!("JustCopy", JustCopy, CppCopy), +]; - for v in &self.0[..shown] { - list.entry(&format_args!("{:#010x}", v)); - } +/// Optional pair filter: if set, only the named pair is tested. +/// Set `FUZZ_PAIR=FastPFor128` or `FUZZ_ENCODE_COMPARE_PAIR=FastPFor128` to restrict. +pub fn encode_compare_pair_filter() -> Option { + std::env::var("FUZZ_PAIR") + .ok() + .or_else(|| std::env::var("FUZZ_ENCODE_COMPARE_PAIR").ok()) +} - if total > MAX { - list.entry(&format_args!(".. out of {} total", total)); +/// Resolve a pair index to a `CodecPair`, applying filter and alternative substitution. +pub fn resolve_encode_compare_pair(idx: u8) -> Option { + let filter = encode_compare_pair_filter(); + let pairs = ENCODE_COMPARE_PAIRS; + let i = idx as usize % pairs.len(); + let pair = pairs[i]; + if let Some(ref f) = filter { + if !f.eq_ignore_ascii_case(pair.name) { + return None; } - - list.finish() } + Some(pair) +} + +/// Instantiate both codecs for a pair, using the alternative C++ when requested. 
+pub fn instantiate_pair(pair: CodecPair) -> (AnyLen, AnyLen) { + let rust_codec = (pair.make_rust)(); + let cpp_codec = (pair.make_cpp)(); + (rust_codec, cpp_codec) } diff --git a/fuzz/fuzz_targets/compare_fastpfor_128.rs b/fuzz/fuzz_targets/compare_fastpfor_128.rs new file mode 100644 index 0000000..ffb244d --- /dev/null +++ b/fuzz/fuzz_targets/compare_fastpfor_128.rs @@ -0,0 +1,64 @@ +#![no_main] + +//! Cross-decoder fuzz: encode with `CppFastPFor128`, match it with `FastPFor128` output, +//! then verify that `FastPFor128` (pure Rust) and `CppFastPFor128` both reproduce the original +//! input exactly. +//! +//! # Why `CppSimdFastPFor128` is excluded +//! +//! `CppSimdFastPFor128` (`SIMDFastPFor`) uses a **different wire format** for +//! block data than the scalar `CppFastPFor128` (`FastPFor`). The scalar codec +//! calls `fastpackwithoutmask`, which stores 32 consecutive values +//! sequentially into one 32-bit word (v[0] | v[1]<<1 | … | v[31]<<31). +//! +//! The SIMD codec calls `SIMD_fastpack_32` / `simdpack`, which uses SSE2 to +//! process four 32-bit values simultaneously across the four 128-bit lanes of +//! an `__m128i`. For `bit=1` this produces the **transposed** layout: +//! +//! lane 0 = v[0] | v[4]<<1 | v[8]<<2 | … | v[28]<<7 | … +//! lane 1 = v[1] | v[5]<<1 | v[9]<<2 | … | v[29]<<7 | … +//! … +//! +//! which is entirely different from the scalar word layout. +//! +//! Because the block bit-packing formats are incompatible, a stream produced +//! by `CppFastPFor128` cannot be correctly decoded by `CppSimdFastPFor128` +//! (and vice versa). 
+
+use fastpfor::cpp::CppFastPFor128;
+use fastpfor::{AnyLenCodec, FastPFor128};
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: Vec<u32>| {
+    let mut compressed = Vec::new();
+    CppFastPFor128::default()
+        .encode(&data, &mut compressed)
+        .expect("any data must be encodable");
+
+    let mut rust_compressed = Vec::new();
+    FastPFor128::default()
+        .encode(&data, &mut rust_compressed)
+        .expect("any data must be encodable");
+    assert_eq!(
+        compressed, rust_compressed,
+        "CppFastPFor128 and FastPFor128 produced different compressed output",
+    );
+
+    let mut rust_out = Vec::new();
+    FastPFor128::default()
+        .decode(&compressed, &mut rust_out, None)
+        .expect("FastPFor128 (Rust) failed to decode CppFastPFor128-encoded data");
+    assert_eq!(
+        rust_out, data,
+        "FastPFor128 (Rust) decoded output does not match original",
+    );
+
+    let mut cpp_out = Vec::new();
+    CppFastPFor128::default()
+        .decode(&compressed, &mut cpp_out, None)
+        .expect("CppFastPFor128 failed to decode its own encoded data");
+    assert_eq!(
+        cpp_out, data,
+        "CppFastPFor128 decoded output does not match original",
+    );
+});
diff --git a/fuzz/fuzz_targets/cpp_roundtrip.rs b/fuzz/fuzz_targets/cpp_roundtrip.rs
index a0559cf..9be7685 100644
--- a/fuzz/fuzz_targets/cpp_roundtrip.rs
+++ b/fuzz/fuzz_targets/cpp_roundtrip.rs
@@ -1,31 +1,31 @@
 #![no_main]
 
+//! Fuzz C++ codec roundtrip: compress then decompress and assert equality.
+
 use libfuzzer_sys::fuzz_target;
 mod common;
-use common::*;
+use common::{AnyLenSelector, FuzzInput, instantiate_anylen_codec};
 
-fuzz_target!(|data: FuzzInput<CppCodec>| {
-    let mut codec = BoxedCppCodec::from(data.codec);
-    let input = data.data;
+fuzz_target!(|data: FuzzInput<AnyLenSelector>| {
+    // Only exercise C++ codecs in this target; Rust self-roundtrip is in compress_oracle.
+ if !data.codec.use_cpp { + return; + } + let (name, mut codec) = instantiate_anylen_codec(data.codec); + let input = &data.data; let mut compressed = Vec::new(); - codec.encode(&input, &mut compressed).unwrap(); + codec + .encode(input, &mut compressed) + .expect("C++ compression failed"); - let mut decoded = Vec::new(); + let mut decompressed = Vec::new(); codec - .decode(&compressed, &mut decoded, None) - .expect("decode"); + .decode(&compressed, &mut decompressed, None) + .expect("C++ decompression failed"); - // Verify roundtrip - if decoded.len() + input.len() < 200 { - assert_eq!(input, decoded.as_slice(), "Decompressed output mismatches"); - } else { - assert_eq!(decoded.len(), input.len(), "Decompressed length mismatch"); - for (i, (&original, &out)) in input.iter().zip(decoded.iter()).enumerate() { - assert_eq!( - original, out, - "Mismatch at position {i}: expected {original}, got {out}" - ); - } - } + assert_eq!( + decompressed, *input, + "C++ roundtrip mismatch for codec {name}", + ); }); diff --git a/fuzz/fuzz_targets/decode_arbitrary.rs b/fuzz/fuzz_targets/decode_arbitrary.rs new file mode 100644 index 0000000..6eda4c9 --- /dev/null +++ b/fuzz/fuzz_targets/decode_arbitrary.rs @@ -0,0 +1,61 @@ +#![no_main] + +//! Fuzz the Rust FastPFOR decoder against **arbitrary** (potentially malformed) compressed bytes. +//! +//! Why this target is needed +//! ------------------------- +//! The existing `compress_oracle` target only feeds *well-formed* data to the Rust +//! decoder (it first compresses valid input, then decompresses). +//! That means corrupted or truncated compressed streams never reach the decoder, so +//! out-of-bounds panics in `decode_page` are invisible to the fuzzer. +//! +//! This target removes any oracle entirely: arbitrary bytes are reinterpreted as `u32` words +//! and handed straight to the Rust decoder. The only contract we enforce is: +//! +//! 
* A successful `Ok(...)` must produce the expected output (we don't verify correctness,
+//! only that no panic occurs).
+//! * An `Err(...)` is acceptable — the decoder is allowed to reject garbage input.
+//! * A **panic** is never acceptable.
+
+use arbitrary::Arbitrary;
+use libfuzzer_sys::fuzz_target;
+mod common;
+use common::{AnyLenSelector, instantiate_anylen_codec};
+
+/// Fuzz input: raw compressed bytes plus the codec selector.
+#[derive(Arbitrary, Debug)]
+struct FuzzInput {
+    /// Raw bytes that will be reinterpreted as `&[u32]` compressed data.
+    compressed_bytes: Vec<u8>,
+    codec: AnyLenSelector,
+}
+
+/// Maximum number of `u32` words to feed to the decoder.
+/// Keeps allocations bounded even with malicious `n_blocks` headers.
+const MAX_COMPRESSED_WORDS: usize = 4096;
+
+fuzz_target!(|data: FuzzInput| {
+    // Only fuzz Rust codecs — C++ panics on malformed input are out of scope.
+    if data.codec.use_cpp {
+        return;
+    }
+
+    // Align the byte slice to u32 by zero-padding to the next 4-byte boundary.
+    let mut bytes = data.compressed_bytes;
+    let rem = bytes.len() % 4;
+    if rem != 0 {
+        bytes.resize(bytes.len() + (4 - rem), 0);
+    }
+
+    let compressed: Vec<u32> = bytes
+        .chunks_exact(4)
+        .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]]))
+        .take(MAX_COMPRESSED_WORDS)
+        .collect();
+
+    let (_name, mut codec) = instantiate_anylen_codec(data.codec);
+
+    // The decoder must either succeed or return an error — a panic is a bug.
+    let mut output = Vec::new();
+    let _ = codec.decode(&compressed, &mut output, None);
+});
diff --git a/fuzz/fuzz_targets/decode_oracle.rs b/fuzz/fuzz_targets/decode_oracle.rs
new file mode 100644
index 0000000..980bcd5
--- /dev/null
+++ b/fuzz/fuzz_targets/decode_oracle.rs
@@ -0,0 +1,65 @@
+#![no_main]
+
+//! Cross-codec oracle: independent Rust and C++ roundtrips, decompressed values must match.
+//!
+//! Uses matching Rust/C++ pairs from [`RUST`] and [`CPP`] (by the same index).
+//!
Both sides compress the input independently and decompress independently; +//! the decompressed output from each must equal the original input. +//! +//! Both sides use the same wire format so only the final decompressed values are compared. + +use libfuzzer_sys::fuzz_target; +mod common; +use common::{CPP, FuzzInput, RUST}; + +/// Selects a matching Rust/C++ pair by a single index into the shorter of the +/// two lists. +#[derive(arbitrary::Arbitrary, Clone, Copy, Debug)] +struct CompatSelector { + idx: u8, +} + +fuzz_target!(|data: FuzzInput| { + let input = &data.data; + if input.is_empty() { + return; + } + + // Use the same index for both lists; clamp to the shorter list. + let n = RUST.len().min(CPP.len()); + let i = data.codec.idx as usize % n; + let (rust_name, make_rust) = RUST[i]; + let (cpp_name, make_cpp) = CPP[i]; + + let mut rust_codec = make_rust(); + let mut cpp_codec = make_cpp(); + + // Rust roundtrip + let mut rust_compressed = Vec::new(); + if rust_codec.encode(input, &mut rust_compressed).is_err() { + return; + } + let mut rust_decompressed = Vec::new(); + rust_codec + .decode(&rust_compressed, &mut rust_decompressed, None) + .expect("Rust decompress of self-compressed data must not fail"); + + // C++ roundtrip (independent oracle) + let mut cpp_compressed = Vec::new(); + cpp_codec + .encode(input, &mut cpp_compressed) + .expect("C++ compression failed"); + let mut cpp_decompressed = Vec::new(); + cpp_codec + .decode(&cpp_compressed, &mut cpp_decompressed, None) + .expect("C++ decompression failed"); + + assert_eq!( + rust_decompressed, *input, + "Rust roundtrip failed for codec {rust_name}", + ); + assert_eq!( + cpp_decompressed, *input, + "C++ roundtrip failed for codec {cpp_name}", + ); +}); diff --git a/fuzz/fuzz_targets/encode_compare.rs b/fuzz/fuzz_targets/encode_compare.rs new file mode 100644 index 0000000..ad4835f --- /dev/null +++ b/fuzz/fuzz_targets/encode_compare.rs @@ -0,0 +1,62 @@ +#![no_main] + +//! 
Fuzz target: encode the same input with Rust and C++ implementations and assert bit-identical output. +//! +//! Codec pairs (Rust vs C++) expected to produce identical compressed bytes: +//! - FastPFor128 vs CppFastPFor128 +//! - FastPFor256 vs CppFastPFor256 +//! - VariableByte vs CppVarInt +//! - JustCopy vs CppCopy + +use libfuzzer_sys::fuzz_target; +mod common; +use common::{FuzzInput, instantiate_pair, resolve_encode_compare_pair}; + +#[derive(arbitrary::Arbitrary, Debug)] +struct PairSelector { + idx: u8, +} + +fuzz_target!(|data: FuzzInput| { + let Some(pair) = resolve_encode_compare_pair(data.codec.idx) else { + return; + }; + + let (mut rust_codec, mut cpp_codec) = instantiate_pair(pair); + + let mut rust_out = Vec::new(); + rust_codec + .encode(&data.data, &mut rust_out) + .expect("Rust encode failed"); + + let mut cpp_out = Vec::new(); + cpp_codec + .encode(&data.data, &mut cpp_out) + .expect("C++ encode must not fail when Rust encode succeeded"); + + assert_eq!( + rust_out, cpp_out, + "Bit-identical output failed for pair {}: Rust and C++ compressed output differ", + pair.name, + ); + + let mut decoded = Vec::new(); + rust_codec + .decode(&rust_out, &mut decoded, None) + .expect("Rust decode of self-compressed data must not fail"); + assert_eq!( + decoded, data.data, + "Rust roundtrip failed for pair {}: decompressed value differs from original input", + pair.name, + ); + + decoded.truncate(0); + cpp_codec + .decode(&cpp_out, &mut decoded, None) + .expect("C++ decode of self-compressed data must not fail"); + assert_eq!( + decoded, data.data, + "C++ roundtrip failed for pair {}: decompressed value differs from original input", + pair.name, + ); +}); diff --git a/fuzz/fuzz_targets/encode_oracle.rs b/fuzz/fuzz_targets/encode_oracle.rs new file mode 100644 index 0000000..2b63de9 --- /dev/null +++ b/fuzz/fuzz_targets/encode_oracle.rs @@ -0,0 +1,35 @@ +#![no_main] + +//! Fuzz the pure-Rust `AnyLenCodec` implementations for self-consistency. +//! +//! 
Every codec must satisfy the fundamental invariant: compress(x) then
+//! decompress(compress(x)) == x. This target exercises all Rust codecs
+//! against arbitrary input.
+
+use libfuzzer_sys::fuzz_target;
+mod common;
+use common::{AnyLenSelector, FuzzInput, instantiate_anylen_codec};
+
+fuzz_target!(|data: FuzzInput<AnyLenSelector>| {
+    // Only exercise Rust codecs in this target; C++ roundtrip is in cpp_roundtrip.
+    if data.codec.use_cpp {
+        return;
+    }
+    let (name, mut codec) = instantiate_anylen_codec(data.codec);
+
+    let input = &data.data;
+    let mut compressed = Vec::new();
+    if codec.encode(input, &mut compressed).is_err() {
+        return;
+    }
+
+    let mut decompressed = Vec::new();
+    codec
+        .decode(&compressed, &mut decompressed, None)
+        .expect("Rust decompress of self-compressed data must not fail");
+
+    assert_eq!(
+        decompressed, *input,
+        "Rust roundtrip mismatch for codec {name}",
+    );
+});
diff --git a/fuzz/fuzz_targets/rust_compress_oracle.rs b/fuzz/fuzz_targets/rust_compress_oracle.rs
deleted file mode 100644
index 291f9b5..0000000
--- a/fuzz/fuzz_targets/rust_compress_oracle.rs
+++ /dev/null
@@ -1,79 +0,0 @@
-#![no_main]
-
-use fastpfor::{AnyLenCodec, CodecToSlice, rust};
-use libfuzzer_sys::fuzz_target;
-mod common;
-use common::*;
-use fastpfor::cpp::*;
-
-fuzz_target!(|data: FuzzInput<RustCodec>| {
-    let input = data.data;
-
-    // TODO: Behaviour differs
-    if input.is_empty() {
-        return;
-    }
-
-    // TODO: To make the encoder not crash -> Skip inputs smaller than block size
-    let block_size = match data.codec {
-        RustCodec::FastPFOR256 => 256,
-        RustCodec::FastPFOR128 => 128,
-        RustCodec::VariableByte => 1,
-        RustCodec::JustCopy => 1,
-    };
-    if input.len() < block_size {
-        return;
-    }
-
-    // TODO: To make the encoder not crash -> Truncate to block size multiple
-    let last_block_size_multiple = input.len() / block_size * block_size;
-    let input = &input[..last_block_size_multiple];
-
-    // Allocate output buffer for Rust (slice API)
-    let mut rust_compressed = vec![0u32;
input.len() * 2 + 1024]; - - // Compress with Rust implementation using Codec wrapper - let mut rust_codec = rust::Codec::from(data.codec); - let rust_result = rust_codec - .compress_to_slice(input, &mut rust_compressed) - .expect("Rust compression failed"); - - // Compress with C++ implementation (`AnyLenCodec` / Vec API) - let mut cpp_compressed = Vec::new(); - match data.codec { - RustCodec::FastPFOR256 => CppFastPFor256::new() - .encode(input, &mut cpp_compressed) - .expect("C++ compression failed"), - RustCodec::FastPFOR128 => CppFastPFor128::new() - .encode(input, &mut cpp_compressed) - .expect("C++ compression failed"), - RustCodec::VariableByte => CppMaskedVByte::new() - .encode(input, &mut cpp_compressed) - .expect("C++ compression failed"), - RustCodec::JustCopy => CppCopy::new() - .encode(input, &mut cpp_compressed) - .expect("C++ compression failed"), - } - let compressed_oracle_from_cpp = cpp_compressed.as_slice(); - - // Compare compressed outputs - assert_eq!( - rust_result.len(), - compressed_oracle_from_cpp.len(), - "Compressed length mismatch: Rust={}, C++={}", - rust_result.len(), - compressed_oracle_from_cpp.len() - ); - - for (i, (&rust_val, &cpp_val)) in rust_result - .iter() - .zip(compressed_oracle_from_cpp.iter()) - .enumerate() - { - assert_eq!( - rust_val, cpp_val, - "Compressed data mismatch at position {}: Rust={}, C++={}", - i, rust_val, cpp_val - ); - } -}); diff --git a/fuzz/fuzz_targets/rust_decompress_arbitrary.rs b/fuzz/fuzz_targets/rust_decompress_arbitrary.rs deleted file mode 100644 index 2f637ec..0000000 --- a/fuzz/fuzz_targets/rust_decompress_arbitrary.rs +++ /dev/null @@ -1,81 +0,0 @@ -#![no_main] - -//! Fuzz the Rust FastPFOR decoder against **arbitrary** (potentially malformed) compressed bytes. -//! -//! Why this target is needed -//! ------------------------- -//! The existing `rust_decompress_oracle` target only ever feeds *well-formed* data to the Rust -//! 
decoder (it first compresses valid input with the C++ oracle, then decompresses with Rust). -//! That means corrupted or truncated compressed streams never reach the decoder, so out-of-bounds -//! index panics in `decode_page` are invisible to the fuzzer. -//! -//! This target removes the C++ oracle entirely: arbitrary bytes are reinterpreted as `u32` words -//! and handed straight to the Rust decoder. The only contract we enforce is: -//! -//! * A successful `Ok(...)` must produce exactly `expected_len` decompressed integers. -//! * An `Err(...)` is also acceptable — the decoder is allowed to reject garbage input. -//! * A **panic** is never acceptable. -//! -//! Running this target against the `main` branch will reproduce the panic; -//! running it against the `dont-panic` branch will produce only `Ok`/`Err` outcomes. - -use arbitrary::Arbitrary; -use fastpfor::rust::{BLOCK_SIZE_128, BLOCK_SIZE_256, DEFAULT_PAGE_SIZE, FastPFOR, VariableByte}; -use fastpfor::{CodecToSlice, rust}; -use libfuzzer_sys::fuzz_target; - -/// Which Rust FastPFOR codec variant to exercise. -#[derive(Arbitrary, Clone, Copy, Debug)] -enum RustFastPForCodec { - FastPFOR256, - FastPFOR128, - VariableByte, -} - -/// Fuzz input: raw compressed bytes plus the codec selector and the expected decompressed length. -#[derive(Arbitrary, Debug)] -struct FuzzInput { - /// Raw bytes that will be reinterpreted as `&[u32]` compressed data. - compressed_bytes: Vec, - /// How many `u32` values the decoder should attempt to produce. - /// Capped inside the target to avoid enormous allocations. - expected_len: u16, - codec: RustFastPForCodec, -} - -fuzz_target!(|data: FuzzInput| { - // Align the byte slice to u32 by zero-padding to the next 4-byte boundary. - let mut bytes = data.compressed_bytes; - let rem = bytes.len() % 4; - if rem != 0 { - bytes.resize(bytes.len() + (4 - rem), 0); - } - - // Safe reinterpret: bytemuck requires the slice to be properly aligned and sized. 
- // We just constructed a Vec that is a multiple of 4 bytes. - let compressed: Vec = bytes - .chunks_exact(4) - .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]])) - .collect(); - - // Cap the output length to prevent huge allocations while still exercising non-trivial sizes. - const MAX_LEN: usize = 4096; - let expected_len = (data.expected_len as usize).min(MAX_LEN); - let mut output = vec![0u32; expected_len]; - - // Build the codec under test. - let mut codec: rust::Codec = match data.codec { - RustFastPForCodec::FastPFOR256 => { - rust::Codec::from(FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_256)) - } - RustFastPForCodec::FastPFOR128 => { - rust::Codec::from(FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128)) - } - RustFastPForCodec::VariableByte => rust::Codec::from(VariableByte::new()), - }; - - // The decoder must either succeed or return an error. A panic is a bug. - // We are ok if not all data is consumed because it tries to parse - // garbage - as long as we don't panic, we are good - let _ = codec.decompress_to_slice(&compressed, &mut output); -}); diff --git a/fuzz/fuzz_targets/rust_decompress_oracle.rs b/fuzz/fuzz_targets/rust_decompress_oracle.rs deleted file mode 100644 index 4807cd8..0000000 --- a/fuzz/fuzz_targets/rust_decompress_oracle.rs +++ /dev/null @@ -1,73 +0,0 @@ -#![no_main] - -use fastpfor::{AnyLenCodec, CodecToSlice, rust}; -use libfuzzer_sys::fuzz_target; -mod common; -use common::*; -use fastpfor::cpp::*; - -fuzz_target!(|data: FuzzInput| { - let input = data.data; - - // TODO: Behaviour differs - if input.is_empty() { - return; - } - - // TODO: To make the decoder not crash -> Skip inputs smaller than block size - let block_size = match data.codec { - RustCodec::FastPFOR256 => 256, - RustCodec::FastPFOR128 => 128, - RustCodec::VariableByte => 1, - RustCodec::JustCopy => 1, - }; - if input.len() < block_size { - return; - } - - // TODO: To make the decoder not crash -> Truncate to block size multiple - let last_block_size_multiple = 
input.len() / block_size * block_size; - let input = &input[..last_block_size_multiple]; - - // First, compress with C++ implementation to get valid compressed data - let mut cpp_compressed = Vec::new(); - match data.codec { - RustCodec::FastPFOR256 => CppFastPFor256::new() - .encode(input, &mut cpp_compressed) - .expect("C++ compression failed"), - RustCodec::FastPFOR128 => CppFastPFor128::new() - .encode(input, &mut cpp_compressed) - .expect("C++ compression failed"), - RustCodec::VariableByte => CppMaskedVByte::new() - .encode(input, &mut cpp_compressed) - .expect("C++ compression failed"), - RustCodec::JustCopy => CppCopy::new() - .encode(input, &mut cpp_compressed) - .expect("C++ compression failed"), - } - let compressed_oracle_from_cpp = cpp_compressed.as_slice(); - - // Now decompress with rust - let mut rust_decompressed = vec![0u32; input.len()]; - let mut rust_codec = rust::Codec::from(data.codec); - let rust_result = rust_codec - .decompress_to_slice(compressed_oracle_from_cpp, &mut rust_decompressed) - .expect("Rust decompression failed"); - - // Compare decompressed outputs - assert_eq!( - rust_result.len(), - input.len(), - "Decompressed length mismatch: Rust={}, C++={}", - rust_result.len(), - input.len() - ); - - for (i, (&rust_val, &cpp_val)) in rust_result.iter().zip(input.iter()).enumerate() { - assert_eq!( - rust_val, cpp_val, - "Decompressed data mismatch at position {}: Rust={}, C++={}", - i, rust_val, cpp_val - ); - } -}); diff --git a/fuzz/justfile b/fuzz/justfile index 35533bf..22cac61 100755 --- a/fuzz/justfile +++ b/fuzz/justfile @@ -1,6 +1,6 @@ #!/usr/bin/env just --justfile # Fuzz testing recipes. All commands must be run from the repo root: -# just fuzz::run rust_compress_oracle +# just fuzz::run encode_oracle # cargo-fuzz requires nightly Rust and must be run from inside the fuzz/ directory. # How to call the current just executable. Note that just_executable() may have `\` in Windows paths, so we need to quote it. 
@@ -31,21 +31,23 @@ run-time target seconds='60' *args:
 run-iters target iters='10000' *args:
     JUST_FUZZ_EXTRA_ARGS='-runs={{iters}}' {{just}} run {{target}} {{args}}
 
-# Run rust_compress_oracle (Rust only, no C++ required)
-rust-compress *args: (run 'rust_compress_oracle' args)
+# Run encode_oracle (pure Rust roundtrip, no C++ required)
+rust-encode *args: (run 'encode_oracle' args)
 
-# Run rust_decompress_oracle (uses C++ as oracle)
-rust-decompress *args: (run 'rust_decompress_oracle' args)
+# Run decode_oracle (parallel Rust + C++ roundtrips, cross-checks decoded values)
+rust-decode *args: (run 'decode_oracle' args)
 
-# Feed arbitrary bytes directly to the Rust decompressor; runs=0 means run indefinitely (Ctrl-C to stop)
-rust-decompress-arbitrary *args: (run 'rust_decompress_arbitrary' args)
+# Feed arbitrary bytes directly to the Rust decoder (no panic check)
+rust-decode-arbitrary *args: (run 'decode_arbitrary' args)
 
 # Run cpp_roundtrip (C++ roundtrip)
 cpp-roundtrip *args: (run 'cpp_roundtrip' args)
 
-# Reproduce a specific crash artifact
-repro target artifact:
-    cargo +nightly fuzz run {{target}} {{artifact}}
+# Run compare_fastpfor_128 (compare Rust vs C++ FastPFor128 block encoding)
+compare-fastpfor-128 *args: (run 'compare_fastpfor_128' args)
+
+# Run encode_compare (Rust vs C++ bit-identical encode)
+full-compare *args: (run 'encode_compare' args)
 
 # Run a single pass of every fuzz target (CI smoke test; stops after 1 iteration each).
ci-test: diff --git a/justfile b/justfile index 21c9940..5fc804e 100755 --- a/justfile +++ b/justfile @@ -85,7 +85,7 @@ fmt: #!/usr/bin/env bash set -euo pipefail for dir in "./" "fuzz"; do - pushd "$dir" + cd "$dir" if (rustup toolchain list | grep nightly && rustup component list --toolchain nightly | grep rustfmt) &> /dev/null; then echo "Reformatting Rust code using nightly Rust fmt to sort imports in $dir" cargo +nightly fmt --all -- --config imports_granularity=Module,group_imports=StdExternalCrate @@ -93,7 +93,9 @@ fmt: echo "Reformatting Rust with the stable cargo fmt in $dir. Install nightly with \`rustup install nightly\` for better results" cargo fmt --all fi - popd + if [ -f .git ]; then + cd .. + fi done # Reformat all Cargo.toml files using cargo-sort diff --git a/src/codec.rs b/src/codec.rs index 8bff34c..1a021e6 100644 --- a/src/codec.rs +++ b/src/codec.rs @@ -62,6 +62,9 @@ pub trait BlockCodec { /// Decompress blocks from `input`, using the length stored in the header. /// + /// Returns the number of input `u32` words consumed, so the caller (e.g. + /// [`CompositeCodec`]) can locate the tail without parsing the block format. + /// /// When `expected_len` is `Some(n)`: /// - Validates that the header value equals `n` (must be a multiple of /// [`size`](BlockCodec::size)). diff --git a/src/cpp/codecs.rs b/src/cpp/codecs.rs index 2ac6750..0c8af9d 100644 --- a/src/cpp/codecs.rs +++ b/src/cpp/codecs.rs @@ -12,7 +12,7 @@ use crate::cpp::wrappers::{ // Single macro: all C++ codecs implement AnyLenCodec. Codecs marked with `@ 64` // also implement BlockCodec64 for 64-bit integer support. -/// Macro for C++ codec wrappers: struct + Default + [`AnyLenCodec`]. +/// Macro for C++ codec wrappers: struct + Default + `AnyLenCodec`. macro_rules! implement_cpp_codecs { ($( $(#[$($attrs:tt)*])* @@ -139,7 +139,7 @@ implement_cpp_codecs! 
{ // CppSnappy => snappy_codec, // Conditional with #ifdef - /// [`CppStreamVByte`](https://github.com/lemire/streamvbyte) encoding for fast variable-byte compression. + /// [`StreamVByte`](https://github.com/lemire/streamvbyte) encoding for fast variable-byte compression. CppStreamVByte => streamvbyte_codec, /// Standard variable-byte encoding. @@ -192,7 +192,7 @@ pub(crate) mod tests { } /// C++ `fastpfor256_codec` returns `CompositeCodec, VariableByte>` — already - /// any-length. Use it directly; do not wrap in Rust `CppComposite`. + /// any-length. Use it directly; do not wrap in Rust `CompositeCodec`. #[test] fn test_cpp_fastpfor256_composite_anylen() { let mut codec = CppFastPFor256::new(); diff --git a/src/cpp/tests.rs b/src/cpp/tests.rs index 4845c0a..ce29230 100644 --- a/src/cpp/tests.rs +++ b/src/cpp/tests.rs @@ -1,31 +1,31 @@ use crate::cpp::codecs::tests::roundtrip_32; -/// Test all codecs compile and do a basic 32-bit roundtrip +// Test all codecs compile and do a basic 32-bit roundtrip macro_rules! test_anylen { - ($($name:ident),* $(,)?) => { - $( - #[test] - #[allow(non_snake_case)] - fn $name() { - roundtrip_32(&mut $crate::cpp::$name::new(), &[1u32, 2, 3, 4, 5]); - } - )* - }; -} + ($($name:ident),*) => { + $( + #[test] + #[allow(non_snake_case)] + fn $name() { + roundtrip_32(&mut $crate::cpp::$name::new(), &[1u32, 2, 3, 4, 5]); + } + )* + }; + } test_anylen!( CppBP32, CppCopy, + CppFastBinaryPacking8, CppFastBinaryPacking16, CppFastBinaryPacking32, - CppFastBinaryPacking8, CppFastPFor128, CppFastPFor256, CppMaskedVByte, CppNewPFor, CppOptPFor, - CppPFor, CppPFor2008, + CppPFor, CppSimdBinaryPacking, CppSimdFastPFor128, CppSimdFastPFor256, @@ -39,13 +39,13 @@ test_anylen!( CppStreamVByte, CppVByte, CppVarInt, - CppVarIntGb, + CppVarIntGb ); -/// Simple-9/16/8b codecs require values that fit in small bit widths and a -/// block-aligned count; test them separately with 128 small values. 
+// Simple-9/16/8b codecs require values that fit in small bit widths and a +// block-aligned count; test them separately with 128 small values. macro_rules! test_anylen_128 { - ($($name:ident),* $(,)?) => { + ($($name:ident),*) => { $( #[test] #[allow(non_snake_case)] @@ -57,37 +57,45 @@ macro_rules! test_anylen_128 { }; } -// Note: Simple9Rle crashes with heap corruption on various inputs; skip everywhere. -test_anylen_128!(CppSimple16, CppSimple8b, CppSimple9, CppSimple8bRle); +// Note: CppSimple9Rle crashes with heap corruption on various inputs; skip everywhere. +test_anylen_128!(CppSimple16, CppSimple8b, CppSimple9); + +// CppSimple8bRle reinterpret-casts uint32_t* → uint64_t* inside the C++ header, +// which is UB on strict-alignment architectures (ARM64 requires 8-byte alignment +// for 64-bit loads/stores and will SIGSEGV on unaligned access). The codec is +// otherwise correct on x86/x86_64 where unaligned access is handled in hardware. +// Tracked upstream; skip on aarch64 until fixed in the submodule. +// #[cfg(not(target_arch = "aarch64"))] +test_anylen_128!(CppSimple8bRle); // Verify Default impl routes through new() for all generated codec types. macro_rules! test_default { - ($($name:ident),* $(,)?) => { - $( - #[test] - #[allow(non_snake_case)] - fn $name() { - let _codec = $crate::cpp::$name::default(); - } - )* - }; -} + ($($name:ident),*) => { + $( + #[test] + #[allow(non_snake_case)] + fn $name() { + let _codec = $crate::cpp::$name::default(); + } + )* + }; + } -/// Use a distinct prefix to avoid name collisions with `test_anylen` tests. +// Use a distinct prefix to avoid name collisions with test_anylen tests. 
mod default_impls { test_default!( CppBP32, CppCopy, + CppFastBinaryPacking8, CppFastBinaryPacking16, CppFastBinaryPacking32, - CppFastBinaryPacking8, CppFastPFor128, CppFastPFor256, CppMaskedVByte, CppNewPFor, CppOptPFor, - CppPFor, CppPFor2008, + CppPFor, CppSimdBinaryPacking, CppSimdFastPFor128, CppSimdFastPFor256, @@ -101,11 +109,15 @@ mod default_impls { CppSimple8b, CppSimple8bRle, CppSimple9, - CppSimple9Rle, CppSimplePFor, CppStreamVByte, CppVByte, CppVarInt, - CppVarIntGb, + CppVarIntGb ); } + +mod default_impls2 { + // #[cfg(not(target_arch = "aarch64"))] + test_default!(CppSimple9Rle); +} diff --git a/src/error.rs b/src/error.rs index 522b2a1..d97b2e6 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,8 +1,5 @@ use thiserror::Error; -/// Alias for the result type of `FastPFor` operations. -pub type FastPForResult = Result; - /// Errors that can occur when using the `FastPFor` codecs. #[non_exhaustive] #[derive(Error, Debug)] @@ -23,6 +20,15 @@ pub enum FastPForError { #[error("Invalid input length {0}")] InvalidInputLength(usize), + /// Page size is not a multiple of the block size + #[error("Page size {page_size} is not a multiple of block size {block_size}")] + InvalidPageSize { + /// The page size that was provided + page_size: u32, + /// The block size that the page size must be a multiple of + block_size: u32, + }, + /// Error propagated from the C++ `FastPFOR` library #[cfg(feature = "cpp")] #[error("C++ exception: {0}")] diff --git a/src/helpers.rs b/src/helpers.rs index 8db9446..7acf2a4 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -17,7 +17,6 @@ pub trait AsUsize: Eq + Copy { fn as_usize(self) -> usize; #[inline] - #[cfg(feature = "cpp")] fn is_decoded_mismatch(self, expected: impl AsUsize) -> Result<(), FastPForError> { let actual = self.as_usize(); let expected = expected.as_usize(); @@ -30,7 +29,6 @@ pub trait AsUsize: Eq + Copy { /// Returns an error if `expected` exceeds `max`. 
#[inline] - #[cfg(feature = "cpp")] fn is_valid_expected(self, max: impl AsUsize) -> Result { let expected = self.as_usize(); let max = max.as_usize(); diff --git a/src/lib.rs b/src/lib.rs index a0cd734..d4a71e4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,26 +2,20 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -#[cfg(not(any(feature = "cpp", feature = "rust",)))] +#[cfg(not(any(feature = "cpp", feature = "rust")))] compile_error!("At least one of the features 'cpp' or 'rust' must be enabled"); // Error types are always available regardless of which codec features are enabled. mod error; -pub use error::{FastPForError, FastPForResult}; +pub use error::FastPForError; -// FIXME: need decide on the external API. Some ideas: -// - offer two sets of similar APIs - rust and cpp ffi -// - it will be possible to enable/disable each with a feature flag -// - introduce a new feature-agnostic API that will forward to either -// - if both are enabled, forward to the more stable (ffi probably) #[cfg(feature = "cpp")] /// Rust wrapper for the [`FastPFOR` C++ library](https://github.com/fast-pack/FastPFor) pub mod cpp; #[cfg(feature = "rust")] #[forbid(unsafe_code, reason = "Rust code must always be safe")] -/// Rust re-implementation of `FastPFor` (work in progress) -pub mod rust; +pub(crate) mod rust; mod codec; #[cfg(feature = "cpp")] @@ -30,38 +24,12 @@ pub use codec::{AnyLenCodec, BlockCodec, slice_to_blocks}; pub(crate) mod helpers; -/// Low-level compression interface using caller-provided buffers. -/// -/// Codecs write into pre-allocated slices and return a sub-slice showing exactly -/// what was written. Works across FFI boundaries and allows buffer reuse. 
-/// -/// # Type Parameters -/// -/// - `In`: Input data type (e.g., `u32` or `u64` for integer codecs) -/// - `Out`: Compressed output type (defaults to `In`, but may differ - e.g., -/// 64-bit integers compress to 32-bit words: `CodecToSlice`) -/// -/// # Buffer Sizing -/// -/// Caller must ensure output buffers are large enough. For compression, estimate -/// `input.len() * 2 + 1024`. For decompression, size depends on the codec. -pub trait CodecToSlice { - /// Error type returned by compression/decompression operations. - type Error; - - /// Compresses input into output buffer, returning slice of data written. - fn compress_to_slice<'out>( - &mut self, - input: &[In], - output: &'out mut [Out], - ) -> Result<&'out [Out], Self::Error>; - - /// Decompresses input into output buffer, returning slice of data written. - /// - /// Output size cannot be known in advance for some codecs (e.g., RLE). - fn decompress_to_slice<'out>( - &mut self, - input: &[Out], - output: &'out mut [In], - ) -> Result<&'out [In], Self::Error>; -} +// Re-export bytemuck::Pod so that users writing generic `BlockCodec` code +// can constrain their own `Block` associated-type bounds without a separate +// `bytemuck` dependency. +pub use bytemuck::Pod; +#[cfg(feature = "rust")] +pub use rust::{ + CompositeCodec, FastPFor, FastPFor128, FastPFor256, FastPForBlock128, FastPForBlock256, + JustCopy, VariableByte, +}; diff --git a/src/rust/composite.rs b/src/rust/composite.rs new file mode 100644 index 0000000..1e2e280 --- /dev/null +++ b/src/rust/composite.rs @@ -0,0 +1,258 @@ +//! [`CompositeCodec`]: chains a [`BlockCodec`] for aligned blocks with an +//! [`AnyLenCodec`] for the sub-block remainder. +//! +//! Rust-only: combines Rust block codecs with Rust tail codecs. Do not wrap C++ codecs. + +use crate::FastPForError; +use crate::codec::{AnyLenCodec, BlockCodec, slice_to_blocks}; +use crate::helpers::AsUsize; + +/// Combines a block-oriented codec with an arbitrary-length tail codec. 
+/// +/// `CompositeCodec` implements [`AnyLenCodec`]: it accepts any +/// input length, encodes the aligned prefix with `Blocks`, and the +/// sub-block remainder with `Tail`. +/// +/// **Rust-only:** Use only with Rust codecs (e.g. `FastPForBlock256`, `VariableByte`). +/// C++ block codecs are already any-length in the C++ library; use them directly. +/// +/// # Wire format (matches C++ `CompositeCodec`) +/// +/// ```text +/// [ Blocks encoded data... ] [ Tail encoded data... ] +/// ``` +/// +/// No composite-level header; the block codec's first word is its value count. +/// For tail-only input, C++ `FastPFor` writes 0, so we emit `[0][tail]`. +/// +/// # Example +/// +/// ```rust,ignore +/// use fastpfor::{AnyLenCodec, FastPFor256}; +/// +/// let data: Vec = (0..600).collect(); // 2 × 256 + 88 remainder +/// let codec = FastPFor256::default(); +/// +/// let mut encoded = Vec::new(); +/// codec.encode(&data, &mut encoded).unwrap(); +/// +/// let mut decoded = Vec::new(); +/// codec.decode(&encoded, &mut decoded, None).unwrap(); +/// assert_eq!(decoded, data); +/// ``` +pub struct CompositeCodec { + block: Blocks, + tail: Tail, +} + +impl Default for CompositeCodec +where + Blocks: BlockCodec + Default, + Tail: AnyLenCodec + Default, +{ + fn default() -> Self { + Self::new(Blocks::default(), Tail::default()) + } +} + +impl CompositeCodec { + /// Creates a new `CompositeCodec` from a block codec and a tail codec. + pub fn new(block: Blocks, tail: Tail) -> Self { + Self { block, tail } + } +} + +impl AnyLenCodec for CompositeCodec { + fn encode(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError> { + let (blocks, remainder) = slice_to_blocks::(input); + // C++ CompositeCodec: concatenate block + tail. Block codec writes length header (0 when empty). + self.block.encode_blocks(blocks, out)?; + self.tail.encode(remainder, out) + } + + /// Decode C++ format: `[block_data][tail_data]`. Block codec's first word = block value count. 
+ fn decode( + &mut self, + input: &[u32], + out: &mut Vec, + expected_len: Option, + ) -> Result<(), FastPForError> { + let start_len = out.len(); + let max = Self::max_decompressed_len(input.len()); + + if let Some(expected) = expected_len { + out.reserve(expected.is_valid_expected(max)?); + } + + if input.is_empty() { + // When input is empty, max_decompressed_len(0) == 0, so is_valid_expected + // already rejected any expected_len > 0 above. No mismatch check needed. + self.tail.decode(&[], out, None)?; + return Ok(()); + } + + let block_expected = expected_len.map(|v| { + let v = (v.as_usize() / Blocks::size()) * Blocks::size(); + u32::try_from(v).expect("block-aligned expected_len fits in u32") + }); + + let consumed = self.block.decode_blocks(input, block_expected, out)?; + // Decoder is expected to return valid data + let tail_input = &input[consumed..]; + self.tail.decode(tail_input, out, None)?; + + if let Some(n) = expected_len { + (out.len() - start_len).is_decoded_mismatch(n)?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::rust::{FastPForBlock128, FastPForBlock256, JustCopy, VariableByte}; + + fn roundtrip(codec: &mut C, data: &[u32]) { + let mut encoded = Vec::new(); + codec.encode(data, &mut encoded).unwrap(); + let mut decoded = Vec::new(); + codec.decode(&encoded, &mut decoded, None).unwrap(); + assert_eq!(decoded, data); + } + + #[test] + fn test_fastpfor256_vbyte_exact_two_blocks() { + let data: Vec = (0..512).collect(); + roundtrip( + &mut CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()), + &data, + ); + } + + #[test] + fn test_fastpfor256_vbyte_with_remainder() { + let data: Vec = (0..600).collect(); + roundtrip( + &mut CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()), + &data, + ); + } + + #[test] + fn test_fastpfor128_justcopy_with_remainder() { + let data: Vec = (0..300).collect(); + roundtrip( + &mut CompositeCodec::new(FastPForBlock128::default(), JustCopy::new()), 
+ &data, + ); + } + + #[test] + fn test_empty_input() { + roundtrip( + &mut CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()), + &[], + ); + } + + #[test] + fn test_decode_truly_empty_input() { + // Decoding a zero-length slice (not even a header word) must succeed with empty output. + let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); + let mut out = Vec::new(); + codec.decode(&[], &mut out, None).unwrap(); + assert!(out.is_empty()); + } + + #[test] + fn test_decode_empty_input_with_expected_zero() { + // Empty input with expected_len=0 must succeed. + let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); + let mut out = Vec::new(); + codec.decode(&[], &mut out, Some(0)).unwrap(); + assert!(out.is_empty()); + } + + #[test] + fn test_decode_empty_input_with_nonzero_expected_errors() { + // Empty input: max_decompressed_len(0) == 0, so any expected_len > 0 fails + // with ExpectedCountExceedsMax before decoding begins. + let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); + let err = codec.decode(&[], &mut Vec::new(), Some(5)).unwrap_err(); + assert!(matches!( + err, + FastPForError::ExpectedCountExceedsMax { + expected: 5, + max: 0 + } + )); + } + + #[test] + fn test_decode_huge_n_blocks_header_returns_error() { + // A corrupt header claiming ~1.6 M blocks must return an error rather + // than attempting a multi-gigabyte allocation. + // Regression: fuzzer found bytes [0x04, 0x35, 0x19] → u32 LE 0x00193504 = 1_651_460 + // fed to FastPFor256.decode caused an OOM via a ~2.5 GB Vec::resize. 
+ let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); + let mut out = Vec::new(); + let input = [0x0019_3504u32]; // n_blocks = 1_651_460, rest is empty + assert!(codec.decode(&input, &mut out, None).is_err()); + assert!(out.is_empty()); + } + + #[test] + fn test_sub_block_only() { + let data: Vec = (0..10).collect(); + roundtrip( + &mut CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()), + &data, + ); + } + + #[test] + fn test_decode_with_expected_len() { + let data: Vec = (0..600).collect(); + let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); + let mut encoded = Vec::new(); + codec.encode(&data, &mut encoded).unwrap(); + let mut decoded = Vec::new(); + codec.decode(&encoded, &mut decoded, Some(600)).unwrap(); + assert_eq!(decoded, data); + } + + #[test] + fn test_decode_expected_len_mismatch_errors() { + let data: Vec = (0..100).collect(); + let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); + let mut encoded = Vec::new(); + codec.encode(&data, &mut encoded).unwrap(); + let mut decoded = Vec::new(); + let err = codec.decode(&encoded, &mut decoded, Some(50)).unwrap_err(); + assert!(matches!( + err, + FastPForError::DecodedCountMismatch { + actual: 100, + expected: 50 + } + )); + } + + #[test] + fn test_decode_expected_len_exceeds_max_errors() { + let data: Vec = (0..10).collect(); + let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); + let mut encoded = Vec::new(); + codec.encode(&data, &mut encoded).unwrap(); + let mut decoded = Vec::new(); + let huge = + (CompositeCodec::::max_decompressed_len(encoded.len()) + + 1) as u32; + let err = codec + .decode(&encoded, &mut decoded, Some(huge)) + .unwrap_err(); + assert!(matches!(err, FastPForError::ExpectedCountExceedsMax { .. 
})); + } +} diff --git a/src/rust/integer_compression/codec.rs b/src/rust/integer_compression/codec.rs deleted file mode 100644 index e817b14..0000000 --- a/src/rust/integer_compression/codec.rs +++ /dev/null @@ -1,153 +0,0 @@ -use std::io::Cursor; - -use crate::CodecToSlice; -use crate::rust::{FastPFOR, FastPForResult, Integer, JustCopy, VariableByte}; - -/// Type-erased wrapper for compression codecs. -/// -/// Allows different codec types to be used interchangeably through a unified interface. -pub enum Codec { - /// [`FastPFOR`] compression codec - FastPFor(Box), - /// [`VariableByte`] compression codec - VariableByte(VariableByte), - /// Pass-through codec (no compression) - JustCopy(JustCopy), -} - -impl Integer for Codec { - fn compress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - match self { - Codec::FastPFor(fastpfor) => { - fastpfor.compress(input, input_length, input_offset, output, output_offset) - } - Codec::VariableByte(vb) => { - vb.compress(input, input_length, input_offset, output, output_offset) - } - Codec::JustCopy(jc) => { - jc.compress(input, input_length, input_offset, output, output_offset) - } - } - } - - fn uncompress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - match self { - Codec::FastPFor(fastpfor) => { - fastpfor.uncompress(input, input_length, input_offset, output, output_offset) - } - Codec::VariableByte(vb) => { - vb.uncompress(input, input_length, input_offset, output, output_offset) - } - Codec::JustCopy(jc) => { - jc.uncompress(input, input_length, input_offset, output, output_offset) - } - } - } -} - -impl CodecToSlice for Codec { - type Error = crate::rust::FastPForError; - - fn compress_to_slice<'out>( - &mut self, - input: &[u32], - output: &'out mut [u32], - ) -> Result<&'out [u32], 
Self::Error> { - let mut output_offset = Cursor::new(0); - let input_length = input - .len() - .try_into() - .map_err(|_| Self::Error::InvalidInputLength(input.len()))?; - - self.compress( - input, - input_length, - &mut Cursor::new(0), - output, - &mut output_offset, - )?; - - let written = output_offset.position() as usize; - Ok(&output[..written]) - } - - fn decompress_to_slice<'out>( - &mut self, - input: &[u32], - output: &'out mut [u32], - ) -> Result<&'out [u32], Self::Error> { - let mut output_offset = Cursor::new(0); - let input_length: u32 = input - .len() - .try_into() - .map_err(|_| Self::Error::InvalidInputLength(input.len()))?; - - self.uncompress( - input, - input_length, - &mut Cursor::new(0), - output, - &mut output_offset, - )?; - - let written = output_offset.position() as usize; - Ok(&output[..written]) - } -} - -impl From for Codec { - fn from(fastpfor: FastPFOR) -> Self { - Codec::FastPFor(Box::new(fastpfor)) - } -} - -impl From for Codec { - fn from(vb: VariableByte) -> Self { - Codec::VariableByte(vb) - } -} - -impl From for Codec { - fn from(jc: JustCopy) -> Self { - Codec::JustCopy(jc) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn supports_compress_to_slice() { - let data = vec![1, 2, 3, 4, 5]; - let mut rust_codec = Codec::from(VariableByte::new()); - let mut compressed = vec![0u32; data.len() * 4]; - - let compressed_len = { - let result = rust_codec - .compress_to_slice(&data, &mut compressed) - .unwrap(); - result.len() - }; - - let mut decompressed = vec![0u32; data.len()]; - let result = rust_codec - .decompress_to_slice(&compressed[..compressed_len], &mut decompressed) - .unwrap(); - assert_eq!(result, &data[..]); - } -} diff --git a/src/rust/integer_compression/composition.rs b/src/rust/integer_compression/composition.rs deleted file mode 100644 index 824334a..0000000 --- a/src/rust/integer_compression/composition.rs +++ /dev/null @@ -1,100 +0,0 @@ -use std::io::Cursor; - -use 
crate::rust::cursor::IncrementCursor; -use crate::rust::{Codec, FastPForResult, Integer}; - -/// Chains two codecs together, applying them sequentially. -/// -/// Compresses data first with `c1`, then with `c2` on the remaining data. -pub struct Composition { - c1: Codec, - c2: Codec, -} - -impl Composition { - /// Creates a new instance of the composition codec. - pub fn new(c1: C1, c2: C2) -> Self - where - C1: Into, - C2: Into, - { - Composition { - c1: c1.into(), - c2: c2.into(), - } - } -} - -impl Integer for Composition { - fn compress( - &mut self, - input: &[u32], - mut input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - if input_length == 0 { - // Return early if there is no data to compress - return Ok(()); - } - let inpos_init = input_offset.position(); - let outpos_init = output_offset.position(); - self.c1 - .compress(input, input_length, input_offset, output, output_offset)?; - if output_offset.position() == outpos_init { - output[outpos_init as usize] = 0; - output_offset.increment(); - } - input_length -= input_offset.position() as u32 - inpos_init as u32; - self.c2 - .compress(input, input_length, input_offset, output, output_offset) - } - - fn uncompress( - &mut self, - input: &[u32], - mut input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - if input_length == 0 { - // Return early if there is no data to compress - return Ok(()); - } - let final_init = input_offset.position() as u32; - self.c1 - .uncompress(input, input_length, input_offset, output, output_offset)?; - input_length -= input_offset.position() as u32 - final_init; - self.c2 - .uncompress(input, input_length, input_offset, output, output_offset) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::rust::integer_compression::fastpfor::FastPFOR; - use crate::rust::integer_compression::variable_byte::VariableByte; - - 
#[test] - fn test_composition() { - let fastpfor = FastPFOR::default(); - let vb = VariableByte::new(); - let mut comp = Composition::new(fastpfor, vb); - let input = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; - let mut output = vec![0; 10]; - let mut input_offset = Cursor::new(0); - let mut output_offset = Cursor::new(0); - let input_length = 10; - comp.compress( - &input, - input_length, - &mut input_offset, - &mut output, - &mut output_offset, - ) - .expect("Failed to compress"); - } -} diff --git a/src/rust/integer_compression/differential/mod.rs b/src/rust/integer_compression/differential/mod.rs deleted file mode 100644 index 76cfe4c..0000000 --- a/src/rust/integer_compression/differential/mod.rs +++ /dev/null @@ -1,61 +0,0 @@ -use std::ops::{Add, AddAssign}; - -/// Delta encoding/decoding utility for integer compression. -pub struct Delta; - -impl Delta { - /// Creates a new instance - #[must_use] - pub fn new() -> Delta { - Delta - } - - // C++ port as it supports any type of numeric data - // source: https://github.com/fast-pack/FastPFor/blob/49d44d94773518ef26486f7a58f8da08e8a498bb/headers/deltautil.h#L274 - /// Applies inverse delta encoding to decompress delta-encoded data in place. 
- pub fn fast_inverse_delta(data: &mut [T]) - where - T: Copy + Add + AddAssign, - { - if data.is_empty() { - return; - } - - let sz0 = (data.len() / 4) * 4; - let mut i = 1; - - if sz0 >= 4 { - let mut a = data[0]; - while i < sz0 - 4 { - a = { - data[i] += a; - data[i] - }; - a = { - data[i + 1] += a; - data[i + 1] - }; - a = { - data[i + 2] += a; - data[i + 2] - }; - a = { - data[i + 3] += a; - data[i + 3] - }; - i += 4; - } - } - - while i < data.len() { - data[i] += data[i - 1]; - i += 1; - } - } -} - -impl Default for Delta { - fn default() -> Self { - Delta::new() - } -} diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index fbf55e8..cf5d216 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -1,92 +1,143 @@ use std::array; use std::io::Cursor; -use std::num::NonZeroU32; +use bytemuck::cast_slice; use bytes::{Buf as _, BufMut as _, BytesMut}; -use crate::helpers::{GetWithErr, bits, greatest_multiple}; +use crate::helpers::{AsUsize, GetWithErr, bits, greatest_multiple}; use crate::rust::cursor::IncrementCursor; use crate::rust::integer_compression::{bitpacking, bitunpacking}; -use crate::rust::{Integer, Skippable}; -use crate::{FastPForError, FastPForResult}; - -/// Block size constant for 256 integers per block -pub const BLOCK_SIZE_256: NonZeroU32 = NonZeroU32::new(256).unwrap(); - -/// Block size constant for 128 integers per block -pub const BLOCK_SIZE_128: NonZeroU32 = NonZeroU32::new(128).unwrap(); +use crate::{BlockCodec, FastPForError}; /// Overhead cost (in bits) for storing each exception's position in the block const OVERHEAD_OF_EACH_EXCEPT: u32 = 8; -/// Default page size in number of integers -pub const DEFAULT_PAGE_SIZE: NonZeroU32 = NonZeroU32::new(65536).unwrap(); +/// Default page size in number of integers (64 KiB / 4 bytes = 16 Ki integers). +const DEFAULT_PAGE_SIZE: u32 = 65536; + +/// Type alias for [`FastPFor`] with 128-element blocks. 
+pub type FastPForBlock128 = FastPFor<128>; + +/// Type alias for [`FastPFor`] with 256-element blocks. +pub type FastPForBlock256 = FastPFor<256>; -/// Fast Patched Frame-of-Reference ([`FastPFOR`](https://github.com/lemire/FastPFor)) integer compression codec. +/// Fast Patched Frame-of-Reference ([FastPFOR](https://github.com/lemire/FastPFor)) codec. /// -/// It is useful for compressing sequences of unsigned 32-bit integers. +/// `N` is the block size (128 or 256 values per block). This struct implements +/// [`BlockCodec`] with `Block = [u32; N]`, giving compile-time guarantees that +/// only correctly-sized blocks are accepted. /// -/// The algorithm works by -/// - dividing data into blocks, -/// - determining the optimal number of bits needed for most values, and -/// - handling exceptions (values requiring more bits) separately +/// Use [`FastPForBlock128`] or [`FastPForBlock256`] as convenient type aliases. +/// +/// To compress arbitrary-length data (including a sub-block remainder), +/// wrap this in a [`CompositeCodec`](crate::CompositeCodec): +/// +/// ``` +/// # use fastpfor::{FastPFor256, AnyLenCodec}; +/// # let data = []; +/// # let mut out = vec![]; +/// let mut codec = FastPFor256::default(); +/// codec.encode(&data, &mut out).unwrap(); +/// ``` #[derive(Debug)] -pub struct FastPFOR { +pub struct FastPFor { /// Exception values indexed by bit width difference - pub exception_buffers: [Vec; 33], + exception_buffers: [Vec; 33], /// Metadata buffer for encoding/decoding - pub bytes_container: BytesMut, + bytes_container: BytesMut, /// Maximum integers per page - pub page_size: u32, + page_size: u32, /// Position trackers for exception arrays - pub data_pointers: [usize; 33], + data_pointers: [usize; 33], /// Frequency count for each bit width: /// `freqs[i]` = count of values needing exactly i bits - pub freqs: [u32; 33], + freqs: [u32; 33], /// Optimal number of bits chosen for the current block - pub optimal_bits: u8, + optimal_bits: u8, /// 
Number of exceptions that don't fit in the optimal bit width - pub exception_count: u8, + exception_count: u8, /// Maximum bit width required for any value in the block - pub max_bits: u8, - /// Integers per block (128 or 256) - pub block_size: u32, + max_bits: u8, +} + +impl Default for FastPFor +where + [u32; N]: bytemuck::Pod, +{ + fn default() -> Self { + Self::create(DEFAULT_PAGE_SIZE) + .expect("DEFAULT_PAGE_SIZE is a multiple of all valid block sizes") + } +} + +impl FastPFor<128> { + /// Creates a new `FastPForBlock128` codec with the given page size. + /// + /// Returns an error if `page_size` is not a multiple of 128. + /// Use [`Default`] for the default page size. + pub fn new(page_size: u32) -> Result { + Self::create(page_size) + } +} + +impl FastPFor<256> { + /// Creates a new `FastPForBlock256` codec with the given page size. + /// + /// Returns an error if `page_size` is not a multiple of 256. + /// Use [`Default`] for the default page size. + pub fn new(page_size: u32) -> Result { + Self::create(page_size) + } } -impl Skippable for FastPFOR { - fn headless_compress( +impl FastPFor { + fn create(page_size: u32) -> Result { + if page_size % N as u32 != 0 { + return Err(FastPForError::InvalidPageSize { + page_size, + block_size: N as u32, + }); + } + Ok(FastPFor { + bytes_container: BytesMut::with_capacity( + (3 * page_size / N as u32 + page_size) as usize, + ), + page_size, + exception_buffers: array::from_fn(|_| Vec::new()), + data_pointers: [0; 33], + freqs: [0; 33], + optimal_bits: 0, + exception_count: 0, + max_bits: 0, + }) + } + + fn compress_blocks( &mut self, input: &[u32], input_length: u32, input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - ) -> FastPForResult<()> { - let inlength = greatest_multiple(input_length, self.block_size); + ) { + let inlength = greatest_multiple(input_length, N as u32); let final_inpos = input_offset.position() as u32 + inlength; while input_offset.position() as u32 != final_inpos { let 
this_size = std::cmp::min(self.page_size, final_inpos - input_offset.position() as u32); self.encode_page(input, this_size, input_offset, output, output_offset); } - Ok(()) } - #[expect(unused_variables)] - fn headless_uncompress( + fn decode_headless_blocks( &mut self, input: &[u32], inlength: u32, input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - num: u32, - ) -> FastPForResult<()> { - if inlength == 0 && self.block_size == BLOCK_SIZE_128.get() { - // Return early if there is no data to uncompress and block size is 128 - return Ok(()); - } - let mynvalue = greatest_multiple(inlength, self.block_size); + ) -> Result<(), FastPForError> { + let mynvalue = greatest_multiple(inlength, N as u32); let final_out = output_offset.position() as u32 + mynvalue; while output_offset.position() as u32 != final_out { let this_size = @@ -95,78 +146,6 @@ impl Skippable for FastPFOR { } Ok(()) } -} - -impl Integer for FastPFOR { - fn compress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - let inlength = greatest_multiple(input_length, self.block_size); - if inlength == 0 { - // Return early if there is no data to compress - return Ok(()); - } - output[output_offset.position() as usize] = inlength; - output_offset.increment(); - self.headless_compress(input, inlength, input_offset, output, output_offset) - } - - fn uncompress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - if input_length == 0 { - // Return early if there is no data to uncompress - return Ok(()); - } - let outlength = input[input_offset.position() as usize]; - input_offset.increment(); - self.headless_uncompress( - input, - outlength, - input_offset, - output, - output_offset, - outlength, - ) - } -} - -impl Default for FastPFOR { - fn default() -> Self { - 
Self::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_256) // Use default values here - } -} - -impl FastPFOR { - /// Creates codec with specified page and block sizes. - #[must_use] - pub fn new(page_size: NonZeroU32, block_size: NonZeroU32) -> FastPFOR { - let page_size = page_size.get(); - let block_size = block_size.get(); - FastPFOR { - page_size, - block_size, - bytes_container: BytesMut::with_capacity( - (3 * page_size / block_size + page_size) as usize, - ), - exception_buffers: array::from_fn(|_| Vec::new()), - data_pointers: [0; 33], - freqs: [0; 33], - optimal_bits: 0, - exception_count: 0, - max_bits: 0, - } - } /// Encodes a page using optimal bit width per block. /// @@ -176,13 +155,13 @@ impl FastPFOR { /// - Writes header, packed data, metadata bytes, and exception values. /// /// # Arguments - /// * `thissize` - Must be multiple of `block_size` - /// * `input_offset` - Advanced by `thissize` + /// * `this_size` - Must be multiple of `block_size` + /// * `input_offset` - Advanced by `this_size` /// * `output_offset` - Advanced by compressed size fn encode_page( &mut self, input: &[u32], - thissize: u32, + this_size: u32, input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, @@ -196,7 +175,7 @@ impl FastPFOR { self.bytes_container.clear(); let mut tmp_input_offset = input_offset.position() as u32; - let final_input_offset = tmp_input_offset + thissize - self.block_size; + let final_input_offset = tmp_input_offset + this_size - N as u32; while tmp_input_offset <= final_input_offset { self.best_bit_from_data(input, tmp_input_offset); self.bytes_container.put_u8(self.optimal_bits); @@ -210,7 +189,7 @@ impl FastPFOR { let new_cap = needed.saturating_mul(2).next_multiple_of(32); self.exception_buffers[index].resize(new_cap, 0); } - for k in 0..self.block_size { + for k in 0..N as u32 { if (input[(k + tmp_input_offset) as usize] >> self.optimal_bits) != 0 { self.bytes_container.put_u8(k as u8); 
self.exception_buffers[index][self.data_pointers[index]] = @@ -219,7 +198,7 @@ impl FastPFOR { } } } - for k in (0..self.block_size).step_by(32) { + for k in (0..N as u32).step_by(32) { bitpacking::fast_pack( input, (tmp_input_offset + k) as usize, @@ -229,7 +208,7 @@ impl FastPFOR { ); tmp_output_offset += u32::from(self.optimal_bits); } - tmp_input_offset += self.block_size; + tmp_input_offset += N as u32; } input_offset.set_position(u64::from(tmp_input_offset)); output[header_pos] = tmp_output_offset - header_pos as u32; @@ -241,14 +220,10 @@ impl FastPFOR { output[tmp_output_offset as usize] = byte_size as u32; tmp_output_offset += 1; let how_many_ints = self.bytes_container.len() / 4; - - for it in output - .iter_mut() - .skip(tmp_output_offset as usize) - .take(how_many_ints) - { - *it = self.bytes_container.get_u32_le(); - } + // Match C++ memcpy: copy metadata bytes as u32s in one shot (native byte order). + let meta_u32s: &[u32] = cast_slice(self.bytes_container.chunk()); + output[tmp_output_offset as usize..][..how_many_ints] + .copy_from_slice(&meta_u32s[..how_many_ints]); tmp_output_offset += how_many_ints as u32; let mut bitmap = 0; for k in 2..=32 { @@ -289,7 +264,7 @@ impl FastPFOR { /// Analyzes frequency distribution to balance regular value bits against exception overhead. 
fn best_bit_from_data(&mut self, input: &[u32], pos: u32) { self.freqs.fill(0); - let k_end = std::cmp::min(pos + self.block_size, input.len() as u32); + let k_end = std::cmp::min(pos + N as u32, input.len() as u32); for k in pos..k_end { self.freqs[bits(input[k as usize])] += 1; } @@ -300,19 +275,19 @@ impl FastPFOR { } self.max_bits = self.optimal_bits; - let mut best_cost = u32::from(self.optimal_bits) * self.block_size; + let mut best_cost = u32::from(self.optimal_bits) * N as u32; let mut num_exceptions: u32 = 0; self.exception_count = 0; for bits in (0..self.optimal_bits).rev() { num_exceptions += self.freqs[bits as usize + 1]; - if num_exceptions == self.block_size { + if num_exceptions == N as u32 { break; } let diff = u32::from(self.max_bits - bits); let mut cost = num_exceptions * OVERHEAD_OF_EACH_EXCEPT + num_exceptions * diff - + u32::from(bits) * self.block_size + + u32::from(bits) * N as u32 + 8; if diff == 1 { cost -= num_exceptions; @@ -331,9 +306,9 @@ impl FastPFOR { /// unpacks regular values per block, patches in exceptions by position. /// /// # Arguments - /// * `thissize` - Expected decompressed integer count + /// * `this_size` - Expected decompressed integer count /// * `input_offset` - Advanced by bytes read - /// * `output_offset` - Advanced by `thissize` + /// * `output_offset` - Advanced by `this_size` #[expect(clippy::too_many_lines)] fn decode_page( &mut self, @@ -341,8 +316,8 @@ impl FastPFOR { input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - thissize: u32, - ) -> FastPForResult<()> { + this_size: u32, + ) -> Result<(), FastPForError> { let n = u32::try_from(input.len()) .map_err(|_| FastPForError::InvalidInputLength(input.len()))?; @@ -362,7 +337,7 @@ impl FastPFOR { // The C++ encoder uses a raw `memcpy` of bytes into the u32 output (no endian // conversion), and the decoder does a raw reinterpret_cast back -- both native byte // order. 
`cast_slice` is the exact Rust equivalent: a safe, zero-copy native view. - let input_bytes: &[u8] = bytemuck::cast_slice(input); + let input_bytes: &[u8] = cast_slice(input); let mut byte_pos = (inexcept as usize) .checked_mul(4) .filter(|&bp| bp <= input_bytes.len()) @@ -448,7 +423,7 @@ impl FastPFOR { let mut tmp_output_offset = output_offset.position() as u32; let mut tmp_input_offset = input_offset.position() as u32; - let run_end = thissize / self.block_size; + let run_end = this_size / N as u32; for _ in 0..run_end { let bits = input_bytes.get_val(byte_pos)?; if bits > 32 { @@ -457,15 +432,21 @@ impl FastPFOR { byte_pos += 1; let num_exceptions = input_bytes.get_val(byte_pos)?; byte_pos += 1; - for k in (0..self.block_size).step_by(32) { + for k in (0..N as u32).step_by(32) { let in_start = tmp_input_offset as usize; let out_start = (tmp_output_offset + k) as usize; - if in_start + usize::from(bits) > input.len() { - return Err(FastPForError::NotEnoughData); - } - if out_start + 32 > output.len() { - return Err(FastPForError::OutputBufferTooSmall); - } + // Both invariants are guaranteed by the caller: + // - packed data lies within [init_pos+1, init_pos+where_meta), which is + // within bounds because metadata was successfully read at init_pos+where_meta. + // - output is pre-allocated to n_blocks*N by decode_blocks. 
+ debug_assert!( + in_start + usize::from(bits) <= input.len(), + "packed data overruns input" + ); + debug_assert!( + out_start + 32 <= output.len(), + "output pre-allocated to wrong size" + ); bitunpacking::fast_unpack(input, in_start, output, out_start, bits); tmp_input_offset += u32::from(bits); } @@ -483,7 +464,7 @@ impl FastPFOR { for _ in 0..num_exceptions { let pos = input_bytes.get_val(byte_pos)?; byte_pos += 1; - if u32::from(pos) >= self.block_size { + if u32::from(pos) >= N as u32 { return Err(FastPForError::NotEnoughData); } let out_idx = tmp_output_offset as usize + pos as usize; @@ -496,7 +477,7 @@ impl FastPFOR { for _ in 0..num_exceptions { let pos = input_bytes.get_val(byte_pos)?; byte_pos += 1; - if u32::from(pos) >= self.block_size { + if u32::from(pos) >= N as u32 { return Err(FastPForError::NotEnoughData); } let out_idx = tmp_output_offset as usize + pos as usize; @@ -509,7 +490,7 @@ impl FastPFOR { } } } - tmp_output_offset += self.block_size; + tmp_output_offset += N as u32; } output_offset.set_position(u64::from(tmp_output_offset)); input_offset.set_position(u64::from(inexcept)); @@ -517,280 +498,525 @@ impl FastPFOR { } } +impl BlockCodec for FastPFor +where + [u32; N]: bytemuck::Pod, +{ + type Block = [u32; N]; + + fn encode_blocks( + &mut self, + blocks: &[[u32; N]], + out: &mut Vec, + ) -> Result<(), FastPForError> { + let n_values = (blocks.len() * N) as u32; + if blocks.is_empty() { + out.push(n_values); + return Ok(()); + } + let flat: &[u32] = cast_slice(blocks); + + let capacity = flat.len() * 2 + 1024; + let start = out.len(); + // Reserve slot for the length header, then space for compressed data. + out.resize(start + 1 + capacity, 0); + + let mut in_off = Cursor::new(0u32); + let mut out_off = Cursor::new(0u32); + + // Write length header then compress. 
+ out[start] = n_values; + self.compress_blocks( + flat, + n_values, + &mut in_off, + &mut out[start + 1..], + &mut out_off, + ); + + let written = 1 + out_off.position() as usize; + out.truncate(start + written); + Ok(()) + } + + fn decode_blocks( + &mut self, + input: &[u32], + expected_len: Option, + out: &mut Vec, + ) -> Result { + let Some((&block_n_values, rest)) = input.split_first() else { + return Err(FastPForError::NotEnoughData); + }; + if block_n_values % N as u32 != 0 { + return Err(FastPForError::NotEnoughData); + } + if let Some(expected) = expected_len { + if block_n_values != expected { + return Err(FastPForError::DecodedCountMismatch { + actual: block_n_values.as_usize(), + expected: expected.as_usize(), + }); + } + } else { + let max = Self::max_decompressed_len(input.len()); + if block_n_values.as_usize() > max { + return Err(FastPForError::NotEnoughData); + } + } + let n_blocks = block_n_values as usize / N; + if n_blocks == 0 { + return Ok(1); + } + let start = out.len(); + out.resize(start + n_blocks * N, 0); + + let mut in_off = Cursor::new(0u32); + let mut out_off = Cursor::new(0u32); + + self.decode_headless_blocks( + rest, + block_n_values, + &mut in_off, + &mut out[start..], + &mut out_off, + )?; + + let written = out_off.position() as usize; + if written != n_blocks * N { + out.truncate(start + written); + } + // +1 for the header word (block_n_values) that precedes `rest`. 
+ Ok(1 + in_off.position() as usize) + } +} + #[cfg(test)] mod tests { + use bytemuck::cast_slice_mut; + use super::*; - #[test] - fn fastpfor_test() { - let mut codec1 = FastPFOR::default(); - let mut codec2 = FastPFOR::default(); - let mut data = vec![0u32; BLOCK_SIZE_256.get() as usize]; - data[126] = -1i32 as u32; - let mut out_buf = vec![0; data.len() * 4]; - let mut input_offset = Cursor::new(0); - let mut output_offset = Cursor::new(0); - codec1 - .compress( - &data, - data.len() as u32, - &mut input_offset, - &mut out_buf, - &mut output_offset, - ) + // ── Generic helpers ─────────────────────────────────────────────────────── + + /// Encode `data` with `FastPFor`, decode it back, and return the result. + fn roundtrip(data: &[u32]) -> Vec + where + FastPFor: BlockCodec, + [u32; N]: bytemuck::Pod, + { + let blocks: &[[u32; N]] = cast_slice(data); + let mut compressed = Vec::new(); + FastPFor::::default() + .encode_blocks(blocks, &mut compressed) .unwrap(); - let comp = out_buf[..output_offset.position() as usize].to_vec(); - - let mut out_buf_uncomp = vec![0; data.len() * 4]; - input_offset = Cursor::new(0); - output_offset = Cursor::new(0); - codec2 - .uncompress( - &comp, - comp.len() as u32, - &mut input_offset, - &mut out_buf_uncomp, - &mut output_offset, - ) + let mut decoded = Vec::new(); + FastPFor::::default() + .decode_blocks(&compressed, Some((blocks.len() * N) as u32), &mut decoded) .unwrap(); - let answer = out_buf_uncomp[..output_offset.position() as usize].to_vec(); + decoded + } - assert_eq!(answer.len(), BLOCK_SIZE_256.get() as usize); - assert_eq!(data.len(), BLOCK_SIZE_256.get() as usize); - for k in 0..BLOCK_SIZE_256.get() { - assert_eq!(answer[k as usize], data[k as usize], "bug in {k}"); - } + /// Encode `data` as a single batch of `[u32; N]` blocks and return the compressed words. 
+ fn encode_block(data: &[u32]) -> Vec + where + FastPFor: BlockCodec, + [u32; N]: bytemuck::Pod, + { + let mut out = Vec::new(); + FastPFor::::default() + .encode_blocks(cast_slice(data), &mut out) + .expect("compression must succeed"); + out } - #[test] - fn fastpfor_test_128() { - let mut codec1 = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let mut codec2 = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let mut data = vec![0; BLOCK_SIZE_128.get() as usize]; - data[126] = -1i32 as u32; - let mut out_buf = vec![0; data.len() * 4]; - let mut input_offset = Cursor::new(0); - let mut output_offset = Cursor::new(0); - codec1 - .compress( - &data, - data.len() as u32, - &mut input_offset, - &mut out_buf, - &mut output_offset, - ) - .unwrap(); - let comp = out_buf[..output_offset.position() as usize].to_vec(); - - let mut out_buf_uncomp = vec![0; data.len() * 4]; - input_offset = Cursor::new(0); - output_offset = Cursor::new(0); - codec2 - .uncompress( - &comp, - comp.len() as u32, - &mut input_offset, - &mut out_buf_uncomp, - &mut output_offset, - ) - .unwrap(); - let answer = out_buf_uncomp[..output_offset.position() as usize].to_vec(); + /// Try to decode `compressed` as 1 block with `FastPFor`. + fn try_decode(compressed: &[u32]) -> Result<(), impl std::fmt::Debug> + where + FastPFor: BlockCodec, + [u32; N]: bytemuck::Pod, + { + FastPFor::::default() + .decode_blocks(compressed, Some(N as u32), &mut Vec::new()) + .map(|_| ()) + } - assert_eq!(answer.len(), BLOCK_SIZE_128.get() as usize); - assert_eq!(data.len(), BLOCK_SIZE_128.get() as usize); - for k in 0..BLOCK_SIZE_128.get() { - assert_eq!(answer[k as usize], data[k as usize], "bug in {k}"); + // ── Wire format index helpers (FastPFor block layout) ───────────────────── + // + // Full `compressed` layout (output of `encode_blocks` for a single block): + // [0] = out_length (number of encoded u32 values) + // [1] = where_meta (offset to metadata section) + // [2 .. 
where_meta] = packed regular values + // [1+where_meta] = bytesize (byte count of block metadata) + // [+1 .. +ceil(bytesize/4)] = block metadata bytes + // [+ceil(bytesize/4)+1] = bitmap + // for each set bit k (2..=32): + // [next] = size (# of packed exceptions at width k) + // [next ceil(size*k/32) words] = bit-packed exception values + + fn meta_byte_start(compressed: &[u32]) -> usize { + let where_meta = compressed[1] as usize; + (1 + where_meta + 1) * 4 + } + + fn bitmap_idx(compressed: &[u32]) -> usize { + let where_meta = compressed[1] as usize; + let bytesize_idx = 1 + where_meta; + let bytesize = compressed[bytesize_idx] as usize; + bytesize_idx + 1 + bytesize.div_ceil(4) + } + + fn find_exception_block(bytes: &[u8], meta_start: usize) -> Option<(usize, usize, usize)> { + let mut pos = meta_start; + while pos + 1 < bytes.len() { + if bytes[pos + 1] > 0 { + return Some((pos, pos + 1, pos + 2)); + } + pos += 2; } + None + } + + /// Compressed data containing at least one non-trivial exception group. + fn compressed_with_exceptions() -> (Vec, Vec) { + let data: Vec = (0..256u32) + .map(|i| if i % 2 == 0 { 1u32 << 30 } else { 3 }) + .collect(); + (encode_block::<256>(&data), data) + } + + /// Compressed data whose exception group uses bit-width difference == 1 + /// (`maxbits - optimal_bits == 1`), triggering the `index == 1` branch. 
+ fn compressed_with_index1_exceptions() -> (Vec, Vec) { + let mut data = vec![1u32; 256]; + data[0] = 3; // needs 2 bits → encoder picks optimal_bits=1, maxbits=2, index=1 + (encode_block::<256>(&data), data) } + // ── Round-trip tests ────────────────────────────────────────────────────── + #[test] - fn test_spurious() { - let mut c = FastPFOR::default(); - let x = vec![0; 1024]; - let mut y = vec![0; 0]; - let mut i0 = Cursor::new(0); - let mut i1 = Cursor::new(0); - for inlength in 0..32 { - c.compress(&x, inlength, &mut i0, &mut y, &mut i1).unwrap(); - assert_eq!(0, i1.position()); - } + fn fastpfor_test() { + let mut data = vec![0u32; 256]; + data[126] = u32::MAX; + assert_eq!(roundtrip::<256>(&data), data); } #[test] - fn test_zero_in_zero_out() { - let mut c = FastPFOR::default(); - let x = vec![0; 0]; - let mut y = vec![0; 0]; - let mut i0 = Cursor::new(0); - let mut i1 = Cursor::new(0); - c.compress(&x, 0, &mut i0, &mut y, &mut i1).unwrap(); - assert_eq!(0, i1.position()); - - // Needs uncompress - let mut out = vec![0; 0]; - let mut outpos = Cursor::new(0); - c.uncompress(&y, 0, &mut i1, &mut out, &mut outpos).unwrap(); - assert_eq!(0, outpos.position()); - } - - // The following tests are ported from C++ - fn run_codec_test(codec: &mut FastPFOR, data: &[u32]) { - let mut compressed = vec![0u32; data.len() * 2]; - let mut decompressed = vec![0u32; data.len()]; - let len = data.len() as u32; - let mut input_offset = Cursor::new(0); - let mut output_offset = Cursor::new(0); - - codec - .compress( - data, - len, - &mut input_offset, - &mut compressed, - &mut output_offset, - ) - .expect("Compression failed"); - - input_offset.set_position(0); - output_offset.set_position(0); - - codec - .uncompress( - &compressed, - len, - &mut input_offset, - &mut decompressed, - &mut output_offset, - ) - .expect("Decompression failed"); + fn fastpfor_test_128() { + let mut data = vec![0u32; 128]; + data[126] = u32::MAX; + assert_eq!(roundtrip::<128>(&data), data); + } - 
for (i, &original) in data.iter().enumerate() { - assert_eq!( - decompressed[i], original, - "Mismatch at index {}: {} != {}", - i, decompressed[i], original - ); - } + #[test] + fn test_empty_blocks_ok() { + // Empty input encodes to length header [0] (matches C++ FastPFor) and decodes cleanly. + let mut enc = Vec::new(); + FastPForBlock256::default() + .encode_blocks(&[], &mut enc) + .unwrap(); + assert_eq!(enc, [0]); + let mut dec = Vec::new(); + FastPForBlock256::default() + .decode_blocks(&enc, Some(0), &mut dec) + .unwrap(); + assert_eq!(dec, []); } + // Tests ported from C++ #[test] fn test_constant_sequence() { - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let data = vec![42u32; 65536]; - run_codec_test(&mut codec, &data); + assert_eq!(roundtrip::<128>(&vec![42u32; 65536]), vec![42u32; 65536]); } #[test] fn test_alternating_sequence() { - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let data: Vec<_> = (0..65536).map(|i| u32::from(i % 2 != 0)).collect(); // Alternating 0s and 1s - run_codec_test(&mut codec, &data); + let data: Vec<_> = (0..65536u32).map(|i| u32::from(i % 2 != 0)).collect(); + assert_eq!(roundtrip::<128>(&data), data); } #[test] fn test_large_numbers() { - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let data: Vec = (0..65536).map(|i| i + (1u32 << 30)).collect(); // Large numbers near 2^30 - run_codec_test(&mut codec, &data); - } - - // The following tests fail. 
It is not clear if this is due the translation or there's a bug - // Fails - // #[test] - // fn test_powers_of_two() { - // let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - // let data: Vec = (0..32).map(|i| 1 << i).collect(); // Powers of 2 - // run_codec_test(&mut codec, &data); - // } - - // Fails - // #[test] - // fn test_large_random_sequence() { - // let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - // let data = generate_random_data(100000); // Large random data set - // run_codec_test(&mut codec, &data); - // } - - // Fails - // #[test] - // fn test_edge_cases() { - // let mut codec = fastpfor::FastPFOR::new(fastpfor::DEFAULT_PAGE_SIZE, fastpfor::BLOCK_SIZE_128); - // let data = vec![u32::MIN, u32::MAX, 0, 1, 42, u32::MAX - 1]; // Edge cases - // run_codec_test(&mut codec, &data); - // } - - // Fails - // Utility to generate random data - // fn generate_random_data(size: usize) -> Vec { - // let mut rng = thread_rng(); - // (0..size).map(|_| rng.gen()).collect() - // } - - /// Compress one block of data and return the compressed words. - fn compress_one_block(data: &[u32]) -> Vec { - let mut codec = FastPFOR::default(); - let mut compressed = vec![0u32; data.len() * 4]; - let mut in_off = Cursor::new(0); - let mut out_off = Cursor::new(0); - codec - .compress( - data, - data.len() as u32, - &mut in_off, - &mut compressed, - &mut out_off, - ) - .unwrap(); - compressed[..out_off.position() as usize].to_vec() + let data: Vec = (0..65536u32).map(|i| i + (1u32 << 30)).collect(); + assert_eq!(roundtrip::<128>(&data), data); } #[test] - fn test_truncated_input_returns_error() { - let data = vec![42u32; BLOCK_SIZE_256.get() as usize]; - let compressed = compress_one_block(&data); + fn cursor_api_roundtrip() { + assert_eq!(roundtrip::<256>(&vec![42u32; 256]), vec![42u32; 256]); + } + + #[test] + fn headless_compress_unfit_pagesize() { + // 640 values with 128-block codec spans two pages (512 + 128), exercising the loop. 
+ let input: Vec = (0..640u32).collect(); + assert_eq!(roundtrip::<128>(&input), input); + } + + #[test] + fn exception_value_vector_resizes() { + // Alternating large/small values trigger exception-buffer resizing across pages. + let input: Vec = (0..1024u32) + .map(|i| if i % 2 == 0 { 1 << 30 } else { 3 }) + .collect(); + assert_eq!(roundtrip::<128>(&input), input); + } - // Try decompressing with progressively shorter inputs — all must error, never panic. + // ── Error-path tests: truncated / corrupted compressed data ────────────── + // + // Each test: compress valid data → surgically corrupt one field → + // assert `Err` is returned rather than a panic. + + #[test] + fn test_truncated_input_returns_error() { + let compressed = encode_block::<256>(&vec![42u32; 256]); for truncated_len in [1, 2, compressed.len() / 2, compressed.len() - 1] { - let truncated = &compressed[..truncated_len]; - let mut codec = FastPFOR::default(); - let mut out = vec![0u32; data.len()]; - let mut in_off = Cursor::new(0); - let mut out_off = Cursor::new(0); - let result = codec.uncompress( - truncated, - truncated.len() as u32, - &mut in_off, - &mut out, - &mut out_off, - ); assert!( - result.is_err(), - "expected error for truncated len {truncated_len}, got Ok" + try_decode::<256>(&compressed[..truncated_len]).is_err(), + "expected error for truncated len {truncated_len}" ); } } #[test] fn test_corrupted_where_meta_returns_error() { - let data = vec![1u32; BLOCK_SIZE_256.get() as usize]; - let mut compressed = compress_one_block(&data); + let mut compressed = encode_block::<256>(&vec![1u32; 256]); + // word [1] = where_meta; point it past the end + compressed[1] = u32::MAX; + assert!(try_decode::<256>(&compressed).is_err()); + } - // The first word after the length header is `where_meta` — point it far past the end. 
- if compressed.len() > 1 { - compressed[1] = u32::MAX; - } + #[test] + fn uncompress_zero_input_length_err() { + // Truly empty input (no header word at all) is invalid — C++ would crash reading *in. + assert!( + FastPForBlock256::default() + .decode_blocks(&[], None, &mut Vec::new()) + .is_err() + ); + } - let mut codec = FastPFOR::default(); - let mut out = vec![0u32; data.len()]; - let mut in_off = Cursor::new(0); - let mut out_off = Cursor::new(0); - let result = codec.uncompress( - &compressed, - compressed.len() as u32, - &mut in_off, - &mut out, - &mut out_off, + #[test] + fn headless_uncompress_zero_inlength_128_ok() { + FastPForBlock128::default() + .decode_headless_blocks( + &[], + 0, + &mut Cursor::new(0u32), + &mut [], + &mut Cursor::new(0u32), + ) + .expect("zero-length decompress must succeed"); + } + + #[test] + fn decode_where_meta_missing() { + // Only an out_length word, no where_meta follows → must error. + assert!(try_decode::<256>(&[256u32]).is_err()); + } + + #[test] + fn decode_where_meta_out_of_bounds() { + let (mut compressed, _) = compressed_with_exceptions(); + compressed[1] = u32::MAX; + assert!(try_decode::<256>(&compressed).is_err()); + } + + #[test] + fn decode_where_meta_overflow() { + // FIXME: this test should be modified to use public API + let (compressed, _) = compressed_with_exceptions(); + let mut padded = vec![0u32]; + padded.extend_from_slice(&compressed); + padded[2] = u32::MAX; + let out_length = padded[1]; + assert!( + FastPForBlock256::default() + .decode_headless_blocks( + &padded, + out_length, + &mut Cursor::new(1u32), + &mut vec![0u32; 320], + &mut Cursor::new(0u32), + ) + .is_err() ); - assert!(result.is_err(), "expected error for corrupted where_meta"); + } + + #[test] + fn decode_bytesize_out_of_bounds() { + let (mut compressed, _) = compressed_with_exceptions(); + compressed[1] = compressed.len() as u32 - 1; + assert!(try_decode::<256>(&compressed).is_err()); + } + + #[test] + fn 
decode_bytesize_length_overflow() { + let (mut compressed, _) = compressed_with_exceptions(); + let bytesize_idx = 1 + compressed[1] as usize; + compressed[bytesize_idx] = u32::MAX - 3; + assert!(try_decode::<256>(&compressed).is_err()); + } + + #[test] + fn decode_bitmap_out_of_bounds() { + let (mut compressed, _) = compressed_with_exceptions(); + let bytesize_idx = 1 + compressed[1] as usize; + let remaining = (compressed.len() - bytesize_idx - 1) as u32; + compressed[bytesize_idx] = remaining * 4; + assert!(try_decode::<256>(&compressed).is_err()); + } + + #[test] + fn decode_exception_size_exceeds_page_size() { + let (mut compressed, _) = compressed_with_exceptions(); + let size_idx = bitmap_idx(&compressed) + 1; + compressed[size_idx] = DEFAULT_PAGE_SIZE + 1; + assert!(try_decode::<256>(&compressed).is_err()); + } + + #[test] + fn decode_exception_partial_group_not_enough_data() { + let (compressed, _) = compressed_with_exceptions(); + assert!(try_decode::<256>(&compressed[..compressed.len() - 2]).is_err()); + } + + #[test] + fn decode_block_b_too_large() { + let (mut compressed, _) = compressed_with_exceptions(); + let start = meta_byte_start(&compressed); + cast_slice_mut::<_, u8>(&mut compressed)[start] = 33; + assert!(try_decode::<256>(&compressed).is_err()); + } + + #[test] + fn decode_packed_region_truncated() { + let (compressed, _) = compressed_with_exceptions(); + let where_meta = compressed[1] as usize; + assert!(try_decode::<256>(&compressed[..where_meta]).is_err()); + } + + #[test] + fn decode_exception_maxbits_too_large() { + let (mut compressed, _) = compressed_with_exceptions(); + let start = meta_byte_start(&compressed); + let bytes: &mut [u8] = cast_slice_mut(&mut compressed); + if let Some((_, _, mb_off)) = find_exception_block(bytes, start) { + bytes[mb_off] = 33; + } + assert!(try_decode::<256>(&compressed).is_err()); + } + + #[test] + fn decode_exception_index_underflow() { + let (mut compressed, _) = compressed_with_exceptions(); + let 
start = meta_byte_start(&compressed); + let bytes: &mut [u8] = cast_slice_mut(&mut compressed); + if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { + bytes[mb_off] = bytes[bb_off].saturating_sub(1); + } + assert!(try_decode::<256>(&compressed).is_err()); + } + + #[test] + fn decode_exception_index_zero() { + let (mut compressed, _) = compressed_with_exceptions(); + let start = meta_byte_start(&compressed); + let bytes: &mut [u8] = cast_slice_mut(&mut compressed); + if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { + bytes[mb_off] = bytes[bb_off]; + } + assert!(try_decode::<256>(&compressed).is_err()); + } + + #[test] + fn decode_index1_branch_valid() { + let (compressed, data) = compressed_with_index1_exceptions(); + let mut out = Vec::new(); + FastPForBlock256::default() + .decode_blocks(&compressed, Some(256), &mut out) + .expect("decompression of index-1 data must succeed"); + assert_eq!(out, data); + } + + #[test] + fn decode_index1_pos_byte_missing() { + let (compressed, _) = compressed_with_index1_exceptions(); + assert!(try_decode::<256>(&compressed[..compressed.len() - 1]).is_err()); + } + + #[test] + fn decode_index1_pos_out_of_block() { + let mut data = vec![1u32; 128]; + data[0] = 3; + let mut buf = encode_block::<128>(&data); + let start = meta_byte_start(&buf); + let bytes: &mut [u8] = cast_slice_mut(&mut buf); + if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { + if bytes[mb_off].wrapping_sub(bytes[bb_off]) == 1 && mb_off + 1 < bytes.len() { + bytes[mb_off + 1] = 200; // position 200 >= block_size 128 + } + } + assert!(try_decode::<128>(&buf).is_err()); + } + + #[test] + fn decode_exception_pos_byte_missing() { + let (compressed, _) = compressed_with_exceptions(); + assert!(try_decode::<256>(&compressed[..compressed.len() - 1]).is_err()); + } + + #[test] + fn decode_exception_pos_out_of_block() { + let data: Vec = (0..128u32) + .map(|i| if i % 4 == 0 { 1u32 << 30 } else { 1 }) + 
.collect(); + let mut buf = encode_block::<128>(&data); + let start = meta_byte_start(&buf); + let bytes: &mut [u8] = cast_slice_mut(&mut buf); + if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { + if bytes[mb_off].wrapping_sub(bytes[bb_off]) > 1 && mb_off + 1 < bytes.len() { + bytes[mb_off + 1] = 200; // position 200 >= block_size 128 + } + } + assert!(try_decode::<128>(&buf).is_err()); + } + + /// `decode_blocks` with `expected_len: None` and header=0 returns `Ok` with empty output. + #[test] + fn decode_blocks_header_only_input() { + // Input with just the length header [0]: no blocks to decode. + let input = vec![0u32]; + let mut out = Vec::new(); + FastPForBlock256::default() + .decode_blocks(&input, None, &mut out) + .unwrap(); + assert!(out.is_empty()); + } + + #[test] + fn decode_exception_unpopulated_data_to_be_packed() { + // Hand-crafted compressed stream: out_length=256, where_meta=9, + // 8 packed zero words (bits=1), bytesize=4, + // meta=[bits=1, cexcept=1, maxbits=3, pos=0], bitmap=0. + // The exception buffer is never filled, so decoding must error. + let compressed: Vec = [ + 256u32, // out_length + 9, // where_meta + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 8 packed words + 4, // bytesize = 4 bytes + u32::from_le_bytes([1, 1, 3, 0]), // meta: bits=1, cexcept=1, maxbits=3, pos=0 + 0, // bitmap=0 + ] + .into(); + assert!(try_decode::<256>(&compressed).is_err()); } } diff --git a/src/rust/integer_compression/integer_codec.rs b/src/rust/integer_compression/integer_codec.rs deleted file mode 100644 index 4ab11fd..0000000 --- a/src/rust/integer_compression/integer_codec.rs +++ /dev/null @@ -1,39 +0,0 @@ -use std::io::Cursor; - -use crate::rust::FastPForResult; - -/// Integer compression/decompression interface with length headers. -/// -/// Implementations write output length as a header before compressed data, -/// enabling self-describing compressed streams. -pub trait Integer { - /// Compresses integers with length header. 
- /// - /// # Arguments - /// * `input_length` - Number of integers to compress - /// * `input_offset` - Read position cursor, advanced by `input_length` - /// * `output_offset` - Write position cursor, advanced by bytes written - fn compress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [T], - output_offset: &mut Cursor, - ) -> FastPForResult<()>; - - /// Decompresses integers using length header. - /// - /// # Arguments - /// * `input_length` - Total compressed data length - /// * `input_offset` - Read position cursor, advanced by bytes read - /// * `output_offset` - Write position cursor, advanced by integers written - fn uncompress( - &mut self, - input: &[T], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()>; -} diff --git a/src/rust/integer_compression/just_copy.rs b/src/rust/integer_compression/just_copy.rs index 92cadb7..55e996d 100644 --- a/src/rust/integer_compression/just_copy.rs +++ b/src/rust/integer_compression/just_copy.rs @@ -1,6 +1,6 @@ -use std::io::Cursor; - -use crate::rust::{FastPForError, FastPForResult, Integer, Skippable}; +use crate::FastPForError; +use crate::codec::AnyLenCodec; +use crate::helpers::AsUsize; /// A no-op codec that copies data without compression. 
/// @@ -22,103 +22,63 @@ impl Default for JustCopy { } } -impl Skippable for JustCopy { - fn headless_compress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - let start_input = input_offset.position() as usize; - let end_input = start_input + input_length as usize; - let start_output = output_offset.position() as usize; - let end_output = start_output + input_length as usize; - - if end_input > input.len() { - return Err(FastPForError::NotEnoughData); - } - if end_output > output.len() { - return Err(FastPForError::OutputBufferTooSmall); - } - - output[start_output..end_output].copy_from_slice(&input[start_input..end_input]); - - input_offset.set_position(end_input as u64); - output_offset.set_position(end_output as u64); - +impl AnyLenCodec for JustCopy { + fn encode(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError> { + out.extend_from_slice(input); Ok(()) } - fn headless_uncompress( + fn decode( &mut self, input: &[u32], - #[expect(unused)] input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - num: u32, - ) -> FastPForResult<()> { - let start_input = input_offset.position() as usize; - let end_input = start_input + num as usize; - let start_output = output_offset.position() as usize; - let end_output = start_output + num as usize; - - if end_input > input.len() { - return Err(FastPForError::NotEnoughData); - } - if end_output > output.len() { - return Err(FastPForError::OutputBufferTooSmall); + out: &mut Vec, + expected_len: Option, + ) -> Result<(), FastPForError> { + if let Some(expected) = expected_len { + let expected = expected.is_valid_expected(Self::max_decompressed_len(input.len()))?; + input.len().is_decoded_mismatch(expected)?; } - - output[start_output..end_output].copy_from_slice(&input[start_input..end_input]); - - input_offset.set_position(end_input as u64); - 
output_offset.set_position(end_output as u64); + out.extend_from_slice(input); Ok(()) } } -impl Integer for JustCopy { - fn compress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - self.headless_compress(input, input_length, input_offset, output, output_offset) +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn justcopy_default_and_roundtrip() { + // Exercise the Default impl explicitly. + let mut codec = ::default(); + let data = vec![1u32, 2, 3]; + let mut compressed = Vec::new(); + codec.encode(&data, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + codec.decode(&compressed, &mut decoded, None).unwrap(); + assert_eq!(decoded, data); } - fn uncompress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - let start_input = input_offset.position() as usize; - let end_input = start_input + input_length as usize; - let start_output = output_offset.position() as usize; - let end_output = start_output + input_length as usize; - - // Ensure we don't exceed the slice bounds - if end_input > input.len() { - return Err(FastPForError::NotEnoughData); - } - if end_output > output.len() { - return Err(FastPForError::OutputBufferTooSmall); - } - - output[start_output..end_output].copy_from_slice(&input[start_input..end_input]); - - // Update the cursor positions - input_offset.set_position(end_input as u64); - output_offset.set_position(end_output as u64); + #[test] + fn justcopy_decode_with_expected_len_ok() { + let data = vec![1u32, 2, 3]; + let mut out = Vec::new(); + JustCopy::new().decode(&data, &mut out, Some(3)).unwrap(); + assert_eq!(out, data); + } - Ok(()) + #[test] + fn justcopy_decode_expected_len_mismatch_errors() { + let data = vec![1u32, 2, 3]; + let err = JustCopy::new() + .decode(&data, &mut Vec::new(), Some(2)) + 
.unwrap_err(); + assert!(matches!( + err, + FastPForError::DecodedCountMismatch { + actual: 3, + expected: 2 + } + )); } } diff --git a/src/rust/integer_compression/mod.rs b/src/rust/integer_compression/mod.rs index 26bc651..7aed001 100644 --- a/src/rust/integer_compression/mod.rs +++ b/src/rust/integer_compression/mod.rs @@ -1,10 +1,5 @@ pub mod bitpacking; pub mod bitunpacking; -pub mod codec; -pub mod composition; -pub mod differential; pub mod fastpfor; -pub mod integer_codec; pub mod just_copy; -pub mod skippable_codec; pub mod variable_byte; diff --git a/src/rust/integer_compression/skippable_codec.rs b/src/rust/integer_compression/skippable_codec.rs deleted file mode 100644 index c8b64a8..0000000 --- a/src/rust/integer_compression/skippable_codec.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::io::Cursor; - -use crate::rust::{Codec, FastPForResult}; - -/// Headerless compression/decompression for seekable streams. -/// -/// Methods operate without length headers, requiring external length tracking. -/// Useful for random access and pre-sized buffer scenarios. -pub trait Skippable { - /// Compresses integers without writing length header. - /// - /// # Arguments - /// * `input_length` - Number of integers to compress - /// * `input_offset` - Read position cursor, advanced by `input_length` - /// * `output_offset` - Write position cursor, advanced by bytes written - fn headless_compress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()>; - - /// Decompresses integers without reading length header. 
- /// - /// # Arguments - /// * `input_length` - Compressed data length - /// * `input_offset` - Read position cursor, advanced by bytes read - /// * `output_offset` - Write position cursor, advanced by `num` - /// * `num` - Expected number of integers to decompress - fn headless_uncompress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - num: u32, - ) -> FastPForResult<()>; -} - -impl Skippable for Codec { - fn headless_compress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - match self { - Codec::FastPFor(fastpfor) => { - fastpfor.headless_compress(input, input_length, input_offset, output, output_offset) - } - Codec::VariableByte(vb) => { - vb.headless_compress(input, input_length, input_offset, output, output_offset) - } - Codec::JustCopy(jc) => { - jc.headless_compress(input, input_length, input_offset, output, output_offset) - } - } - } - - fn headless_uncompress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - num: u32, - ) -> FastPForResult<()> { - match self { - Codec::FastPFor(fastpfor) => fastpfor.headless_uncompress( - input, - input_length, - input_offset, - output, - output_offset, - num, - ), - Codec::VariableByte(vb) => vb.headless_uncompress( - input, - input_length, - input_offset, - output, - output_offset, - num, - ), - Codec::JustCopy(jc) => jc.headless_uncompress( - input, - input_length, - input_offset, - output, - output_offset, - num, - ), - } - } -} diff --git a/src/rust/integer_compression/variable_byte.rs b/src/rust/integer_compression/variable_byte.rs index 26e7277..345d2c9 100644 --- a/src/rust/integer_compression/variable_byte.rs +++ b/src/rust/integer_compression/variable_byte.rs @@ -2,9 +2,10 @@ use std::io::Cursor; use bytemuck::{cast_slice, 
cast_slice_mut}; +use crate::FastPForError; +use crate::codec::AnyLenCodec; use crate::helpers::AsUsize; use crate::rust::cursor::IncrementCursor; -use crate::rust::{FastPForError, FastPForResult, Integer, Skippable}; /// Variable-byte encoding codec for integer compression. #[derive(Debug)] @@ -30,67 +31,60 @@ impl VariableByte { pub fn new() -> VariableByte { VariableByte } -} - -// Implemented for consistency with other codecs -impl Default for VariableByte { - fn default() -> Self { - VariableByte::new() - } -} -impl Skippable for VariableByte { - fn headless_compress( - &mut self, + /// Compress `input_length` u32 values from `input[input_offset..]` into + /// `output[output_offset..]` as packed variable-byte u8 values (stored in + /// u32 words, padded to 4-byte alignment with `0xFF`). + #[allow(clippy::unnecessary_wraps)] + fn compress_into_slice( input: &[u32], input_length: u32, input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - ) -> FastPForResult<()> { + ) -> Result<(), FastPForError> { if input_length == 0 { - // Return early if there is no data to compress return Ok(()); } - // Get byte view of the output buffer let output_start = output_offset.position() as usize; let output_bytes: &mut [u8] = &mut cast_slice_mut::(output)[output_start * 4..]; + // Lemire format: last byte has high bit set (c >= 128 means end of value). 
let mut byte_pos = 0; for k in input_offset.position()..(input_offset.position() + u64::from(input_length)) { let val = input[k as usize]; if val < (1 << 7) { - output_bytes[byte_pos] = Self::extract_7bits::<0>(val); + output_bytes[byte_pos] = Self::extract_7bits::<0>(val) | (1 << 7); byte_pos += 1; } else if val < (1 << 14) { - output_bytes[byte_pos] = Self::extract_7bits::<0>(val) | (1 << 7); - output_bytes[byte_pos + 1] = Self::extract_7bits_maskless::<1>(val); + output_bytes[byte_pos] = Self::extract_7bits::<0>(val); + output_bytes[byte_pos + 1] = Self::extract_7bits_maskless::<1>(val) | (1 << 7); byte_pos += 2; } else if val < (1 << 21) { - output_bytes[byte_pos] = Self::extract_7bits::<0>(val) | (1 << 7); - output_bytes[byte_pos + 1] = Self::extract_7bits::<1>(val) | (1 << 7); - output_bytes[byte_pos + 2] = Self::extract_7bits_maskless::<2>(val); + output_bytes[byte_pos] = Self::extract_7bits::<0>(val); + output_bytes[byte_pos + 1] = Self::extract_7bits::<1>(val); + output_bytes[byte_pos + 2] = Self::extract_7bits_maskless::<2>(val) | (1 << 7); byte_pos += 3; } else if val < (1 << 28) { - output_bytes[byte_pos] = Self::extract_7bits::<0>(val) | (1 << 7); - output_bytes[byte_pos + 1] = Self::extract_7bits::<1>(val) | (1 << 7); - output_bytes[byte_pos + 2] = Self::extract_7bits::<2>(val) | (1 << 7); - output_bytes[byte_pos + 3] = Self::extract_7bits_maskless::<3>(val); + output_bytes[byte_pos] = Self::extract_7bits::<0>(val); + output_bytes[byte_pos + 1] = Self::extract_7bits::<1>(val); + output_bytes[byte_pos + 2] = Self::extract_7bits::<2>(val); + output_bytes[byte_pos + 3] = Self::extract_7bits_maskless::<3>(val) | (1 << 7); byte_pos += 4; } else { - output_bytes[byte_pos] = Self::extract_7bits::<0>(val) | (1 << 7); - output_bytes[byte_pos + 1] = Self::extract_7bits::<1>(val) | (1 << 7); - output_bytes[byte_pos + 2] = Self::extract_7bits::<2>(val) | (1 << 7); - output_bytes[byte_pos + 3] = Self::extract_7bits::<3>(val) | (1 << 7); - output_bytes[byte_pos + 
4] = Self::extract_7bits_maskless::<4>(val); + output_bytes[byte_pos] = Self::extract_7bits::<0>(val); + output_bytes[byte_pos + 1] = Self::extract_7bits::<1>(val); + output_bytes[byte_pos + 2] = Self::extract_7bits::<2>(val); + output_bytes[byte_pos + 3] = Self::extract_7bits::<3>(val); + output_bytes[byte_pos + 4] = Self::extract_7bits_maskless::<4>(val) | (1 << 7); byte_pos += 5; } } - // Pad to 4-byte alignment with 0xFF + // Pad to 4-byte alignment with 0 (lemire uses 0, not 0xFF) while byte_pos % 4 != 0 { - output_bytes[byte_pos] = 0xFF; + output_bytes[byte_pos] = 0; byte_pos += 1; } @@ -100,68 +94,40 @@ impl Skippable for VariableByte { Ok(()) } - #[expect(unused_variables)] - fn headless_uncompress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - num: u32, - ) -> FastPForResult<()> { - Err(FastPForError::Unimplemented) - } -} - -impl Integer for VariableByte { - fn compress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - self.headless_compress(input, input_length, input_offset, output, output_offset) - } - - fn uncompress( - &mut self, + /// Decompress `input_length` u32 words of variable-byte data from + /// `input[input_offset..]` into `output[output_offset..]`. 
+ fn decompress_from_u32_slice( input: &[u32], input_length: u32, input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - ) -> FastPForResult<()> { + ) -> Result<(), FastPForError> { if input_length == 0 { return Ok(()); } - // Convert u32 array to byte view - let byte_length = (input_length.as_usize()) * 4; + let byte_length = input_length.as_usize() * 4; let input_start = input_offset.position() as usize; - // Create a byte slice view of the input let input_bytes: &[u8] = &cast_slice::(input)[input_start * 4..input_start * 4 + byte_length]; let mut byte_pos = 0; let mut tmp_outpos = output_offset.position() as usize; + // Lemire format: high bit set (c >= 128) means last byte of value. // Fast path: process while we have at least 10 bytes remaining while byte_pos + 10 <= byte_length { let mut v: u32 = 0; let mut bytes_read = 0; - // Decode up to 5 bytes for a u32 value for i in 0..5 { let c = input_bytes[byte_pos + i]; if i < 4 { - // For bytes 0-3, use 7 bits each v |= u32::from(c & 0x7F) << (i * 7); - if c < 128 { + if c >= 128 { bytes_read = i + 1; break; } @@ -180,7 +146,7 @@ impl Integer for VariableByte { tmp_outpos += 1; } - // Slow path: process remaining bytes + // Slow path: process remaining bytes (lemire: c >= 128 = last byte) while byte_pos < byte_length { let mut v: u32 = 0; let mut decoded = false; @@ -192,7 +158,7 @@ impl Integer for VariableByte { byte_pos += 1; if i < 4 { v |= u32::from(c & 0x7F) << (i * 7); - if c < 128 { + if c >= 128 { decoded = true; break; } @@ -216,19 +182,19 @@ impl Integer for VariableByte { Ok(()) } -} -impl Integer for VariableByte { - fn compress( - &mut self, + /// Compress `input_length` u32 values into an `i8` slice using sign-bit + /// continuation encoding (negative i8 = more bytes follow). 
+ #[cfg(test)] + #[allow(clippy::unnecessary_wraps)] + fn compress_to_i8_slice( input: &[u32], input_length: u32, input_offset: &mut Cursor, output: &mut [i8], output_offset: &mut Cursor, - ) -> FastPForResult<()> { + ) -> Result<(), FastPForError> { if input_length == 0 { - // Return early if there is no data to compress return Ok(()); } let mut out_pos_tmp = output_offset.position(); @@ -275,14 +241,18 @@ impl Integer for VariableByte { input_offset.add(input_length); Ok(()) } - fn uncompress( - &mut self, + + /// Decompress `input_length` i8 values (sign-bit continuation encoding) + /// into u32 output. + #[cfg(test)] + #[allow(clippy::unnecessary_wraps)] + fn decompress_from_i8_slice( input: &[i8], input_length: u32, input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - ) -> FastPForResult<()> { + ) -> Result<(), FastPForError> { let mut p = input_offset.position() as u32; let final_p = input_offset.position() as u32 + input_length; let mut tmp_outpos = output_offset.position(); @@ -290,7 +260,6 @@ impl Integer for VariableByte { while p < final_p { let mut v = i32::from(input[p.as_usize()] & 0x7F); if input[p.as_usize()] >= 0 { - // High bit is NOT set, this is the last byte p += 1; output[tmp_outpos as usize] = v as u32; tmp_outpos += 1; @@ -299,7 +268,6 @@ impl Integer for VariableByte { v |= i32::from(input[p.as_usize() + 1] & 0x7F) << 7; if input[p.as_usize() + 1] >= 0 { - // High bit is NOT set, this is the last byte p += 2; output[tmp_outpos as usize] = v as u32; tmp_outpos += 1; @@ -308,7 +276,6 @@ impl Integer for VariableByte { v |= i32::from(input[p.as_usize() + 2] & 0x7F) << 14; if input[p.as_usize() + 2] >= 0 { - // High bit is NOT set, this is the last byte p += 3; output[tmp_outpos as usize] = v as u32; tmp_outpos += 1; @@ -317,7 +284,6 @@ impl Integer for VariableByte { v |= i32::from(input[p.as_usize() + 3] & 0x7F) << 21; if input[p.as_usize() + 3] >= 0 { - // High bit is NOT set, this is the last byte p += 4; 
output[tmp_outpos as usize] = v as u32; tmp_outpos += 1; @@ -335,17 +301,76 @@ impl Integer for VariableByte { } } +impl Default for VariableByte { + fn default() -> Self { + VariableByte::new() + } +} + +impl AnyLenCodec for VariableByte { + fn encode(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError> { + let capacity = input.len() * 2 + 4; + let start = out.len(); + out.resize(start + capacity, 0); + let mut in_off = Cursor::new(0u32); + let mut out_off = Cursor::new(0u32); + VariableByte::compress_into_slice( + input, + input.len() as u32, + &mut in_off, + &mut out[start..], + &mut out_off, + )?; + let written = out_off.position() as usize; + out.truncate(start + written); + Ok(()) + } + + fn decode( + &mut self, + input: &[u32], + out: &mut Vec, + expected_len: Option, + ) -> Result<(), FastPForError> { + let capacity = if let Some(expected) = expected_len { + expected.is_valid_expected(Self::max_decompressed_len(input.len()))? + } else { + input.len() * 4 + }; + let start = out.len(); + out.reserve(capacity); + out.resize(start + capacity, 0); + let mut in_off = Cursor::new(0u32); + let mut out_off = Cursor::new(0u32); + VariableByte::decompress_from_u32_slice( + input, + input.len() as u32, + &mut in_off, + &mut out[start..], + &mut out_off, + )?; + let written = out_off.position() as usize; + out.truncate(start + written); + if let Some(n) = expected_len { + written.is_decoded_mismatch(n)?; + } + Ok(()) + } +} + #[cfg(test)] mod tests { + use std::collections::hash_map::RandomState; + use std::hash::{BuildHasher, Hasher}; + use super::*; fn verify_u32_roundtrip(input: &[u32]) { - let mut vb = VariableByte::new(); - let mut encoded: Vec = vec![0; input.len() * 2]; + let mut encoded: Vec = vec![0; input.len() * 2 + 1]; let mut input_offset = Cursor::new(0); let mut output_offset = Cursor::new(0); - vb.compress( + VariableByte::compress_into_slice( input, input.len() as u32, &mut input_offset, @@ -359,7 +384,7 @@ mod tests { let mut 
input_offset = Cursor::new(0); let mut output_offset = Cursor::new(0); - vb.uncompress( + VariableByte::decompress_from_u32_slice( &encoded, encoded_len, &mut input_offset, @@ -377,12 +402,11 @@ mod tests { } fn verify_i8_roundtrip(input: &[u32]) { - let mut vb = VariableByte::new(); let mut encoded: Vec = vec![0; input.len() * 10]; let mut input_offset = Cursor::new(0); let mut output_offset = Cursor::new(0); - vb.compress( + VariableByte::compress_to_i8_slice( input, input.len() as u32, &mut input_offset, @@ -396,7 +420,7 @@ mod tests { let mut input_offset = Cursor::new(0); let mut output_offset = Cursor::new(0); - vb.uncompress( + VariableByte::decompress_from_i8_slice( &encoded, encoded_len, &mut input_offset, @@ -521,9 +545,6 @@ mod tests { #[test] fn test_random_numbers_small() { - use std::collections::hash_map::RandomState; - use std::hash::{BuildHasher, Hasher}; - let seed = RandomState::new().build_hasher().finish(); let mut rng = seed; let mut input = Vec::new(); @@ -558,4 +579,113 @@ mod tests { verify_u32_roundtrip(&input); verify_i8_roundtrip(&input); } + + #[test] + fn test_variable_byte_default() { + let mut codec = ::default(); + let data = vec![1u32, 2, 3]; + let mut out = Vec::new(); + codec.encode(&data, &mut out).unwrap(); + let mut decoded = Vec::new(); + codec.decode(&out, &mut decoded, None).unwrap(); + assert_eq!(decoded, data); + } + + /// `decompress_from_u32_slice` returns `OutputBufferTooSmall` when the + /// output buffer is exhausted mid-stream (fast path, ≥10 bytes remaining). + #[test] + fn test_decompress_output_too_small_fast_path() { + // Encode 16 values so the fast path (≥10 bytes) is exercised. 
+ let input: Vec = (0..16).collect(); + let mut encoded: Vec = vec![0; input.len() * 2 + 1]; + let mut in_off = Cursor::new(0u32); + let mut out_off = Cursor::new(0u32); + VariableByte::compress_into_slice( + &input, + input.len() as u32, + &mut in_off, + &mut encoded, + &mut out_off, + ) + .unwrap(); + let encoded_len = out_off.position() as u32; + + // Output buffer with room for only 4 values — must error. + let mut tiny_out = vec![0u32; 4]; + let result = VariableByte::decompress_from_u32_slice( + &encoded, + encoded_len, + &mut Cursor::new(0u32), + &mut tiny_out, + &mut Cursor::new(0u32), + ); + assert!( + matches!(result, Err(FastPForError::OutputBufferTooSmall)), + "expected OutputBufferTooSmall, got {result:?}" + ); + } + + /// `decompress_from_u32_slice` returns `OutputBufferTooSmall` when the + /// output buffer is exhausted in the slow path (<10 bytes remaining). + #[test] + fn test_decompress_output_too_small_slow_path() { + // Encode 2 values so only the slow path is exercised (< 10 bytes). + let input = vec![1u32, 2]; + let mut encoded: Vec = vec![0; input.len() * 2 + 1]; + let mut in_off = Cursor::new(0u32); + let mut out_off = Cursor::new(0u32); + VariableByte::compress_into_slice( + &input, + input.len() as u32, + &mut in_off, + &mut encoded, + &mut out_off, + ) + .unwrap(); + let encoded_len = out_off.position() as u32; + + // Zero-capacity output — must error. 
+ let result = VariableByte::decompress_from_u32_slice( + &encoded, + encoded_len, + &mut Cursor::new(0u32), + &mut [], + &mut Cursor::new(0u32), + ); + assert!( + matches!(result, Err(FastPForError::OutputBufferTooSmall)), + "expected OutputBufferTooSmall, got {result:?}" + ); + } + + #[test] + fn test_anylen_decode_with_expected_len_ok() { + let data = vec![1u32, 2, 3]; + let mut encoded = Vec::new(); + VariableByte::new().encode(&data, &mut encoded).unwrap(); + let mut decoded = Vec::new(); + VariableByte::new() + .decode(&encoded, &mut decoded, Some(3)) + .unwrap(); + assert_eq!(decoded, data); + } + + #[test] + fn test_anylen_decode_expected_len_mismatch_errors() { + // expected_len must be >= actual to avoid OutputBufferTooSmall; use a larger + // value to exercise the is_decoded_mismatch path. + let data = vec![1u32, 2, 3]; + let mut encoded = Vec::new(); + VariableByte::new().encode(&data, &mut encoded).unwrap(); + let err = VariableByte::new() + .decode(&encoded, &mut Vec::new(), Some(10)) + .unwrap_err(); + assert!(matches!( + err, + FastPForError::DecodedCountMismatch { + actual: 3, + expected: 10 + } + )); + } } diff --git a/src/rust/mod.rs b/src/rust/mod.rs index 24493ae..92a2b05 100644 --- a/src/rust/mod.rs +++ b/src/rust/mod.rs @@ -1,18 +1,17 @@ +mod composite; mod cursor; mod integer_compression; -pub use cursor::IncrementCursor; -pub use integer_compression::bitpacking::fast_pack; -pub use integer_compression::bitunpacking::fast_unpack; -pub use integer_compression::codec::Codec; -pub use integer_compression::composition::Composition; -pub use integer_compression::differential::Delta; -pub use integer_compression::fastpfor::{ - BLOCK_SIZE_128, BLOCK_SIZE_256, DEFAULT_PAGE_SIZE, FastPFOR, -}; -pub use integer_compression::integer_codec::Integer; +pub use composite::CompositeCodec; +/// Type-safe block codec with block size encoded in the type. 
+pub use integer_compression::fastpfor::{FastPFor, FastPForBlock128, FastPForBlock256}; +/// Pass-through codec — implements [`AnyLenCodec`](crate::codec::AnyLenCodec). pub use integer_compression::just_copy::JustCopy; -pub use integer_compression::skippable_codec::Skippable; +/// Variable-byte codec — implements [`AnyLenCodec`](crate::codec::AnyLenCodec). pub use integer_compression::variable_byte::VariableByte; -pub use crate::{FastPForError, FastPForResult}; +/// `FastPForBlock256` blocks + `VariableByte` remainder — the most common composite. +pub type FastPFor256 = CompositeCodec; + +/// `FastPForBlock128` blocks + `VariableByte` remainder. +pub type FastPFor128 = CompositeCodec; diff --git a/tests/basic_tests.rs b/tests/basic_tests.rs index 488c99f..a0f3301 100644 --- a/tests/basic_tests.rs +++ b/tests/basic_tests.rs @@ -1,468 +1,123 @@ -//! Basic tests for FastPFOR-rs. +//! Basic integration tests exercising the public `BlockCodec` and `AnyLenCodec` APIs. #![cfg(feature = "rust")] -#![expect(clippy::needless_range_loop)] -use std::io::Cursor; -use std::num::NonZeroU32; - -use fastpfor::rust::{ - BLOCK_SIZE_128, Composition, DEFAULT_PAGE_SIZE, FastPFOR, Integer, VariableByte, fast_pack, - fast_unpack, +use fastpfor::{ + AnyLenCodec, BlockCodec, FastPFor128, FastPFor256, FastPForBlock128, FastPForBlock256, + JustCopy, VariableByte, slice_to_blocks, }; -use rand::RngExt as _; +use rand::rngs::StdRng; +use rand::{RngExt as _, SeedableRng}; mod common; -#[test] -#[cfg(feature = "cpp")] -fn saul_test() { - let codecs = common::get_codecs(); - - for mut codec in codecs { - if codec.name() == "VariableByte" { - continue; - } - - for x in 0..50 { - let input = vec![2, 3, 4, 5]; - let mut output: Vec = vec![0; 90]; - let mut answer: Vec = vec![0; input.len()]; - let mut input_offset = Cursor::new(0); - let mut output_offset = Cursor::new(0); - output_offset.set_position(u64::from(x)); +// ── Generic helpers ─────────────────────────────────────────────────────────── - 
codec - .compress( - &input, - input.len() as u32, - &mut input_offset, - &mut output, - &mut output_offset, - ) - .unwrap_or_else(|e| { - panic!("Failed to compress with {}: {e:?}", codec.name()); - }); - - let len = output_offset.position() as u32 - x; - output_offset.set_position(u64::from(x)); - - codec - .uncompress( - &output, - len, - &mut output_offset, - &mut answer, - &mut Cursor::new(0), - ) - .unwrap_or_else(|e| { - panic!("Failed to uncompress with {}: {e:?}", codec.name()); - }); - - assert_eq!(input, answer); - } - } +fn anylen_roundtrip(codec: &mut C, data: &[u32]) { + let mut compressed = Vec::new(); + codec + .encode(data, &mut compressed) + .unwrap_or_else(|e| panic!("encode failed: {e:?}")); + let mut decoded = Vec::new(); + codec + .decode(&compressed, &mut decoded, None) + .unwrap_or_else(|e| panic!("decode failed: {e:?}")); + assert_eq!(decoded, data); } -#[test] -#[cfg(feature = "cpp")] -fn test_varying_length() { - let n = 4096; - let mut data = vec![0u32; n]; - for k in 0..n { - data[k] = k as u32; - } - let codecs = common::get_codecs(); - for mut codec in codecs { - for l in 1..128 { - let mut data_copy = data.clone(); - data_copy.resize(l, 0); - let mut output_compress = vec![0; data_copy.len() * 4]; - let mut output_offset = Cursor::new(0); - codec - .compress( - &data_copy, - data_copy.len() as u32, - &mut Cursor::new(0), - &mut output_compress, - &mut output_offset, - ) - .unwrap_or_else(|e| { - panic!("Failed to compress with {}: {e:?}", codec.name()); - }); - let compressed_len = output_offset.position() as u32; - let mut answer = vec![0; l + 1024]; - codec - .uncompress( - &output_compress, - compressed_len, - &mut Cursor::new(0), - &mut answer, - &mut Cursor::new(0), - ) - .unwrap_or_else(|e| { - panic!("Failed to uncompress with {}: {e:?}", codec.name()); - }); - for k in 0..l { - assert_eq!(answer[k], data[k]); - } - } - } +fn block_roundtrip(data: &[u32]) { + let mut codec = C::default(); + let (blocks, _) = 
slice_to_blocks::(data); + let mut compressed = Vec::new(); + codec.encode_blocks(blocks, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + let expected_values = blocks.len() * C::size(); + codec + .decode_blocks( + &compressed, + Some(u32::try_from(expected_values).expect("expected_values fits in u32")), + &mut decoded, + ) + .unwrap(); + assert_eq!(decoded, &data[..expected_values]); } -#[test] -#[cfg(feature = "cpp")] -fn test_varying_length_two() { - let n = 128; - let mut data = vec![0u32; n]; - data[126] = -1i32 as u32; - let codecs = common::get_codecs(); - for mut codec in codecs { - for l in 1..128 { - let mut data_copy = data.clone(); - let mut output_compress = vec![0; data_copy.len() * 4]; - data_copy.resize(l, 0); - codec - .compress( - &data_copy, - data_copy.len() as u32, - &mut Cursor::new(0), - &mut output_compress, - &mut Cursor::new(0), - ) - .unwrap_or_else(|e| { - panic!("Failed to compress with {}: {e:?}", codec.name()); - }); - let mut answer = vec![0; data_copy.len() + 1024]; - codec - .uncompress( - &output_compress, - 128, - &mut Cursor::new(0), - &mut answer, - &mut Cursor::new(0), - ) - .unwrap_or_else(|e| { - panic!("Failed to uncompress with {}: {e:?}", codec.name()); - }); - for k in 1..l { - if answer[k] != data[k] { - assert_eq!(answer[k], data[k]); - } - } - } - } -} +// ── Tests ───────────────────────────────────────────────────────────────────── #[test] -fn verity_bitpacking() { - let n = 32; - let times = 1000; - let mut r = rand::rng(); - let mut data = vec![0; n]; - let mut compressed = vec![0; n]; - let mut uncompressed = vec![0; n]; - - for bit in 0..31 { - for _ in 0..times { - for k in 0..n { - data[k] = r.random_range(0..(1 << bit)); - } - - fast_pack(&data, 0, &mut compressed, 0, bit); - fast_unpack(&compressed, 0, &mut uncompressed, 0, bit); - - assert_eq!(uncompressed, data, "Mismatch for bit {bit}"); - } - } -} - -fn mask_array(array: &mut [u32], mask: u32) { - for value in array.iter_mut() { - *value &= 
mask; +#[cfg(feature = "cpp")] +fn saul_test() { + use fastpfor::cpp::CppFastPFor128; + // Block codecs + tail for any-length. C++ block codecs are already any-length; use directly. + let mut codecs: Vec<(&str, Box)> = vec![ + ("JustCopy", Box::new(JustCopy)), + ("FastPFor256", Box::new(FastPFor256::default())), + ("FastPFor128", Box::new(FastPFor128::default())), + ("CppFastPFor128", Box::new(CppFastPFor128::default())), + ]; + let input = vec![2u32, 3, 4, 5]; + for (name, codec) in &mut codecs { + anylen_roundtrip(codec.as_mut(), &input); + // silence unused-variable warning when cpp feature is off + let _ = name; } } +/// Sub-block-sized inputs produce no output via `BlockCodec`. #[test] -fn verify_with_exceptions() { - const N: usize = 32; - const TIMES: usize = 1000; - let mut rng = rand::rng(); - - let mut data = vec![0u32; N]; - let mut compressed = vec![0u32; N]; - let mut uncompressed = vec![0u32; N]; - - for bit in 0..31 { - for _ in 0..TIMES { - for value in &mut data { - *value = rng.random(); - } - - fast_pack(&data, 0, &mut compressed, 0, bit); - fast_unpack(&compressed, 0, &mut uncompressed, 0, bit); - - mask_array(&mut data, (1 << bit) - 1); - - assert_eq!( - data, uncompressed, - "Data does not match uncompressed output" - ); - } +fn spurious_out_test() { + fn check(len: usize) { + let x = vec![0u32; 1024]; + let (blocks, _) = slice_to_blocks::(&x[..len]); + let mut out = Vec::new(); + C::default().encode_blocks(blocks, &mut out).unwrap(); + assert!(out.is_empty() || blocks.is_empty()); } -} - -fn test_spurious>(codec: &mut C) { - let x = vec![0u32; 1024]; - let mut y: Vec = vec![0; 1024]; - let mut i0 = Cursor::new(0); - let mut i1 = Cursor::new(0); - - for inp_length in 0..32 { - codec - .compress(&x, inp_length, &mut i0, &mut y, &mut i1) - .unwrap_or_else(|e| panic!("Compression failed: {e:?}")); - - assert_eq!( - 0, - i1.position(), - "Expected output cursor position to be 0, but got {}", - i1.position() - ); + for len in 0..32usize { + 
check::(len); + check::(len); } } -#[test] -fn spurious_out_test() { - let mut codec1 = FastPFOR::default(); - test_spurious(&mut codec1); - - let mut codec2 = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - test_spurious(&mut codec2); -} - -fn test_zero_in_zero_out>(codec: &mut C) { - // Empty input and output arrays - let x: Vec = Vec::new(); - let mut y: Vec = Vec::new(); - let mut i0 = Cursor::new(0); - let mut i1 = Cursor::new(0); - - // Test compression - codec - .compress(&x, 0, &mut i0, &mut y, &mut i1) - .unwrap_or_else(|e| panic!("Compression failed: {e:?}")); - assert_eq!( - i1.position(), - 0, - "Expected output cursor position to be 0 after compression, but got {}", - i1.position() - ); - - // Test decompression - let mut out: Vec = Vec::new(); - let mut out_pos = Cursor::new(0); - codec - .uncompress(&y, 0, &mut i1, &mut out, &mut out_pos) - .unwrap_or_else(|e| panic!("Decompression failed: {e:?}")); - assert_eq!( - out_pos.position(), - 0, - "Expected output cursor position to be 0 after decompression, but got {}", - out_pos.position() - ); -} - +/// `AnyLenCodec` round-trips empty input correctly. 
#[test] fn zero_in_zero_out_test() { - let mut codec1 = FastPFOR::default(); - test_zero_in_zero_out(&mut codec1); - - let mut codec2 = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - test_zero_in_zero_out(&mut codec2); - - let mut codec3 = VariableByte; - test_zero_in_zero_out(&mut codec3); - - let mut codec4 = Composition::new(FastPFOR::default(), VariableByte); - test_zero_in_zero_out(&mut codec4); - - let mut codec5 = Composition::new( - FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128), - VariableByte, - ); - test_zero_in_zero_out(&mut codec5); + anylen_roundtrip(&mut VariableByte::new(), &[]); + anylen_roundtrip(&mut JustCopy::new(), &[]); + anylen_roundtrip(&mut FastPFor256::default(), &[]); + anylen_roundtrip(&mut FastPFor128::default(), &[]); } #[test] fn test_increasing_sequence() { - let n = 256; - let data: Vec = (0..n).collect(); - let codecs = vec![ - FastPFOR::default(), - FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128), - ]; - for mut codec in codecs { - // Compress the data - let mut output_compress = vec![0; data.len() * 4]; - codec - .compress( - &data, - data.len() as u32, - &mut Cursor::new(0), - &mut output_compress, - &mut Cursor::new(0), - ) - .unwrap_or_else(|e| { - panic!("Failed to compress: {e:?}"); - }); - - // Decompress the data - let mut decompressed = vec![0; data.len() + 1024]; - codec - .uncompress( - &output_compress, - n, - &mut Cursor::new(0), - &mut decompressed, - &mut Cursor::new(0), - ) - .unwrap_or_else(|e| { - panic!("Failed to uncompress: {e:?}"); - }); - - // Verify decompressed data matches original - for (i, &value) in data.iter().enumerate() { - assert_eq!(value, decompressed[i], "Mismatch at index {i}"); - } - } + let data: Vec = (0..256u32).collect(); + anylen_roundtrip(&mut FastPFor256::default(), &data); + anylen_roundtrip(&mut FastPFor128::default(), &data); } #[test] fn test_random_numbers() { - use rand::SeedableRng; - use rand::rngs::StdRng; - - let n = 65536; - let mut rng = 
StdRng::seed_from_u64(123456); - let data: Vec = (0..n).map(|_| rng.random()).collect(); // Generate random data - let codecs = vec![ - FastPFOR::default(), - FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128), - ]; - for mut codec in codecs { - // Compress the data - let mut output_compress = vec![0; data.len() * 4]; - codec - .compress( - &data, - data.len() as u32, - &mut Cursor::new(0), - &mut output_compress, - &mut Cursor::new(0), - ) - .unwrap_or_else(|e| { - panic!("Failed to compress: {e:?}"); - }); - - // Decompress the data - let mut decompressed = vec![0; data.len() + 1024]; - codec - .uncompress( - &output_compress, - n as u32, - &mut Cursor::new(0), - &mut decompressed, - &mut Cursor::new(0), - ) - .unwrap_or_else(|e| { - panic!("Failed to uncompress: {e:?}"); - }); - - // Verify decompressed data matches original - for (i, &value) in data.iter().enumerate() { - assert_eq!(value, decompressed[i], "Mismatch at index {i}"); - } - } + let data: Vec = (0..65536) + .map(|_| StdRng::seed_from_u64(123456).random()) + .collect(); + anylen_roundtrip(&mut FastPFor256::default(), &data); + anylen_roundtrip(&mut FastPFor128::default(), &data); } +/// `BlockCodec` round-trip using `slice_to_blocks` to split aligned input. 
#[test] -fn test_fastpfor_headless_compress_unfit_pagesize() { - // The input size is a multiple of 128 but does not fit the page size - let test_input_size = BLOCK_SIZE_128.checked_add(512).unwrap(); - let page_size = NonZeroU32::new(512).unwrap(); - - let input: Vec = (0..test_input_size.get()).collect(); - let mut output: Vec = vec![0; input.len()]; - let mut decoded: Vec = vec![0; input.len()]; - let mut input_offset = Cursor::new(0u32); - let mut output_offset = Cursor::new(0u32); - - let mut codec = FastPFOR::new(page_size, BLOCK_SIZE_128); - codec - .compress( - &input, - input.len() as u32, - &mut input_offset, - &mut output, - &mut output_offset, - ) - .expect("compression failed"); - - let compressed_len = output_offset.position() as usize; - input_offset.set_position(0); - - codec - .uncompress( - &output, - compressed_len as u32, - &mut input_offset, - &mut decoded, - &mut Cursor::new(0u32), - ) - .expect("decompression failed"); - - assert_eq!(input, decoded, "Input and decompressed data do not match"); +fn block_codec_roundtrip() { + block_roundtrip::(&(0u32..512).collect::>()); + block_roundtrip::(&(0u32..512).collect::>()); } +/// `AnyLenCodec` round-trip with random values at various lengths. 
#[test] -fn test_exception_value_vector_resizes() { - let page_size = NonZeroU32::new(512).unwrap(); - let test_input_size = page_size.get() * 2; - - // every even index value is large which will trigger exception buffer to be resized - let input: Vec = (0..test_input_size) - .map(|i| if i % 2 == 0 { 1 << 30 } else { 3 }) - .collect(); - - let mut output: Vec = vec![0; input.len() * 4]; - let mut decoded: Vec = vec![0; input.len()]; - let mut input_offset = Cursor::new(0u32); - let mut output_offset = Cursor::new(0u32); - - let mut codec = FastPFOR::new(page_size, BLOCK_SIZE_128); - codec - .compress( - &input, - input.len() as u32, - &mut input_offset, - &mut output, - &mut output_offset, - ) - .expect("compression failed"); - - let compressed_len = output_offset.position() as usize; - input_offset.set_position(0); - - codec - .uncompress( - &output, - compressed_len as u32, - &mut input_offset, - &mut decoded, - &mut Cursor::new(0u32), - ) - .expect("decompression failed"); - - assert_eq!(input, decoded, "Input and decompressed data do not match"); +fn anylen_random_roundtrip() { + let mut rng = rand::rng(); + for n in [128usize, 300, 512, 1000, 4096] { + let data: Vec = (0..n).map(|_| rng.random()).collect(); + anylen_roundtrip(&mut FastPFor256::default(), &data); + anylen_roundtrip(&mut FastPFor128::default(), &data); + } } diff --git a/tests/benchmark_smoke.rs b/tests/benchmark_smoke.rs index fbc1e8c..782d5a1 100644 --- a/tests/benchmark_smoke.rs +++ b/tests/benchmark_smoke.rs @@ -8,23 +8,25 @@ #[path = "../benches/bench_utils.rs"] mod bench_utils; -use bench_utils::{ - BLOCK_SIZE_128, DEFAULT_PAGE_SIZE, FastPFOR, block_size_fixtures, compress_data, - compress_fixtures, decompress_data, ratio_fixtures, -}; + +#[cfg(feature = "cpp")] +use bench_utils::decompress_anylen; +use bench_utils::{BlockSizeFixture, compress, compress_fixtures, decompress, ratio_fixtures}; #[cfg(feature = "cpp")] -use bench_utils::{cpp_decode, cpp_decode_fixtures}; +use 
fastpfor::BlockCodec; #[cfg(feature = "cpp")] use fastpfor::cpp::CppFastPFor128; +use fastpfor::{FastPForBlock128, FastPForBlock256}; -const SMOKE_SIZE: usize = 256; +const SMOKE_BLOCK_COUNT: usize = 2; #[test] fn smoke_compression() { - for (_, fix) in compress_fixtures(&[SMOKE_SIZE]) { - let compressed = compress_data(&mut FastPFOR::default(), &fix.data); + for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { + let mut out = Vec::new(); + compress::(&fix.data, &mut out); assert!( - !compressed.is_empty(), + !out.is_empty(), "{}: compressed output must be non-empty", fix.name ); @@ -33,64 +35,69 @@ fn smoke_compression() { #[test] fn smoke_decompression() { - let mut decompressed = vec![0u32; SMOKE_SIZE]; - for (_, fix) in compress_fixtures(&[SMOKE_SIZE]) { - let n = decompress_data( - &mut FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128), - &fix.rust_compressed, - &mut decompressed, - ); + for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { + let mut decompressed = Vec::new(); + let n = decompress::(&fix.compressed, fix.n_blocks, &mut decompressed); assert_eq!( - &decompressed[..n], - &fix.data[..], - "{}: roundtrip mismatch", + n, + fix.data.len(), + "{}: decompressed length mismatch", fix.name ); + assert_eq!(decompressed, fix.data, "{}: roundtrip mismatch", fix.name); } } /// Mirrors `benchmark_roundtrip`: compress then immediately decompress. 
#[test] fn smoke_roundtrip() { - for (_, fix) in compress_fixtures(&[SMOKE_SIZE]) { - let compressed = compress_data(&mut FastPFOR::default(), &fix.data); - let mut decompressed = vec![0u32; fix.data.len()]; - let n = decompress_data(&mut FastPFOR::default(), &compressed, &mut decompressed); - assert_eq!( - &decompressed[..n], - &fix.data[..], - "{}: roundtrip mismatch", - fix.name - ); + for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { + let mut compressed = Vec::new(); + compress::(&fix.data, &mut compressed); + let mut decompressed = Vec::new(); + let n = decompress::(&compressed, fix.n_blocks, &mut decompressed); + assert_eq!(n, fix.data.len(), "{}: roundtrip length mismatch", fix.name); + assert_eq!(decompressed, fix.data, "{}: roundtrip mismatch", fix.name); } } #[test] fn smoke_block_sizes() { - for fix in block_size_fixtures(SMOKE_SIZE) { - // compress_data path - let compressed = compress_data( - &mut FastPFOR::new(DEFAULT_PAGE_SIZE, fix.block_size), - &fix.data, - ); - assert_eq!(compressed, fix.compressed); - // decompress_data path using the fixture's pre-compressed buffer - let mut decompressed = vec![0u32; fix.data.len()]; - let n = decompress_data( - &mut FastPFOR::new(DEFAULT_PAGE_SIZE, fix.block_size), - &fix.compressed, - &mut decompressed, + let fix128 = BlockSizeFixture::::new(SMOKE_BLOCK_COUNT); + let fix256 = BlockSizeFixture::::new(SMOKE_BLOCK_COUNT); + + // 128-element blocks + { + let mut compressed = Vec::new(); + compress::(&fix128.data, &mut compressed); + assert_eq!( + compressed, fix128.compressed, + "128: compress output mismatch" ); - assert_eq!(&decompressed[..n], &fix.data[..]); + let mut decompressed = Vec::new(); + let n = decompress::(&compressed, fix128.n_blocks, &mut decompressed); + assert_eq!(n, fix128.data.len(), "128: decompressed length mismatch"); + assert_eq!(decompressed, fix128.data, "128: roundtrip mismatch"); + } + + // 256-element blocks + { + let mut compressed = Vec::new(); + compress::(&fix256.data, 
&mut compressed); + let mut decompressed = Vec::new(); + let n = decompress::(&compressed, fix256.n_blocks, &mut decompressed); + assert_eq!(n, fix256.data.len(), "256: decompressed length mismatch"); + assert_eq!(decompressed, fix256.data, "256: roundtrip mismatch"); } } #[test] fn smoke_compression_ratio() { - for fix in ratio_fixtures(SMOKE_SIZE) { - let compressed = compress_data(&mut FastPFOR::default(), &fix.data); + for fix in ratio_fixtures::(SMOKE_BLOCK_COUNT) { + let mut out = Vec::new(); + compress::(&fix.data, &mut out); assert!( - !compressed.is_empty(), + !out.is_empty(), "{}: compressed output must be non-empty", fix.name ); @@ -98,7 +105,7 @@ fn smoke_compression_ratio() { clippy::cast_precision_loss, reason = "Loss of precision is acceptable for compression ratio calculation" )] - let ratio = fix.data.len() as f64 / compressed.len() as f64; + let ratio = fix.data.len() as f64 / out.len() as f64; assert!( ratio > 0.0, "{}: compression ratio must be positive", @@ -110,28 +117,27 @@ fn smoke_compression_ratio() { #[cfg(feature = "cpp")] #[test] fn smoke_cpp_vs_rust() { - for (_, fix) in cpp_decode_fixtures(&[SMOKE_SIZE]) { - // C++ decode - let mut codec = CppFastPFor128::new(); - let mut cpp_out = vec![0u32; fix.original_len]; - let n = cpp_decode(&mut codec, &fix.cpp_compressed, &mut cpp_out); + for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { + let expected_len = fix.n_blocks * FastPForBlock128::size(); + + // C++ decode (same wire format as Rust; C++ uses AnyLenCodec) + let mut cpp_out = Vec::new(); + let n = decompress_anylen::(&fix.compressed, expected_len, &mut cpp_out); assert_eq!( - n, fix.original_len, + n, expected_len, "{}: C++ decoded wrong element count", fix.name ); + assert_eq!(cpp_out, fix.data, "{}: C++ roundtrip mismatch", fix.name); // Rust decode - let mut rust_out = vec![0u32; fix.original_len]; - let n = decompress_data( - &mut FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128), - &fix.rust_compressed, - &mut rust_out, 
- ); + let mut rust_out = Vec::new(); + let n = decompress::(&fix.compressed, fix.n_blocks, &mut rust_out); assert_eq!( - n, fix.original_len, + n, expected_len, "{}: Rust decoded wrong element count", fix.name ); + assert_eq!(rust_out, fix.data, "{}: Rust roundtrip mismatch", fix.name); } } diff --git a/tests/common.rs b/tests/common.rs index cabadb1..03e888d 100644 --- a/tests/common.rs +++ b/tests/common.rs @@ -1,103 +1,12 @@ //! Common test utilities for codec compatibility testing. -#![cfg(all(feature = "rust", feature = "cpp"))] +#![cfg(any(feature = "rust", feature = "cpp"))] #![allow(dead_code, reason = "This file is shared by several test modules")] -use std::io::Cursor; - -use fastpfor::rust::{ - BLOCK_SIZE_128, Composition, DEFAULT_PAGE_SIZE, FastPFOR, FastPForResult, Integer, JustCopy, - VariableByte, -}; use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng as _}; -/// Wrapper enum for different codec implementations used in tests. -pub enum TestCodec { - /// Variable-byte codec with name - VariableByte(VariableByte, String), - /// Pass-through codec with name - JustCopy(JustCopy, String), - /// Composite codec with name - Composition(Box, String), -} - -impl TestCodec { - /// Returns the name of the codec. - #[must_use] - pub fn name(&self) -> &str { - match self { - TestCodec::Composition(_, name) - | TestCodec::JustCopy(_, name) - | TestCodec::VariableByte(_, name) => name, - } - } - /// Compresses input data using the wrapped codec. 
- pub fn compress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - match self { - TestCodec::VariableByte(vb, _) => { - vb.compress(input, input_length, input_offset, output, output_offset) - } - TestCodec::JustCopy(jc, _) => { - jc.compress(input, input_length, input_offset, output, output_offset) - } - TestCodec::Composition(comp, _) => { - comp.compress(input, input_length, input_offset, output, output_offset) - } - } - } - - /// Decompresses input data using the wrapped codec. - pub fn uncompress( - &mut self, - input: &[u32], - input_length: u32, - input_offset: &mut Cursor, - output: &mut [u32], - output_offset: &mut Cursor, - ) -> FastPForResult<()> { - match self { - TestCodec::VariableByte(vb, _) => { - vb.uncompress(input, input_length, input_offset, output, output_offset) - } - TestCodec::JustCopy(jc, _) => { - jc.uncompress(input, input_length, input_offset, output, output_offset) - } - TestCodec::Composition(comp, _) => { - comp.uncompress(input, input_length, input_offset, output, output_offset) - } - } - } -} - -/// Returns a collection of codec instances for testing. -#[must_use] -pub fn get_codecs() -> Vec { - vec![ - TestCodec::VariableByte(VariableByte::new(), "VariableByte".to_string()), - TestCodec::JustCopy(JustCopy::new(), "JustCopy".to_string()), - TestCodec::Composition( - Box::new(Composition::new(FastPFOR::default(), VariableByte::new())), - "FastPFOR + VariableByte".to_string(), - ), - TestCodec::Composition( - Box::new(Composition::new( - FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128), - VariableByte::new(), - )), - "FastPFOR + VariableByte".to_string(), - ), - ] -} - -/// Returns various input sizes to test codec behavior. +/// Returns various input sizes to test codec behavior (multiples of 128). 
#[must_use] pub fn test_input_sizes() -> Vec { (1..=8).map(|exp| (1usize << exp) * 128).collect() diff --git a/tests/cpp_compat_tests.rs b/tests/cpp_compat_tests.rs index 8d2c618..1e70845 100644 --- a/tests/cpp_compat_tests.rs +++ b/tests/cpp_compat_tests.rs @@ -1,96 +1,146 @@ //! Compatibility tests between Rust and C++ codec implementations. +//! +//! C++ codecs are composite (any-length); Rust block codecs produce the same wire format +//! for block-aligned data. Both sides use the same element-count header. #![cfg(all(feature = "rust", feature = "cpp"))] -use std::io::Cursor; - -use fastpfor::rust::Integer as _; -use fastpfor::{AnyLenCodec as _, rust}; +use fastpfor::{FastPFor128, FastPFor256, FastPForBlock128}; mod common; use common::{get_test_cases, test_input_sizes}; use fastpfor::cpp::CppFastPFor128; +use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; +/// C++ `AnyLenCodec` encode → Rust `BlockCodec` decode (same wire format for block-aligned data). #[test] fn test_rust_decompresses_cpp_encoded_data() { let mut codec_cpp = CppFastPFor128::new(); - let mut codec_rs = rust::FastPFOR::new(rust::DEFAULT_PAGE_SIZE, rust::BLOCK_SIZE_128); + let mut codec_rs = FastPForBlock128::default(); for n in test_input_sizes() { - for input in get_test_cases(n + rust::BLOCK_SIZE_128.get() as usize) { - let mut compressed_buffer = Vec::new(); - codec_cpp.encode(&input, &mut compressed_buffer).unwrap(); - let compressed_len = compressed_buffer.len(); + for input in get_test_cases(n + 128) { + if input.len() % 128 != 0 || input.is_empty() { + continue; + } + let n_blocks = input.len() / 128; - let mut decoded_by_cpp = Vec::new(); - codec_cpp - .decode( - &compressed_buffer, - &mut decoded_by_cpp, - Some(input.len() as u32), - ) - .unwrap(); - let decoded_cpp = decoded_by_cpp.as_slice(); + let mut cpp_compressed = Vec::new(); + codec_cpp.encode(&input, &mut cpp_compressed).unwrap(); - // Rust decoding - let mut input_offset = Cursor::new(0u32); - let mut decoded_by_rust 
= vec![0; input.len()]; + let mut rust_decoded = Vec::new(); codec_rs - .uncompress( - &compressed_buffer, - compressed_len as u32, - &mut input_offset, - &mut decoded_by_rust, - &mut Cursor::new(0u32), + .decode_blocks( + &cpp_compressed, + Some(u32::try_from(n_blocks * 128).expect("block count fits in u32")), + &mut rust_decoded, ) - .unwrap(); + .unwrap_or_else(|e| panic!("Rust decompress of C++ data failed: {e:?}")); assert_eq!( - decoded_cpp.len(), - decoded_by_rust.len(), - "Mismatched output lengths" + rust_decoded, + input, + "C++→Rust roundtrip mismatch for len {}", + input.len() ); - assert_eq!(decoded_cpp, decoded_by_rust.as_slice()); } } } +/// Rust `BlockCodec` encode → C++ `AnyLenCodec` decode (same wire format). #[test] -fn test_rust_and_cpp_fastpfor32_compression_matches() { +fn test_cpp_decompresses_rust_block_encoded_data() { let mut codec_cpp = CppFastPFor128::new(); - let mut codec_rs = rust::FastPFOR::new(rust::DEFAULT_PAGE_SIZE, rust::BLOCK_SIZE_128); + let mut codec_rs = FastPForBlock128::default(); for n in test_input_sizes() { - for input in get_test_cases(n + rust::BLOCK_SIZE_128.get() as usize) { - // Rust `FastPFOR::compress` is a no-op for length 0; C++ still writes a stream header. 
- if input.is_empty() { + for input in get_test_cases(n + 128) { + if input.len() % 128 != 0 || input.is_empty() { continue; } + let (blocks, _) = slice_to_blocks::(&input); + let n_blocks = blocks.len(); + let expected_len = n_blocks * 128; - let mut compressed_buffer = Vec::new(); - codec_cpp.encode(&input, &mut compressed_buffer).unwrap(); + let mut rs_compressed = Vec::new(); + codec_rs.encode_blocks(blocks, &mut rs_compressed).unwrap(); - // Rust encoding - let mut input_offset_rs = Cursor::new(0u32); - let mut encoded_rs: Vec = vec![0; input.len()]; - let mut output_offset_rs = Cursor::new(0u32); - codec_rs - .compress( - &input, - input.len() as u32, - &mut input_offset_rs, - &mut encoded_rs, - &mut output_offset_rs, + let mut cpp_decoded = Vec::new(); + codec_cpp + .decode( + &rs_compressed, + &mut cpp_decoded, + Some(u32::try_from(expected_len).expect("expected len fits in u32")), ) + .unwrap_or_else(|e| panic!("C++ decompress of Rust data failed: {e:?}")); + + assert_eq!( + cpp_decoded, + input, + "Rust→C++ roundtrip mismatch for len {}", + input.len() + ); + } + } +} + +/// Cross-check: Rust block encode and C++ any-length encode produce identical bytes for block-aligned input. 
+#[test] +fn test_rust_and_cpp_compression_matches() { + let mut codec_cpp = CppFastPFor128::new(); + let mut codec_rs = FastPForBlock128::default(); + + for n in test_input_sizes() { + for input in get_test_cases(n + 128) { + if input.len() % 128 != 0 || input.is_empty() { + continue; + } + let (blocks_rs, _) = slice_to_blocks::(&input); + + let mut cpp_compressed = Vec::new(); + codec_cpp.encode(&input, &mut cpp_compressed).unwrap(); + + let mut rs_compressed = Vec::new(); + codec_rs + .encode_blocks(blocks_rs, &mut rs_compressed) .unwrap(); - let compressed_len_rs = output_offset_rs.position() as usize; assert_eq!( - compressed_buffer.len(), - compressed_len_rs, - "C++ vs Rust compressed length mismatch" + cpp_compressed, + rs_compressed, + "Compressed bytes differ for input len {}", + input.len() ); - assert_eq!(&compressed_buffer, &encoded_rs[..compressed_len_rs]); + } + } +} + +/// Rust `AnyLenCodec` (`CompositeCodec`) encoder → round-trip. +#[test] +fn test_rust_anylen_roundtrip() { + for n in test_input_sizes() { + let mut codec = FastPFor256::default(); + for input in get_test_cases(n) { + let mut compressed = Vec::new(); + codec.encode(&input, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + codec.decode(&compressed, &mut decoded, None).unwrap(); + assert_eq!(decoded, input, "Rust AnyLenCodec round-trip failed"); + } + } +} + +/// Rust 128-block `AnyLenCodec` round-trip. 
+#[test] +fn test_rust_anylen_128_roundtrip() { + for n in test_input_sizes() { + let mut codec = FastPFor128::default(); + for input in get_test_cases(n) { + let mut compressed = Vec::new(); + codec.encode(&input, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + codec.decode(&compressed, &mut decoded, None).unwrap(); + assert_eq!(decoded, input, "Rust AnyLenCodec 128 round-trip failed"); } } } diff --git a/tests/decode_error_paths.rs b/tests/decode_error_paths.rs deleted file mode 100644 index 234556d..0000000 --- a/tests/decode_error_paths.rs +++ /dev/null @@ -1,493 +0,0 @@ -//! Integration tests exercising every error return path in `FastPFOR::decode_page`. -//! -//! Each test: -//! 1. Compresses a valid input block to get a well-formed byte stream. -//! 2. Surgically corrupts exactly the field under test. -//! 3. Asserts that `uncompress` returns `Err(…)` rather than panicking. -//! -//! The goal is 100% branch coverage of the Rust decoding path. - -#![cfg(feature = "rust")] - -use std::io::Cursor; - -use fastpfor::rust::{BLOCK_SIZE_128, DEFAULT_PAGE_SIZE, FastPFOR, Integer, Skippable}; - -// ── helpers ────────────────────────────────────────────────────────────────── - -/// Compress `data` with a default (256-block, 65536-page) codec and return -/// the compressed words. -fn compress_block(data: &[u32]) -> Vec { - let mut codec = FastPFOR::default(); - let mut buf = vec![0u32; data.len() * 8 + 1024]; - let mut in_off = Cursor::new(0u32); - let mut out_off = Cursor::new(0u32); - codec - .compress(data, data.len() as u32, &mut in_off, &mut buf, &mut out_off) - .expect("compression must succeed"); - buf.truncate(out_off.position() as usize); - buf -} - -/// Build compressed data that has at least one non-trivial exception group -/// (bit-width difference > 1) so that the full exception decode path is taken. -/// Returns `(compressed_words, original_data)`. 
-fn compressed_with_exceptions() -> (Vec, Vec) { - // One 256-block where every even position carries a large value needing 31 - // bits and every odd position carries a small value needing 2 bits. - // The encoder will choose optimal_bits ≈ 2, maxbits = 31, index = 29. - let data: Vec = (0..256) - .map(|i| if i % 2 == 0 { 1u32 << 30 } else { 3 }) - .collect(); - (compress_block(&data), data) -} - -/// Build compressed data whose exception group uses bit-width difference == 1 -/// `(maxbits - optimal_bits == 1)`, triggering the `index == 1` branch. -/// Returns `(compressed_words, original_data)`. -fn compressed_with_index1_exceptions() -> (Vec, Vec) { - // Almost all values fit in 1 bit except one value needing exactly 2 bits. - // Encoder picks optimal_bits=1, maxbits=2, index=1. - let mut data = vec![1u32; 256]; - data[0] = 3; // needs 2 bits - (compress_block(&data), data) -} - -/// Decompress `compressed` with a fresh default codec. -fn try_decode(compressed: &[u32]) -> Result<(), impl std::fmt::Debug> { - let mut codec = FastPFOR::default(); - let mut out = vec![0u32; 256 + 64]; - let mut in_off = Cursor::new(0u32); - let mut out_off = Cursor::new(0u32); - codec.uncompress( - compressed, - compressed.len() as u32, - &mut in_off, - &mut out, - &mut out_off, - ) -} - -// ── Wire format reference ───────────────────────────────────────────────────── -// -// `uncompress` reads word [0] as `outlength`, then delegates to -// `headless_uncompress` → `decode_page`. Within one page (starting at -// `init_pos` in the u32 slice): -// -// [init_pos] = where_meta -// [init_pos+1 .. init_pos+where_meta-1] = packed regular values -// [init_pos+where_meta] = bytesize (byte count of block metadata) -// [+1 .. 
+ceil(bytesize/4)] = block metadata bytes -// [+ceil(bytesize/4)+1] = bitmap -// for each set bit k (2..=32): -// [next word] = size (# of packed exceptions at width k) -// [next ceil(size*k/32) words] = bit-packed exception values -// -// In the full `compressed` slice (including the `outlength` prefix added by -// `compress`): -// compressed[0] = outlength -// compressed[1] = where_meta (init_pos == 1 here) -// compressed[1+where_meta] = bytesize -// … -// -// Helper to find the byte offset of the block-metadata region: -fn meta_byte_start(compressed: &[u32]) -> usize { - let where_meta = compressed[1] as usize; - let bytesize_idx = 1 + where_meta; - (bytesize_idx + 1) * 4 // word index → byte offset -} - -// Helper to find the word index of the bitmap: -fn bitmap_idx(compressed: &[u32]) -> usize { - let where_meta = compressed[1] as usize; - let bytesize_idx = 1 + where_meta; - let bytesize = compressed[bytesize_idx] as usize; - bytesize_idx + 1 + bytesize.div_ceil(4) -} - -// ── Entry-point guards ──────────────────────────────────────────────────────── - -/// `input_length == 0` → `uncompress` returns Ok immediately. -#[test] -fn uncompress_zero_input_length_ok() { - let mut codec = FastPFOR::default(); - let mut out = vec![]; - codec - .uncompress( - &[], - 0, - &mut Cursor::new(0u32), - &mut out, - &mut Cursor::new(0u32), - ) - .expect("empty uncompress must succeed"); -} - -/// `headless_uncompress` with `inlength == 0` and `BLOCK_SIZE_128` returns Ok immediately. -#[test] -fn headless_uncompress_zero_inlength_128_ok() { - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let mut out = vec![]; - codec - .headless_uncompress( - &[], - 0, - &mut Cursor::new(0u32), - &mut out, - &mut Cursor::new(0u32), - 0, - ) - .expect("zero-length headless uncompress must succeed"); -} - -// ── decode_page error paths ─────────────────────────────────────────────────── - -/// Only the `outlength` word present — the page header (`where_meta`) is missing. 
-#[test] -fn decode_where_meta_missing() { - assert!(try_decode(&[256u32]).is_err()); -} - -/// `where_meta` points past end of input. -#[test] -fn decode_where_meta_out_of_bounds() { - let (mut compressed, _) = compressed_with_exceptions(); - compressed[1] = u32::MAX; - assert!(try_decode(&compressed).is_err()); -} - -/// `init_pos + where_meta` wraps around u32. -#[test] -fn decode_where_meta_overflow() { - let (compressed, _) = compressed_with_exceptions(); - // Prepend a dummy word and start decoding at offset 1, so init_pos = 1. - // Set where_meta = u32::MAX so 1 + u32::MAX overflows. - let mut padded = vec![0u32]; - padded.extend_from_slice(&compressed); - padded[2] = u32::MAX; // where_meta field (compressed[1]) is now padded[2] - let outlength = padded[1]; - let mut codec = FastPFOR::default(); - let mut out = vec![0u32; 320]; - let result = codec.headless_uncompress( - &padded, - outlength, - &mut Cursor::new(1u32), - &mut out, - &mut Cursor::new(0u32), - outlength, - ); - assert!(result.is_err()); -} - -/// `where_meta` points to the last word, so reading `bytesize` goes out of bounds. -#[test] -fn decode_bytesize_out_of_bounds() { - let (mut compressed, _) = compressed_with_exceptions(); - compressed[1] = compressed.len() as u32 - 1; - assert!(try_decode(&compressed).is_err()); -} - -/// `bytesize` is so large that `inexcept + ceil(bytesize/4)` overflows u32. -#[test] -fn decode_bytesize_length_overflow() { - let (mut compressed, _) = compressed_with_exceptions(); - let bytesize_idx = 1 + compressed[1] as usize; - compressed[bytesize_idx] = u32::MAX - 3; - assert!(try_decode(&compressed).is_err()); -} - -/// `bytesize` is crafted so that `inexcept` lands exactly at the end of the -/// slice, making the bitmap read go out of bounds. 
-#[test] -fn decode_bitmap_out_of_bounds() { - let (mut compressed, _) = compressed_with_exceptions(); - let bytesize_idx = 1 + compressed[1] as usize; - // Advance inexcept to exactly compressed.len() so the bitmap get_u32 fails. - let remaining = (compressed.len() - bytesize_idx - 1) as u32; - compressed[bytesize_idx] = remaining * 4; - assert!(try_decode(&compressed).is_err()); -} - -/// `size` field for an exception group exceeds `page_size`. -#[test] -fn decode_exception_size_exceeds_page_size() { - let (mut compressed, _) = compressed_with_exceptions(); - let size_idx = bitmap_idx(&compressed) + 1; - compressed[size_idx] = DEFAULT_PAGE_SIZE.get() + 1; - assert!(try_decode(&compressed).is_err()); -} - -/// Tail partial-group: not enough words remain after the full-group loop. -#[test] -fn decode_exception_partial_group_not_enough_data() { - let (compressed, _) = compressed_with_exceptions(); - assert!(try_decode(&compressed[..compressed.len() - 2]).is_err()); -} - -/// `b > 32` in the per-block unpack loop. -#[test] -fn decode_block_b_too_large() { - let (mut compressed, _) = compressed_with_exceptions(); - let start = meta_byte_start(&compressed); - let bytes: &mut [u8] = bytemuck::cast_slice_mut(&mut compressed); - bytes[start] = 33; // overwrite best_b of block 0 - assert!(try_decode(&compressed).is_err()); -} - -/// Packed-values region is truncated: the bytesize read fails because `inexcept` -/// (= `init_pos + where_meta`) is already beyond the end of the truncated slice. -/// -/// Note: the `in_start + b > input.len()` guard inside the packed-value -/// bitunpack loop in the decoder is structurally unreachable — the packed-values -/// region physically precedes the metadata in the stream, so any truncation -/// that removes packed words also removes the metadata needed to parse the page -/// header, and an earlier `NotEnoughData` fires first. 
This test therefore -/// exercises the nearest reachable error: the `bytesize` read failing when -/// the metadata section is absent. -#[test] -fn decode_packed_region_truncated() { - let (compressed, _) = compressed_with_exceptions(); - let where_meta = compressed[1] as usize; - // Keep only [0 .. where_meta]: header word + part of packed-values region. - // The bytesize read at `inexcept = 1 + where_meta` then fails OOB. - assert!(try_decode(&compressed[..where_meta]).is_err()); -} - -/// `out_start + 32 > output.len()`: output buffer too small for bitunpacking. -#[test] -fn decode_output_buffer_too_small_unpack() { - let (compressed, _) = compressed_with_exceptions(); - let mut codec = FastPFOR::default(); - let mut out = vec![0u32; 16]; // far too small for a 256-block - let result = codec.uncompress( - &compressed, - compressed.len() as u32, - &mut Cursor::new(0u32), - &mut out, - &mut Cursor::new(0u32), - ); - assert!(result.is_err()); -} - -// ── Exception metadata validation ──────────────────────────────────────────── - -/// Finds the byte offset of the first block with `cexcept > 0` in the block -/// metadata region and returns `(best_b_offset, cexcept_offset, maxbits_offset)`. -fn find_exception_block(bytes: &[u8], meta_start: usize) -> Option<(usize, usize, usize)> { - let mut pos = meta_start; - while pos + 1 < bytes.len() { - if bytes[pos + 1] > 0 { - return Some((pos, pos + 1, pos + 2)); - } - pos += 2; // skip blocks with no exceptions - } - None -} - -/// `maxbits > 32`. -#[test] -fn decode_exception_maxbits_too_large() { - let (mut compressed, _) = compressed_with_exceptions(); - let start = meta_byte_start(&compressed); - let bytes: &mut [u8] = bytemuck::cast_slice_mut(&mut compressed); - if let Some((_, _, mb_off)) = find_exception_block(bytes, start) { - bytes[mb_off] = 33; - } - assert!(try_decode(&compressed).is_err()); -} - -/// `maxbits < b` (`checked_sub` underflows → index is None). 
-#[test] -fn decode_exception_index_underflow() { - let (mut compressed, _) = compressed_with_exceptions(); - let start = meta_byte_start(&compressed); - let bytes: &mut [u8] = bytemuck::cast_slice_mut(&mut compressed); - if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { - bytes[mb_off] = bytes[bb_off].saturating_sub(1); // maxbits < best_b - } - assert!(try_decode(&compressed).is_err()); -} - -/// `maxbits == b` (index == 0). -#[test] -fn decode_exception_index_zero() { - let (mut compressed, _) = compressed_with_exceptions(); - let start = meta_byte_start(&compressed); - let bytes: &mut [u8] = bytemuck::cast_slice_mut(&mut compressed); - if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { - bytes[mb_off] = bytes[bb_off]; // maxbits == best_b → index 0 - } - assert!(try_decode(&compressed).is_err()); -} - -// ── index == 1 branch ───────────────────────────────────────────────────────── - -/// `index == 1` happy path: round-trips correctly. -#[test] -fn decode_index1_branch_valid() { - let (compressed, data) = compressed_with_index1_exceptions(); - let mut codec = FastPFOR::default(); - let mut out = vec![0u32; data.len() + 64]; - codec - .uncompress( - &compressed, - compressed.len() as u32, - &mut Cursor::new(0u32), - &mut out, - &mut Cursor::new(0u32), - ) - .expect("decompression of index-1 data must succeed"); - assert_eq!(&out[..data.len()], data.as_slice()); -} - -/// `index == 1`: exception position byte is missing (stream too short). -#[test] -fn decode_index1_pos_byte_missing() { - let (compressed, _) = compressed_with_index1_exceptions(); - assert!(try_decode(&compressed[..compressed.len() - 1]).is_err()); -} - -/// `index == 1`: exception position `>= block_size` (use 128-block codec -/// so a u8 value of 200 exceeds the block size of 128). 
-#[test] -fn decode_index1_pos_out_of_block() { - let mut data = vec![1u32; 128]; - data[0] = 3; // index == 1 - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let mut buf = vec![0u32; 1024]; - let mut out_off = Cursor::new(0u32); - codec - .compress( - &data, - data.len() as u32, - &mut Cursor::new(0u32), - &mut buf, - &mut out_off, - ) - .unwrap(); - buf.truncate(out_off.position() as usize); - - let start = meta_byte_start(&buf); - let bytes: &mut [u8] = bytemuck::cast_slice_mut(&mut buf); - if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { - if bytes[mb_off].wrapping_sub(bytes[bb_off]) == 1 && mb_off + 1 < bytes.len() { - bytes[mb_off + 1] = 200; // position 200 >= block_size 128 - } - } - let _ = bytes; - - let mut dec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let result = dec.uncompress( - &buf, - buf.len() as u32, - &mut Cursor::new(0u32), - &mut vec![0u32; 256], - &mut Cursor::new(0u32), - ); - assert!(result.is_err()); -} - -/// `index == 1`: output index out of bounds. -#[test] -fn decode_index1_output_out_of_bounds() { - let (compressed, _) = compressed_with_index1_exceptions(); - let mut codec = FastPFOR::default(); - let result = codec.uncompress( - &compressed, - compressed.len() as u32, - &mut Cursor::new(0u32), - &mut [0u32; 16], // too small - &mut Cursor::new(0u32), - ); - assert!(result.is_err()); -} - -// ── index > 1 branch ────────────────────────────────────────────────────────── - -/// `index > 1`: exception position byte is missing (stream too short). -#[test] -fn decode_exception_pos_byte_missing() { - let (compressed, _) = compressed_with_exceptions(); - assert!(try_decode(&compressed[..compressed.len() - 1]).is_err()); -} - -/// `index > 1`: exception position `>= block_size` (128-block codec, value 200). 
-#[test] -fn decode_exception_pos_out_of_block() { - let data: Vec = (0..128) - .map(|i| if i % 4 == 0 { 1u32 << 30 } else { 1 }) - .collect(); - let mut codec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let mut buf = vec![0u32; 2048]; - let mut out_off = Cursor::new(0u32); - codec - .compress( - &data, - data.len() as u32, - &mut Cursor::new(0u32), - &mut buf, - &mut out_off, - ) - .unwrap(); - buf.truncate(out_off.position() as usize); - - let start = meta_byte_start(&buf); - let bytes: &mut [u8] = bytemuck::cast_slice_mut(&mut buf); - if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { - if bytes[mb_off].wrapping_sub(bytes[bb_off]) > 1 && mb_off + 1 < bytes.len() { - bytes[mb_off + 1] = 200; // position 200 >= block_size 128 - } - } - let _ = bytes; - - let mut dec = FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128); - let result = dec.uncompress( - &buf, - buf.len() as u32, - &mut Cursor::new(0u32), - &mut vec![0u32; 256], - &mut Cursor::new(0u32), - ); - assert!(result.is_err()); -} - -/// `index > 1`: `data_to_be_packed[index]` was never populated because the -/// bitmap had no bit set for that bit-width, yet the block metadata claims -/// exceptions at that width. Before the fix this panicked with an -/// index-out-of-bounds on the empty `Vec`; after the fix it returns `Err`. -/// -/// The crafted stream has `bitmap=0` (no exception buffers filled) but block -/// metadata with `bits=1`, `cexcept=1`, `maxbits=3` → `index=2`, causing an -/// access into the empty `data_to_be_packed[2]`. -#[test] -fn decode_exception_unpopulated_data_to_be_packed() { - // meta_word encodes [bits=1, num_exceptions=1, maxbits=3, pos=0] in LE bytes. - // Layout: outlength=256, where_meta=9, 8 packed zero words (bits=1), - // bytesize=4, meta_word, bitmap=0. 
- let meta_word = u32::from_le_bytes([1, 1, 3, 0]); - let compressed: Vec = [ - 256u32, // outlength - 9, // where_meta - 0, 0, 0, 0, 0, 0, 0, 0, // 8 packed words (bits=1, all zeros) - 4, // bytesize = 4 bytes - meta_word, // block metadata: bits=1, cexcept=1, maxbits=3, pos=0 - 0, // bitmap=0: no exception bit-widths loaded into data_to_be_packed - ] - .into(); - assert!(try_decode(&compressed).is_err()); -} - -/// `index > 1`: output buffer too small (`out_idx` >= `output.len()`). -#[test] -fn decode_exception_output_out_of_bounds() { - let (compressed, _) = compressed_with_exceptions(); - let mut codec = FastPFOR::default(); - let result = codec.uncompress( - &compressed, - compressed.len() as u32, - &mut Cursor::new(0u32), - &mut [0u32; 32], // too small for a 256-block - &mut Cursor::new(0u32), - ); - assert!(result.is_err()); -} diff --git a/tests/encode_paths.rs b/tests/encode_paths.rs index cd8bb6e..b54a887 100644 --- a/tests/encode_paths.rs +++ b/tests/encode_paths.rs @@ -6,26 +6,37 @@ #![cfg(feature = "rust")] -use std::io::Cursor; +use std::mem::size_of; -use fastpfor::CodecToSlice; -use fastpfor::rust::{ - BLOCK_SIZE_128, BLOCK_SIZE_256, Codec, DEFAULT_PAGE_SIZE, FastPFOR, Skippable, VariableByte, +use fastpfor::{ + AnyLenCodec, BlockCodec, FastPFor128, FastPFor256, FastPForBlock256, JustCopy, VariableByte, + slice_to_blocks, }; +// ── helpers ─────────────────────────────────────────────────────────────────── -// ── helper ──────────────────────────────────────────────────────────────────── - -/// Compress then immediately decompress, asserting round-trip correctness. 
-fn roundtrip(mut codec: Codec, data: &[u32]) { - let mut compressed = vec![0u32; data.len() * 4 + 1024]; - let compressed = codec.compress_to_slice(data, &mut compressed).unwrap(); +fn roundtrip(codec: &mut C, data: &[u32]) { + let mut compressed = Vec::new(); + codec.encode(data, &mut compressed).unwrap(); + let mut decompressed = Vec::new(); + codec.decode(&compressed, &mut decompressed, None).unwrap(); + assert_eq!(decompressed, data); +} - let mut decompressed = vec![0u32; data.len() + 256]; - let decompressed = codec - .decompress_to_slice(compressed, &mut decompressed) +fn block_roundtrip(data: &[u32]) { + let mut codec = C::default(); + let (blocks, _) = slice_to_blocks::(data); + let mut compressed = Vec::new(); + codec.encode_blocks(blocks, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + let expected_values = blocks.len() * (size_of::() / 4); + codec + .decode_blocks( + &compressed, + Some(u32::try_from(expected_values).expect("expected_values fits in u32")), + &mut decoded, + ) .unwrap(); - - assert_eq!(decompressed, data); + assert_eq!(decoded, &data[..expected_values]); } // ── VariableByte round-trip ─────────────────────────────────────────────────── @@ -35,134 +46,102 @@ fn roundtrip(mut codec: Codec, data: &[u32]) { #[test] fn variable_byte_roundtrip_all_widths() { roundtrip( - Codec::from(VariableByte::new()), + &mut VariableByte::new(), &[1u32, 127, 128, 16383, 16384, u32::MAX], ); } -// ── VariableByte::headless_uncompress → Unimplemented ──────────────────────── - -/// `VariableByte::headless_uncompress` is intentionally unimplemented. -/// Calling it must return `Err(Unimplemented)`. 
#[test] -fn variable_byte_headless_uncompress_unimplemented() { - let result = VariableByte::new().headless_uncompress( - &[], - 0, - &mut Cursor::new(0u32), - &mut [], - &mut Cursor::new(0u32), - 0, - ); - assert!(result.is_err()); +fn variable_byte_roundtrip_empty() { + roundtrip(&mut VariableByte::new(), &[]); } -// ── OutputBufferTooSmall in fast-path decompression ─────────────────────────── +// ── JustCopy via AnyLenCodec ───────────────────────────────────────────────── -/// Compress enough integers so the fast path (`byte_pos + 10 <= byte_length`) -/// is entered, then decompress into a zero-capacity buffer so that -/// `tmp_outpos >= output.len()` fires in the fast-path loop. #[test] -fn variable_byte_uncompress_fast_path_output_too_small() { - // 20 small values → 20 compressed bytes, well above the 10-byte fast-path threshold. - let data: Vec = (1..=20).collect(); - let mut compressed = vec![0u32; 64]; - let compressed = Codec::from(VariableByte::new()) - .compress_to_slice(&data, &mut compressed) - .unwrap(); - - let mut out: Vec = vec![]; // zero capacity → error on first decoded value - let result = Codec::from(VariableByte::new()).decompress_to_slice(compressed, &mut out); - assert!(result.is_err()); +fn justcopy_roundtrip() { + roundtrip(&mut JustCopy::new(), &[1u32, 2, 3, 42, u32::MAX]); } -// ── OutputBufferTooSmall in slow-path decompression ─────────────────────────── +// ── BlockCodec: FastPForBlock256 — block-exact input ─────────────────────────────── -/// Compress only 2 integers (2 compressed bytes < 10), so only the slow path -/// runs, then decompress into a zero-capacity buffer so that -/// `tmp_outpos >= output.len()` fires in the slow-path loop. 
#[test] -fn variable_byte_uncompress_slow_path_output_too_small() { - let data = vec![1u32, 2u32]; // 2 bytes total → slow path only - let mut compressed = vec![0u32; 16]; - let compressed = Codec::from(VariableByte::new()) - .compress_to_slice(&data, &mut compressed) - .unwrap(); - - let mut out: Vec = vec![]; - let result = Codec::from(VariableByte::new()).decompress_to_slice(compressed, &mut out); - assert!(result.is_err()); +fn fastpfor256_block_roundtrip() { + block_roundtrip::(&(0u32..512).collect::>()); } -// ── FastPFOR encoding: multi-page path ─────────────────────────────────────── +// ── CompositeCodec (FastPForBlock256 + VByte) ───────────────────────────────────── -/// Compressing more than `page_size` integers causes `headless_compress` to -/// loop more than once, exercising the `this_size == page_size` branch. +/// Compressing more than the default page size (65536) causes `compress_blocks` +/// to loop more than once, exercising the `this_size == page_size` branch. #[test] fn fastpfor_multi_page_encode_decode() { - let n = DEFAULT_PAGE_SIZE.get() as usize + BLOCK_SIZE_256.get() as usize; - let data: Vec = (0..n as u32).map(|i| i % 1024).collect(); - roundtrip(Codec::from(FastPFOR::default()), &data); + // 65536 (default page size) + 256 (one block) — enough to span two pages + let data: Vec = (0..65792u32).map(|i| i % 1024).collect(); + roundtrip(&mut FastPFor256::default(), &data); } -// ── FastPFOR encoding: all-zero block (b=0, no packing) ────────────────────── - /// A block of all zeros causes `best_b_from_data` to decrement `optimal_bits` /// all the way to 0 — no packed words are written. 
#[test] fn fastpfor_encode_all_zeros() { - roundtrip(Codec::from(FastPFOR::default()), &vec![0u32; 256]); + roundtrip(&mut FastPFor256::default(), &vec![0u32; 256]); } -// ── FastPFOR encoding: bytes_container already 4-byte aligned ──────────────── - -/// When the metadata byte count is already a multiple of 4 the padding -/// `while (bytes_container.len() & 3) != 0` loop runs zero iterations. -/// 128 blocks × 2 metadata bytes each = 256 bytes ≡ 0 (mod 4). +/// When the metadata byte count is already a multiple of 4 the padding loop +/// runs zero iterations. #[test] fn fastpfor_encode_metadata_already_aligned() { let data = vec![0u32; 32768]; // 128 blocks of 256 zeros - roundtrip( - Codec::from(FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_256)), - &data, - ); + roundtrip(&mut FastPFor256::default(), &data); } -// ── FastPFOR encoding: best_b stays at 32 ──────────────────────────────────── - -/// When every value needs all 32 bits, `best_b_from_data` keeps `optimal_bits` -/// at 32 because no lower bit-width reduces the total cost. +/// When every value needs all 32 bits. #[test] fn fastpfor_encode_all_max_u32() { - roundtrip(Codec::from(FastPFOR::default()), &vec![u32::MAX; 256]); + roundtrip(&mut FastPFor256::default(), &vec![u32::MAX; 256]); } -// ── FastPFOR encoding: exception index == 1 ────────────────────────────────── - -/// When `max_bits - optimal_bits == 1` the cost formula applies a discount -/// (`thiscost -= cexcept`). This exercises the `if self.max_bits - b == 1` -/// branch in `best_b_from_data`. +/// Exception index == 1 branch. #[test] fn fastpfor_encode_exception_index1() { - // Almost all values fit in 1 bit; two need exactly 2 bits. - // Encoder picks optimal_bits=1, max_bits=2, index=1. 
let mut data = vec![1u32; 256]; data[0] = 3; data[128] = 3; - roundtrip(Codec::from(FastPFOR::default()), &data); + roundtrip(&mut FastPFor256::default(), &data); } -// ── FastPFOR encoding: 128-element block size ───────────────────────────────── - -/// `BLOCK_SIZE_128` uses different inner-loop bounds in `encode_page`; verify -/// it compresses and decompresses correctly with exceptions present. +/// 128-element block size with exceptions. #[test] fn fastpfor_encode_128_block_with_exceptions() { let data: Vec = (0..128) .map(|i| if i % 4 == 0 { 1u32 << 28 } else { 1 }) .collect(); - roundtrip( - Codec::from(FastPFOR::new(DEFAULT_PAGE_SIZE, BLOCK_SIZE_128)), - &data, - ); + roundtrip(&mut FastPFor128::default(), &data); +} + +// ── VariableByte AnyLenCodec edge cases ────────────────────────────────────── + +/// Decompressing an empty stream succeeds with empty output. +#[test] +fn variable_byte_anylen_decompress_short_input() { + let mut codec = VariableByte::new(); + let mut out = Vec::new(); + let result = codec.decode(&[], &mut out, None); + assert!(result.is_ok()); + assert!(out.is_empty()); +} + +/// Decompressing into a `Vec` that starts empty is fine — it grows as needed. 
+#[test] +fn variable_byte_anylen_decompress_into_small_vec() { + let data: Vec = (1..=20).collect(); + let mut compressed = Vec::new(); + VariableByte::new().encode(&data, &mut compressed).unwrap(); + + let mut out = Vec::new(); + VariableByte::new() + .decode(&compressed, &mut out, None) + .unwrap(); + assert_eq!(out, data); } From 48062b9cdd0590237f9fdfc2be2e0837da82fb01 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 17:11:26 -0400 Subject: [PATCH 02/26] results --- fuzz/fuzz_targets/common.rs | 8 +- src/codec.rs | 24 +++--- src/cpp/codecs.rs | 14 ++-- src/cpp/tests.rs | 80 ++++++++----------- src/cpp/wrappers.rs | 12 +-- src/error.rs | 3 + src/helpers.rs | 12 +-- src/lib.rs | 2 +- src/rust/composite.rs | 7 +- src/rust/integer_compression/fastpfor.rs | 20 ++--- src/rust/integer_compression/just_copy.rs | 7 +- src/rust/integer_compression/variable_byte.rs | 14 ++-- 12 files changed, 94 insertions(+), 109 deletions(-) diff --git a/fuzz/fuzz_targets/common.rs b/fuzz/fuzz_targets/common.rs index 9d76b9d..a8acbac 100644 --- a/fuzz/fuzz_targets/common.rs +++ b/fuzz/fuzz_targets/common.rs @@ -151,10 +151,10 @@ pub fn resolve_encode_compare_pair(idx: u8) -> Option { let pairs = ENCODE_COMPARE_PAIRS; let i = idx as usize % pairs.len(); let pair = pairs[i]; - if let Some(ref f) = filter { - if !f.eq_ignore_ascii_case(pair.name) { - return None; - } + if let Some(ref f) = filter + && !f.eq_ignore_ascii_case(pair.name) + { + return None; } Some(pair) } diff --git a/src/codec.rs b/src/codec.rs index 1a021e6..e23223a 100644 --- a/src/codec.rs +++ b/src/codec.rs @@ -1,6 +1,6 @@ use bytemuck::{Pod, cast_slice}; -use crate::FastPForError; +use crate::FastPForResult; /// Internal default for max decompressed length. Used by trait defaults and C++ FFI. 
#[inline] @@ -27,9 +27,9 @@ pub(crate) fn default_max_decoded_len(compressed_words: usize) -> usize { /// impl BlockCodec for MyCodec { /// type Block = [u32; 256]; /// fn encode_blocks(&self, blocks: &[[u32; 256]], out: &mut Vec) -/// -> Result<(), FastPForError> { ... } +/// -> FastPForResult<()> { ... } /// fn decode_blocks(&self, input: &[u32], expected_len: Option, -/// out: &mut Vec) -> Result { ... } +/// out: &mut Vec) -> FastPForResult { ... } /// } /// ``` pub trait BlockCodec { @@ -54,16 +54,12 @@ pub trait BlockCodec { /// /// No remainder is possible — the caller must split the input first using /// [`slice_to_blocks`] and handle any remainder separately. - fn encode_blocks( - &mut self, - blocks: &[Self::Block], - out: &mut Vec, - ) -> Result<(), FastPForError>; + fn encode_blocks(&mut self, blocks: &[Self::Block], out: &mut Vec) -> FastPForResult<()>; /// Decompress blocks from `input`, using the length stored in the header. /// /// Returns the number of input `u32` words consumed, so the caller (e.g. - /// [`CompositeCodec`]) can locate the tail without parsing the block format. + /// [`CompositeCodec`](crate::CompositeCodec)) can locate the tail without parsing the block format. /// /// When `expected_len` is `Some(n)`: /// - Validates that the header value equals `n` (must be a multiple of @@ -78,7 +74,7 @@ pub trait BlockCodec { input: &[u32], expected_len: Option, out: &mut Vec, - ) -> Result; + ) -> FastPForResult; /// Maximum decompressed element count for a given compressed input length. /// Reject `expected_len` values exceeding this to avoid allocation from bad data. @@ -103,9 +99,9 @@ pub trait BlockCodec { #[cfg(feature = "cpp")] pub trait BlockCodec64 { /// Compress 64-bit integers into a 32-bit word stream. - fn encode64(&mut self, input: &[u64], out: &mut Vec) -> Result<(), FastPForError>; + fn encode64(&mut self, input: &[u64], out: &mut Vec) -> FastPForResult<()>; /// Decompress 64-bit integers from a 32-bit word stream. 
- fn decode64(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError>; + fn decode64(&mut self, input: &[u32], out: &mut Vec) -> FastPForResult<()>; } /// Compresses and decompresses an arbitrary-length `&[u32]` slice. @@ -116,7 +112,7 @@ pub trait BlockCodec64 { /// to produce an `AnyLenCodec`. pub trait AnyLenCodec { /// Compress an arbitrary-length slice of `u32` values. - fn encode(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError>; + fn encode(&mut self, input: &[u32], out: &mut Vec) -> FastPForResult<()>; /// Maximum decompressed element count for a given compressed input length. /// Reject `expected_len` values exceeding this to avoid allocation from bad data. @@ -143,7 +139,7 @@ pub trait AnyLenCodec { input: &[u32], out: &mut Vec, expected_len: Option, - ) -> Result<(), FastPForError>; + ) -> FastPForResult<()>; } /// Split a flat `&[u32]` into `(&[Blocks::Block], &[u32])` without copying. diff --git a/src/cpp/codecs.rs b/src/cpp/codecs.rs index 0c8af9d..2def24a 100644 --- a/src/cpp/codecs.rs +++ b/src/cpp/codecs.rs @@ -1,6 +1,6 @@ use cxx::UniquePtr; -use crate::FastPForError; +use crate::FastPForResult; use crate::codec::{AnyLenCodec, BlockCodec64}; use crate::cpp::ffi; use crate::cpp::wrappers::{ @@ -12,7 +12,7 @@ use crate::cpp::wrappers::{ // Single macro: all C++ codecs implement AnyLenCodec. Codecs marked with `@ 64` // also implement BlockCodec64 for 64-bit integer support. -/// Macro for C++ codec wrappers: struct + Default + `AnyLenCodec`. +/// Macro for C++ codec wrappers: struct + Default + [`AnyLenCodec`]. macro_rules! implement_cpp_codecs { ($( $(#[$($attrs:tt)*])* @@ -37,7 +37,7 @@ macro_rules! implement_cpp_codecs { } impl AnyLenCodec for $name { - fn encode(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError> { + fn encode(&mut self, input: &[u32], out: &mut Vec) -> FastPForResult<()> { encode32_to_vec_ffi(&self.0, input, out) } @@ -46,7 +46,7 @@ macro_rules! 
implement_cpp_codecs { input: &[u32], out: &mut Vec, expected_len: Option, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { decode32_anylen_ffi(&self.0, input, out, expected_len) } } @@ -139,7 +139,7 @@ implement_cpp_codecs! { // CppSnappy => snappy_codec, // Conditional with #ifdef - /// [`StreamVByte`](https://github.com/lemire/streamvbyte) encoding for fast variable-byte compression. + /// [`CppStreamVByte`](https://github.com/lemire/streamvbyte) encoding for fast variable-byte compression. CppStreamVByte => streamvbyte_codec, /// Standard variable-byte encoding. @@ -161,10 +161,10 @@ macro_rules! implement_cpp_codecs_64 { ($($name:ident => $ffi:ident ,)*) => { $( impl BlockCodec64 for $name { - fn encode64(&mut self, input: &[u64], out: &mut Vec) -> Result<(), FastPForError> { + fn encode64(&mut self, input: &[u64], out: &mut Vec) -> FastPForResult<()> { encode64_to_vec_ffi(&self.0, input, out) } - fn decode64(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError> { + fn decode64(&mut self, input: &[u32], out: &mut Vec) -> FastPForResult<()> { decode64_to_vec_ffi(&self.0, input, out) } } diff --git a/src/cpp/tests.rs b/src/cpp/tests.rs index ce29230..4845c0a 100644 --- a/src/cpp/tests.rs +++ b/src/cpp/tests.rs @@ -1,31 +1,31 @@ use crate::cpp::codecs::tests::roundtrip_32; -// Test all codecs compile and do a basic 32-bit roundtrip +/// Test all codecs compile and do a basic 32-bit roundtrip macro_rules! test_anylen { - ($($name:ident),*) => { - $( - #[test] - #[allow(non_snake_case)] - fn $name() { - roundtrip_32(&mut $crate::cpp::$name::new(), &[1u32, 2, 3, 4, 5]); - } - )* - }; - } + ($($name:ident),* $(,)?) 
=> { + $( + #[test] + #[allow(non_snake_case)] + fn $name() { + roundtrip_32(&mut $crate::cpp::$name::new(), &[1u32, 2, 3, 4, 5]); + } + )* + }; +} test_anylen!( CppBP32, CppCopy, - CppFastBinaryPacking8, CppFastBinaryPacking16, CppFastBinaryPacking32, + CppFastBinaryPacking8, CppFastPFor128, CppFastPFor256, CppMaskedVByte, CppNewPFor, CppOptPFor, - CppPFor2008, CppPFor, + CppPFor2008, CppSimdBinaryPacking, CppSimdFastPFor128, CppSimdFastPFor256, @@ -39,13 +39,13 @@ test_anylen!( CppStreamVByte, CppVByte, CppVarInt, - CppVarIntGb + CppVarIntGb, ); -// Simple-9/16/8b codecs require values that fit in small bit widths and a -// block-aligned count; test them separately with 128 small values. +/// Simple-9/16/8b codecs require values that fit in small bit widths and a +/// block-aligned count; test them separately with 128 small values. macro_rules! test_anylen_128 { - ($($name:ident),*) => { + ($($name:ident),* $(,)?) => { $( #[test] #[allow(non_snake_case)] @@ -57,45 +57,37 @@ macro_rules! test_anylen_128 { }; } -// Note: CppSimple9Rle crashes with heap corruption on various inputs; skip everywhere. -test_anylen_128!(CppSimple16, CppSimple8b, CppSimple9); - -// CppSimple8bRle reinterpret-casts uint32_t* → uint64_t* inside the C++ header, -// which is UB on strict-alignment architectures (ARM64 requires 8-byte alignment -// for 64-bit loads/stores and will SIGSEGV on unaligned access). The codec is -// otherwise correct on x86/x86_64 where unaligned access is handled in hardware. -// Tracked upstream; skip on aarch64 until fixed in the submodule. -// #[cfg(not(target_arch = "aarch64"))] -test_anylen_128!(CppSimple8bRle); +// Note: Simple9Rle crashes with heap corruption on various inputs; skip everywhere. +test_anylen_128!(CppSimple16, CppSimple8b, CppSimple9, CppSimple8bRle); // Verify Default impl routes through new() for all generated codec types. macro_rules! 
test_default { - ($($name:ident),*) => { - $( - #[test] - #[allow(non_snake_case)] - fn $name() { - let _codec = $crate::cpp::$name::default(); - } - )* - }; - } + ($($name:ident),* $(,)?) => { + $( + #[test] + #[allow(non_snake_case)] + fn $name() { + let _codec = $crate::cpp::$name::default(); + } + )* + }; +} -// Use a distinct prefix to avoid name collisions with test_anylen tests. +/// Use a distinct prefix to avoid name collisions with `test_anylen` tests. mod default_impls { test_default!( CppBP32, CppCopy, - CppFastBinaryPacking8, CppFastBinaryPacking16, CppFastBinaryPacking32, + CppFastBinaryPacking8, CppFastPFor128, CppFastPFor256, CppMaskedVByte, CppNewPFor, CppOptPFor, - CppPFor2008, CppPFor, + CppPFor2008, CppSimdBinaryPacking, CppSimdFastPFor128, CppSimdFastPFor256, @@ -109,15 +101,11 @@ mod default_impls { CppSimple8b, CppSimple8bRle, CppSimple9, + CppSimple9Rle, CppSimplePFor, CppStreamVByte, CppVByte, CppVarInt, - CppVarIntGb + CppVarIntGb, ); } - -mod default_impls2 { - // #[cfg(not(target_arch = "aarch64"))] - test_default!(CppSimple9Rle); -} diff --git a/src/cpp/wrappers.rs b/src/cpp/wrappers.rs index 7d600a3..90a58ad 100644 --- a/src/cpp/wrappers.rs +++ b/src/cpp/wrappers.rs @@ -1,6 +1,6 @@ use cxx::UniquePtr; -use crate::FastPForError; +use crate::FastPForResult; use crate::codec::default_max_decoded_len; use crate::cpp::ffi; use crate::helpers::AsUsize; @@ -14,7 +14,7 @@ pub fn encode32_to_vec_ffi( codec: &UniquePtr, input: &[u32], out: &mut Vec, -) -> Result<(), FastPForError> { +) -> FastPForResult<()> { let capacity = input.len() * 2 + 1024; let start = out.len(); out.resize(start + capacity, 0); @@ -28,7 +28,7 @@ fn decode32_to_vec_ffi( input: &[u32], out: &mut Vec, capacity: usize, -) -> Result<(), FastPForError> { +) -> FastPForResult<()> { if !input.is_empty() { let start = out.len(); out.resize(start + capacity, 0); @@ -43,7 +43,7 @@ pub fn decode32_anylen_ffi( input: &[u32], out: &mut Vec, expected_len: Option, -) -> Result<(), 
FastPForError> { +) -> FastPForResult<()> { let max = default_max_decoded_len(input.len()); let capacity = if let Some(n) = expected_len { n.is_valid_expected(max)? @@ -64,7 +64,7 @@ pub fn encode64_to_vec_ffi( codec: &UniquePtr, input: &[u64], out: &mut Vec, -) -> Result<(), FastPForError> { +) -> FastPForResult<()> { let capacity = input.len() * 3 + 1024; let start = out.len(); out.resize(start + capacity, 0); @@ -77,7 +77,7 @@ pub fn decode64_to_vec_ffi( codec: &UniquePtr, input: &[u32], out: &mut Vec, -) -> Result<(), FastPForError> { +) -> FastPForResult<()> { if !input.is_empty() { // C++ decodeArray needs output buffer. Variable-byte can pack multiple values per word. let capacity = input.len().saturating_mul(4); diff --git a/src/error.rs b/src/error.rs index d97b2e6..449c6c6 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,5 +1,8 @@ use thiserror::Error; +/// Alias for the result type of `FastPFor` operations. +pub type FastPForResult = Result; + /// Errors that can occur when using the `FastPFor` codecs. #[non_exhaustive] #[derive(Error, Debug)] diff --git a/src/helpers.rs b/src/helpers.rs index 7acf2a4..cf0d438 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -1,4 +1,4 @@ -use crate::FastPForError; +use crate::{FastPForError, FastPForResult}; /// Finds the greatest multiple of `factor` that is less than or equal to `value`. #[cfg_attr(feature = "cpp", allow(dead_code))] @@ -17,7 +17,7 @@ pub trait AsUsize: Eq + Copy { fn as_usize(self) -> usize; #[inline] - fn is_decoded_mismatch(self, expected: impl AsUsize) -> Result<(), FastPForError> { + fn is_decoded_mismatch(self, expected: impl AsUsize) -> FastPForResult<()> { let actual = self.as_usize(); let expected = expected.as_usize(); if self.as_usize() == expected { @@ -29,7 +29,7 @@ pub trait AsUsize: Eq + Copy { /// Returns an error if `expected` exceeds `max`. 
#[inline] - fn is_valid_expected(self, max: impl AsUsize) -> Result { + fn is_valid_expected(self, max: impl AsUsize) -> FastPForResult { let expected = self.as_usize(); let max = max.as_usize(); if expected > max { @@ -67,12 +67,12 @@ impl AsUsize for u32 { #[cfg_attr(feature = "cpp", allow(dead_code))] pub trait GetWithErr { - fn get_val(&self, pos: impl AsUsize) -> Result; + fn get_val(&self, pos: impl AsUsize) -> FastPForResult; } impl GetWithErr for &[T] { #[inline] - fn get_val(&self, pos: impl AsUsize) -> Result { + fn get_val(&self, pos: impl AsUsize) -> FastPForResult { self.get(pos.as_usize()) .copied() .ok_or(FastPForError::NotEnoughData) @@ -81,7 +81,7 @@ impl GetWithErr for &[T] { impl GetWithErr for Vec { #[inline] - fn get_val(&self, pos: impl AsUsize) -> Result { + fn get_val(&self, pos: impl AsUsize) -> FastPForResult { self.as_slice().get_val(pos) } } diff --git a/src/lib.rs b/src/lib.rs index d4a71e4..22bf842 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,7 +7,7 @@ compile_error!("At least one of the features 'cpp' or 'rust' must be enabled"); // Error types are always available regardless of which codec features are enabled. mod error; -pub use error::FastPForError; +pub use error::{FastPForError, FastPForResult}; #[cfg(feature = "cpp")] /// Rust wrapper for the [`FastPFOR` C++ library](https://github.com/fast-pack/FastPFor) diff --git a/src/rust/composite.rs b/src/rust/composite.rs index 1e2e280..2225d80 100644 --- a/src/rust/composite.rs +++ b/src/rust/composite.rs @@ -3,7 +3,7 @@ //! //! Rust-only: combines Rust block codecs with Rust tail codecs. Do not wrap C++ codecs. 
-use crate::FastPForError; +use crate::FastPForResult; use crate::codec::{AnyLenCodec, BlockCodec, slice_to_blocks}; use crate::helpers::AsUsize; @@ -63,7 +63,7 @@ impl CompositeCodec { } impl AnyLenCodec for CompositeCodec { - fn encode(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError> { + fn encode(&mut self, input: &[u32], out: &mut Vec) -> FastPForResult<()> { let (blocks, remainder) = slice_to_blocks::(input); // C++ CompositeCodec: concatenate block + tail. Block codec writes length header (0 when empty). self.block.encode_blocks(blocks, out)?; @@ -76,7 +76,7 @@ impl AnyLenCodec for CompositeCodec, expected_len: Option, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { let start_len = out.len(); let max = Self::max_decompressed_len(input.len()); @@ -112,6 +112,7 @@ impl AnyLenCodec for CompositeCodec(codec: &mut C, data: &[u32]) { diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index cf5d216..cbc3ad9 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -7,7 +7,7 @@ use bytes::{Buf as _, BufMut as _, BytesMut}; use crate::helpers::{AsUsize, GetWithErr, bits, greatest_multiple}; use crate::rust::cursor::IncrementCursor; use crate::rust::integer_compression::{bitpacking, bitunpacking}; -use crate::{BlockCodec, FastPForError}; +use crate::{BlockCodec, FastPForError, FastPForResult}; /// Overhead cost (in bits) for storing each exception's position in the block const OVERHEAD_OF_EACH_EXCEPT: u32 = 8; @@ -75,7 +75,7 @@ impl FastPFor<128> { /// /// Returns an error if `page_size` is not a multiple of 128. /// Use [`Default`] for the default page size. - pub fn new(page_size: u32) -> Result { + pub fn new(page_size: u32) -> FastPForResult { Self::create(page_size) } } @@ -85,13 +85,13 @@ impl FastPFor<256> { /// /// Returns an error if `page_size` is not a multiple of 256. /// Use [`Default`] for the default page size. 
- pub fn new(page_size: u32) -> Result { + pub fn new(page_size: u32) -> FastPForResult { Self::create(page_size) } } impl FastPFor { - fn create(page_size: u32) -> Result { + fn create(page_size: u32) -> FastPForResult { if page_size % N as u32 != 0 { return Err(FastPForError::InvalidPageSize { page_size, @@ -136,7 +136,7 @@ impl FastPFor { input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { let mynvalue = greatest_multiple(inlength, N as u32); let final_out = output_offset.position() as u32 + mynvalue; while output_offset.position() as u32 != final_out { @@ -317,7 +317,7 @@ impl FastPFor { output: &mut [u32], output_offset: &mut Cursor, this_size: u32, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { let n = u32::try_from(input.len()) .map_err(|_| FastPForError::InvalidInputLength(input.len()))?; @@ -504,11 +504,7 @@ where { type Block = [u32; N]; - fn encode_blocks( - &mut self, - blocks: &[[u32; N]], - out: &mut Vec, - ) -> Result<(), FastPForError> { + fn encode_blocks(&mut self, blocks: &[[u32; N]], out: &mut Vec) -> FastPForResult<()> { let n_values = (blocks.len() * N) as u32; if blocks.is_empty() { out.push(n_values); @@ -544,7 +540,7 @@ where input: &[u32], expected_len: Option, out: &mut Vec, - ) -> Result { + ) -> FastPForResult { let Some((&block_n_values, rest)) = input.split_first() else { return Err(FastPForError::NotEnoughData); }; diff --git a/src/rust/integer_compression/just_copy.rs b/src/rust/integer_compression/just_copy.rs index 55e996d..228b9b3 100644 --- a/src/rust/integer_compression/just_copy.rs +++ b/src/rust/integer_compression/just_copy.rs @@ -1,4 +1,4 @@ -use crate::FastPForError; +use crate::FastPForResult; use crate::codec::AnyLenCodec; use crate::helpers::AsUsize; @@ -23,7 +23,7 @@ impl Default for JustCopy { } impl AnyLenCodec for JustCopy { - fn encode(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError> { + fn 
encode(&mut self, input: &[u32], out: &mut Vec) -> FastPForResult<()> { out.extend_from_slice(input); Ok(()) } @@ -33,7 +33,7 @@ impl AnyLenCodec for JustCopy { input: &[u32], out: &mut Vec, expected_len: Option, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { if let Some(expected) = expected_len { let expected = expected.is_valid_expected(Self::max_decompressed_len(input.len()))?; input.len().is_decoded_mismatch(expected)?; @@ -46,6 +46,7 @@ impl AnyLenCodec for JustCopy { #[cfg(test)] mod tests { use super::*; + use crate::FastPForError; #[test] fn justcopy_default_and_roundtrip() { diff --git a/src/rust/integer_compression/variable_byte.rs b/src/rust/integer_compression/variable_byte.rs index 345d2c9..020697a 100644 --- a/src/rust/integer_compression/variable_byte.rs +++ b/src/rust/integer_compression/variable_byte.rs @@ -2,10 +2,10 @@ use std::io::Cursor; use bytemuck::{cast_slice, cast_slice_mut}; -use crate::FastPForError; use crate::codec::AnyLenCodec; use crate::helpers::AsUsize; use crate::rust::cursor::IncrementCursor; +use crate::{FastPForError, FastPForResult}; /// Variable-byte encoding codec for integer compression. 
#[derive(Debug)] @@ -42,7 +42,7 @@ impl VariableByte { input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { if input_length == 0 { return Ok(()); } @@ -102,7 +102,7 @@ impl VariableByte { input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { if input_length == 0 { return Ok(()); } @@ -193,7 +193,7 @@ impl VariableByte { input_offset: &mut Cursor, output: &mut [i8], output_offset: &mut Cursor, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { if input_length == 0 { return Ok(()); } @@ -252,7 +252,7 @@ impl VariableByte { input_offset: &mut Cursor, output: &mut [u32], output_offset: &mut Cursor, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { let mut p = input_offset.position() as u32; let final_p = input_offset.position() as u32 + input_length; let mut tmp_outpos = output_offset.position(); @@ -308,7 +308,7 @@ impl Default for VariableByte { } impl AnyLenCodec for VariableByte { - fn encode(&mut self, input: &[u32], out: &mut Vec) -> Result<(), FastPForError> { + fn encode(&mut self, input: &[u32], out: &mut Vec) -> FastPForResult<()> { let capacity = input.len() * 2 + 4; let start = out.len(); out.resize(start + capacity, 0); @@ -331,7 +331,7 @@ impl AnyLenCodec for VariableByte { input: &[u32], out: &mut Vec, expected_len: Option, - ) -> Result<(), FastPForError> { + ) -> FastPForResult<()> { let capacity = if let Some(expected) = expected_len { expected.is_valid_expected(Self::max_decompressed_len(input.len()))? 
} else { From fb439e6c60f2d8227ef42d5c809f21b35c023666 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 17:49:01 -0400 Subject: [PATCH 03/26] fix docs --- .github/workflows/ci.yml | 73 +--------------------------------------- src/codec.rs | 2 +- src/rust/composite.rs | 4 +-- 3 files changed, 4 insertions(+), 75 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0794361..376aaaf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,78 +31,7 @@ jobs: prefix-key: "v0-${{ matrix.simd_mode }}" - uses: taiki-e/install-action@v2 with: { tool: 'just,cargo-binstall' } - # Enable core dumps so SIGSEGV crashes produce a dump for post-mortem analysis. - # ulimit is per-process, so it must be set in the same shell that runs the tests. - - name: Run tests (with core dumps enabled) - run: | - if [[ "$RUNNER_OS" == "macOS" ]]; then - sudo sysctl -w kern.coredump=1 - sudo sysctl -w kern.corefile='/cores/core.%N.%P' - sudo mkdir -p /cores && sudo chmod 1777 /cores - else - sudo sysctl -w kernel.core_pattern='/tmp/cores/core.%e.%p' - sudo mkdir -p /tmp/cores && sudo chmod 1777 /tmp/cores - fi - ulimit -c unlimited - just ci-test - # On failure, extract a minimal text report from any core dump (backtrace, file info). - # We deliberately avoid uploading the raw core dump (can be ~1.5 GB). - - name: Collect crash report - if: failure() - run: | - mkdir -p crash-report - CORE_DIR=$([[ "$RUNNER_OS" == "macOS" ]] && echo /cores || echo /tmp/cores) - for core in "$CORE_DIR"/core.* ; do - [[ -f "$core" ]] || continue - report="crash-report/$(basename "$core").txt" - { - echo "=== Core dump ===" - echo "Path: $core" - echo "File: $(file "$core")" - echo "" - if [[ "$RUNNER_OS" == "macOS" ]]; then - bin_name=$(basename "$core" | sed -E 's/core\.([^.]+)\.[0-9]+/\1/') - bin=$(find target -path '*/deps/*' -name "${bin_name}*" -type f -perm /111 ! 
-name '*.d' 2>/dev/null | head -1) - [[ -z "$bin" ]] && bin=$(find target/debug -maxdepth 2 -name "${bin_name}*" -type f -perm /111 2>/dev/null | head -1) - echo "Binary: ${bin:-not found}" - echo "" - echo "=== Backtrace ===" - if [[ -n "$bin" && -x "$bin" ]]; then - lldb --no-lldbinit --batch \ - -o "target create --core '$core' '$bin'" \ - -o "thread backtrace all" \ - -o quit 2>&1 - else - echo "(trying lldb with core only)" - lldb --no-lldbinit --batch \ - -o "target create --core '$core'" \ - -o "thread backtrace all" \ - -o quit 2>&1 - fi - else - bin=$(file "$core" | sed -n "s/.*from '\([^']*\)'.*/\1/p") - [[ -z "$bin" ]] && bin=$(file "$core" | grep -oE "execfn: '[^']+'" | cut -d"'" -f2) - echo "Binary: ${bin:-not found}" - echo "" - echo "=== Backtrace ===" - if [[ -n "$bin" && -x "$bin" ]]; then - gdb -batch -ex "thread apply all bt full" "$bin" "$core" 2>&1 - else - echo "(trying gdb with core only)" - gdb -batch -ex "core-file $core" -ex "thread apply all bt" 2>&1 - fi - fi - } > "$report" 2>&1 - done - echo "=== crash-report ===" && ls -lh crash-report/ || true - shopt -s nullglob; reports=(crash-report/*.txt) - if [[ ${#reports[@]} -gt 0 ]]; then cat "${reports[@]}"; else echo "(no core dumps found)"; fi - - uses: actions/upload-artifact@v7 - if: failure() - with: - name: crash-report-${{ matrix.os }}-${{ matrix.simd_mode }}-${{ github.sha }} - path: crash-report/ - if-no-files-found: warn + - run: just ci-test test-nightly: name: Nightly-specific tests diff --git a/src/codec.rs b/src/codec.rs index 9e3acac..53fc425 100644 --- a/src/codec.rs +++ b/src/codec.rs @@ -155,7 +155,7 @@ pub trait AnyLenCodec { /// /// # Example /// -/// ```ignore +/// ``` /// # use fastpfor::{slice_to_blocks, FastPForBlock256}; /// let data: Vec = (0..600).collect(); // 2 × 256 + 88 remainder /// let (blocks, remainder) = slice_to_blocks::(&data); diff --git a/src/rust/composite.rs b/src/rust/composite.rs index 2225d80..a633694 100644 --- a/src/rust/composite.rs +++ 
b/src/rust/composite.rs @@ -27,11 +27,11 @@ use crate::helpers::AsUsize; /// /// # Example /// -/// ```rust,ignore +/// ``` /// use fastpfor::{AnyLenCodec, FastPFor256}; /// /// let data: Vec = (0..600).collect(); // 2 × 256 + 88 remainder -/// let codec = FastPFor256::default(); +/// let mut codec = FastPFor256::default(); /// /// let mut encoded = Vec::new(); /// codec.encode(&data, &mut encoded).unwrap(); From 80dd094a46537f3707ede0e2cd4a26a366e51d08 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 18:02:30 -0400 Subject: [PATCH 04/26] cleanup --- benches/bench_utils.rs | 9 +++++---- fuzz/fuzz_targets/common.rs | 9 +++++---- src/rust/integer_compression/fastpfor.rs | 9 ++++----- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/benches/bench_utils.rs b/benches/bench_utils.rs index a60a251..2cb2b2c 100644 --- a/benches/bench_utils.rs +++ b/benches/bench_utils.rs @@ -10,6 +10,7 @@ #![allow(missing_docs)] use core::ops::Range; +use std::marker::PhantomData; #[allow(unused_imports)] use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; @@ -180,7 +181,7 @@ pub struct CompressFixture { pub compressed: Vec, /// Number of blocks in `data`. 
pub n_blocks: usize, - _codec: std::marker::PhantomData, + _codec: PhantomData, } impl CompressFixture { @@ -194,7 +195,7 @@ impl CompressFixture { data, compressed, n_blocks: block_count, - _codec: std::marker::PhantomData, + _codec: PhantomData, } } } @@ -228,7 +229,7 @@ pub struct BlockSizeFixture { pub data: Vec, pub compressed: Vec, pub n_blocks: usize, - _codec: std::marker::PhantomData, + _codec: PhantomData, } impl BlockSizeFixture { @@ -240,7 +241,7 @@ impl BlockSizeFixture { data, compressed, n_blocks: block_count, - _codec: std::marker::PhantomData, + _codec: PhantomData, } } } diff --git a/fuzz/fuzz_targets/common.rs b/fuzz/fuzz_targets/common.rs index a8acbac..9dd2d05 100644 --- a/fuzz/fuzz_targets/common.rs +++ b/fuzz/fuzz_targets/common.rs @@ -11,15 +11,20 @@ pub struct HexSlice<'a>(pub &'a [u32]); impl std::fmt::Debug for HexSlice<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { const MAX: usize = 20; + let total = self.0.len(); let shown = total.min(MAX); + let mut list = f.debug_list(); + for v in &self.0[..shown] { list.entry(&format_args!("{v:#010x}")); } + if total > MAX { list.entry(&format_args!(".. out of {total} total")); } + list.finish() } } @@ -33,12 +38,8 @@ pub struct FuzzInput { pub type AnyLen = Box; -// ── List entry type ─────────────────────────────────────────────────────────── - pub type CodecEntry = (&'static str, fn() -> AnyLen); -// ── Two codec lists ────────────────────────────────────────────────────────── - /// Generates `(name, || Box::new(T::default()))` entries from a list of types. macro_rules! codec_list { ($($t:ty),* $(,)?) 
=> { diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index cbc3ad9..28e3b4e 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -1,4 +1,5 @@ use std::array; +use std::cmp::min; use std::io::Cursor; use bytemuck::cast_slice; @@ -123,8 +124,7 @@ impl FastPFor { let inlength = greatest_multiple(input_length, N as u32); let final_inpos = input_offset.position() as u32 + inlength; while input_offset.position() as u32 != final_inpos { - let this_size = - std::cmp::min(self.page_size, final_inpos - input_offset.position() as u32); + let this_size = min(self.page_size, final_inpos - input_offset.position() as u32); self.encode_page(input, this_size, input_offset, output, output_offset); } } @@ -140,8 +140,7 @@ impl FastPFor { let mynvalue = greatest_multiple(inlength, N as u32); let final_out = output_offset.position() as u32 + mynvalue; while output_offset.position() as u32 != final_out { - let this_size = - std::cmp::min(self.page_size, final_out - output_offset.position() as u32); + let this_size = min(self.page_size, final_out - output_offset.position() as u32); self.decode_page(input, input_offset, output, output_offset, this_size)?; } Ok(()) @@ -264,7 +263,7 @@ impl FastPFor { /// Analyzes frequency distribution to balance regular value bits against exception overhead. 
fn best_bit_from_data(&mut self, input: &[u32], pos: u32) { self.freqs.fill(0); - let k_end = std::cmp::min(pos + N as u32, input.len() as u32); + let k_end = min(pos + N as u32, input.len() as u32); for k in pos..k_end { self.freqs[bits(input[k as usize])] += 1; } From ae89f2384c0f1850e52047aab3c3710f8ac26e4b Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 18:20:56 -0400 Subject: [PATCH 05/26] wip --- src/rust/integer_compression/fastpfor.rs | 42 ++++++++++++------------ tests/decode_validation.rs | 39 ++++++++++++++++++++++ 2 files changed, 60 insertions(+), 21 deletions(-) create mode 100644 tests/decode_validation.rs diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index 28e3b4e..5a8f0d2 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -393,10 +393,9 @@ impl FastPFor { } let copy_len = words_needed as usize; let mut tail_buf = [0u32; 64]; - debug_assert!( - copy_len > 0, - "j < size and k >= 2 guarantee words_needed >= 1" - ); + if copy_len == 0 { + return Err(FastPForError::NotEnoughData); + } let start = inexcept as usize; let src = input .get(start..start + copy_len) @@ -434,18 +433,18 @@ impl FastPFor { for k in (0..N as u32).step_by(32) { let in_start = tmp_input_offset as usize; let out_start = (tmp_output_offset + k) as usize; - // Both invariants are guaranteed by the caller: - // - packed data lies within [init_pos+1, init_pos+where_meta), which is - // within bounds because metadata was successfully read at init_pos+where_meta. - // - output is pre-allocated to n_blocks*N by decode_blocks. 
- debug_assert!( - in_start + usize::from(bits) <= input.len(), - "packed data overruns input" - ); - debug_assert!( - out_start + 32 <= output.len(), - "output pre-allocated to wrong size" - ); + let in_end = in_start + .checked_add(usize::from(bits)) + .ok_or(FastPForError::NotEnoughData)?; + if in_end > input.len() { + return Err(FastPForError::NotEnoughData); + } + let out_end = out_start + .checked_add(32) + .ok_or(FastPForError::OutputBufferTooSmall)?; + if out_end > output.len() { + return Err(FastPForError::OutputBufferTooSmall); + } bitunpacking::fast_unpack(input, in_start, output, out_start, bits); tmp_input_offset += u32::from(bits); } @@ -467,9 +466,9 @@ impl FastPFor { return Err(FastPForError::NotEnoughData); } let out_idx = tmp_output_offset as usize + pos as usize; - // out_idx < output.len(): pos < block_size and the bitunpack - // guard above already confirmed output.len() >= tmp_output_offset + block_size. - debug_assert!(out_idx < output.len()); + if out_idx >= output.len() { + return Err(FastPForError::OutputBufferTooSmall); + } output[out_idx] |= 1 << bits; } } else { @@ -480,8 +479,9 @@ impl FastPFor { return Err(FastPForError::NotEnoughData); } let out_idx = tmp_output_offset as usize + pos as usize; - // out_idx < output.len(): same invariant as index==1 branch above. - debug_assert!(out_idx < output.len()); + if out_idx >= output.len() { + return Err(FastPForError::OutputBufferTooSmall); + } let ptr = self.data_pointers[index]; let except_value = self.exception_buffers[index].get_val(ptr)?; output[out_idx] |= except_value << bits; diff --git a/tests/decode_validation.rs b/tests/decode_validation.rs new file mode 100644 index 0000000..edab2b6 --- /dev/null +++ b/tests/decode_validation.rs @@ -0,0 +1,39 @@ +//! Integration tests for **untrusted** compressed input: decoding must reject malformed +//! streams with [`fastpfor::FastPForResult::Err`], not panic. 
+#![cfg(feature = "rust")] + +use fastpfor::{AnyLenCodec, FastPFor128, FastPFor256}; + +/// `compressed` must not be a valid stream for `codec`. Decoding must return `Err`. +fn assert_fails(compressed: &[u32]) { + let mut codec = C::default(); + let mut out = Vec::new(); + assert!( + codec.decode(compressed, &mut out, None).is_err(), + "expected decode to fail with Err, but it succeeded" + ); +} + +#[test] +fn test1() { + let data: &[u32] = &[ + 42_926_275, + 589_967, + 4_522_053, + 589_967, + 3_646_554_563, + 55_438, + u32::MAX, + 36, + ]; + assert_fails::(data); + assert_fails::(data); +} + +/// Minimal garbage: tiny slice that cannot be a well-formed composite block stream. +#[test] +fn test2() { + let data = &[0x200, 0, 1]; + assert_fails::(data); + assert_fails::(data); +} From cc5ca02b585a9b78693b2473e3ab23153cc7d854 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 18:37:55 -0400 Subject: [PATCH 06/26] decode failures --- src/rust/integer_compression/fastpfor.rs | 243 +---------------------- tests/decode_validation.rs | 236 ++++++++++++++++++++-- 2 files changed, 226 insertions(+), 253 deletions(-) diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index 5a8f0d2..a4dfa1b 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -588,8 +588,6 @@ where #[cfg(test)] mod tests { - use bytemuck::cast_slice_mut; - use super::*; // ── Generic helpers ─────────────────────────────────────────────────────── @@ -625,53 +623,6 @@ mod tests { out } - /// Try to decode `compressed` as 1 block with `FastPFor`. 
- fn try_decode(compressed: &[u32]) -> Result<(), impl std::fmt::Debug> - where - FastPFor: BlockCodec, - [u32; N]: bytemuck::Pod, - { - FastPFor::::default() - .decode_blocks(compressed, Some(N as u32), &mut Vec::new()) - .map(|_| ()) - } - - // ── Wire format index helpers (FastPFor block layout) ───────────────────── - // - // Full `compressed` layout (output of `encode_blocks` for a single block): - // [0] = out_length (number of encoded u32 values) - // [1] = where_meta (offset to metadata section) - // [2 .. where_meta] = packed regular values - // [1+where_meta] = bytesize (byte count of block metadata) - // [+1 .. +ceil(bytesize/4)] = block metadata bytes - // [+ceil(bytesize/4)+1] = bitmap - // for each set bit k (2..=32): - // [next] = size (# of packed exceptions at width k) - // [next ceil(size*k/32) words] = bit-packed exception values - - fn meta_byte_start(compressed: &[u32]) -> usize { - let where_meta = compressed[1] as usize; - (1 + where_meta + 1) * 4 - } - - fn bitmap_idx(compressed: &[u32]) -> usize { - let where_meta = compressed[1] as usize; - let bytesize_idx = 1 + where_meta; - let bytesize = compressed[bytesize_idx] as usize; - bytesize_idx + 1 + bytesize.div_ceil(4) - } - - fn find_exception_block(bytes: &[u8], meta_start: usize) -> Option<(usize, usize, usize)> { - let mut pos = meta_start; - while pos + 1 < bytes.len() { - if bytes[pos + 1] > 0 { - return Some((pos, pos + 1, pos + 2)); - } - pos += 2; - } - None - } - /// Compressed data containing at least one non-trivial exception group. fn compressed_with_exceptions() -> (Vec, Vec) { let data: Vec = (0..256u32) @@ -758,29 +709,10 @@ mod tests { assert_eq!(roundtrip::<128>(&input), input); } - // ── Error-path tests: truncated / corrupted compressed data ────────────── + // ── Error / edge tests not covered by `tests/decode_validation.rs` ───── // - // Each test: compress valid data → surgically corrupt one field → - // assert `Err` is returned rather than a panic. 
- - #[test] - fn test_truncated_input_returns_error() { - let compressed = encode_block::<256>(&vec![42u32; 256]); - for truncated_len in [1, 2, compressed.len() / 2, compressed.len() - 1] { - assert!( - try_decode::<256>(&compressed[..truncated_len]).is_err(), - "expected error for truncated len {truncated_len}" - ); - } - } - - #[test] - fn test_corrupted_where_meta_returns_error() { - let mut compressed = encode_block::<256>(&vec![1u32; 256]); - // word [1] = where_meta; point it past the end - compressed[1] = u32::MAX; - assert!(try_decode::<256>(&compressed).is_err()); - } + // `AnyLenCodec::decode` treats an empty slice as tail-only and succeeds; an empty + // `decode_blocks` input is still invalid. Headless decode is internal-only. #[test] fn uncompress_zero_input_length_err() { @@ -805,22 +737,9 @@ mod tests { .expect("zero-length decompress must succeed"); } - #[test] - fn decode_where_meta_missing() { - // Only an out_length word, no where_meta follows → must error. - assert!(try_decode::<256>(&[256u32]).is_err()); - } - - #[test] - fn decode_where_meta_out_of_bounds() { - let (mut compressed, _) = compressed_with_exceptions(); - compressed[1] = u32::MAX; - assert!(try_decode::<256>(&compressed).is_err()); - } - #[test] fn decode_where_meta_overflow() { - // FIXME: this test should be modified to use public API + // `decode_headless_blocks` only: no `AnyLenCodec` entry point passes this layout. 
let (compressed, _) = compressed_with_exceptions(); let mut padded = vec![0u32]; padded.extend_from_slice(&compressed); @@ -839,92 +758,6 @@ mod tests { ); } - #[test] - fn decode_bytesize_out_of_bounds() { - let (mut compressed, _) = compressed_with_exceptions(); - compressed[1] = compressed.len() as u32 - 1; - assert!(try_decode::<256>(&compressed).is_err()); - } - - #[test] - fn decode_bytesize_length_overflow() { - let (mut compressed, _) = compressed_with_exceptions(); - let bytesize_idx = 1 + compressed[1] as usize; - compressed[bytesize_idx] = u32::MAX - 3; - assert!(try_decode::<256>(&compressed).is_err()); - } - - #[test] - fn decode_bitmap_out_of_bounds() { - let (mut compressed, _) = compressed_with_exceptions(); - let bytesize_idx = 1 + compressed[1] as usize; - let remaining = (compressed.len() - bytesize_idx - 1) as u32; - compressed[bytesize_idx] = remaining * 4; - assert!(try_decode::<256>(&compressed).is_err()); - } - - #[test] - fn decode_exception_size_exceeds_page_size() { - let (mut compressed, _) = compressed_with_exceptions(); - let size_idx = bitmap_idx(&compressed) + 1; - compressed[size_idx] = DEFAULT_PAGE_SIZE + 1; - assert!(try_decode::<256>(&compressed).is_err()); - } - - #[test] - fn decode_exception_partial_group_not_enough_data() { - let (compressed, _) = compressed_with_exceptions(); - assert!(try_decode::<256>(&compressed[..compressed.len() - 2]).is_err()); - } - - #[test] - fn decode_block_b_too_large() { - let (mut compressed, _) = compressed_with_exceptions(); - let start = meta_byte_start(&compressed); - cast_slice_mut::<_, u8>(&mut compressed)[start] = 33; - assert!(try_decode::<256>(&compressed).is_err()); - } - - #[test] - fn decode_packed_region_truncated() { - let (compressed, _) = compressed_with_exceptions(); - let where_meta = compressed[1] as usize; - assert!(try_decode::<256>(&compressed[..where_meta]).is_err()); - } - - #[test] - fn decode_exception_maxbits_too_large() { - let (mut compressed, _) = 
compressed_with_exceptions(); - let start = meta_byte_start(&compressed); - let bytes: &mut [u8] = cast_slice_mut(&mut compressed); - if let Some((_, _, mb_off)) = find_exception_block(bytes, start) { - bytes[mb_off] = 33; - } - assert!(try_decode::<256>(&compressed).is_err()); - } - - #[test] - fn decode_exception_index_underflow() { - let (mut compressed, _) = compressed_with_exceptions(); - let start = meta_byte_start(&compressed); - let bytes: &mut [u8] = cast_slice_mut(&mut compressed); - if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { - bytes[mb_off] = bytes[bb_off].saturating_sub(1); - } - assert!(try_decode::<256>(&compressed).is_err()); - } - - #[test] - fn decode_exception_index_zero() { - let (mut compressed, _) = compressed_with_exceptions(); - let start = meta_byte_start(&compressed); - let bytes: &mut [u8] = cast_slice_mut(&mut compressed); - if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { - bytes[mb_off] = bytes[bb_off]; - } - assert!(try_decode::<256>(&compressed).is_err()); - } - #[test] fn decode_index1_branch_valid() { let (compressed, data) = compressed_with_index1_exceptions(); @@ -935,49 +768,6 @@ mod tests { assert_eq!(out, data); } - #[test] - fn decode_index1_pos_byte_missing() { - let (compressed, _) = compressed_with_index1_exceptions(); - assert!(try_decode::<256>(&compressed[..compressed.len() - 1]).is_err()); - } - - #[test] - fn decode_index1_pos_out_of_block() { - let mut data = vec![1u32; 128]; - data[0] = 3; - let mut buf = encode_block::<128>(&data); - let start = meta_byte_start(&buf); - let bytes: &mut [u8] = cast_slice_mut(&mut buf); - if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { - if bytes[mb_off].wrapping_sub(bytes[bb_off]) == 1 && mb_off + 1 < bytes.len() { - bytes[mb_off + 1] = 200; // position 200 >= block_size 128 - } - } - assert!(try_decode::<128>(&buf).is_err()); - } - - #[test] - fn decode_exception_pos_byte_missing() { - let (compressed, _) = 
compressed_with_exceptions(); - assert!(try_decode::<256>(&compressed[..compressed.len() - 1]).is_err()); - } - - #[test] - fn decode_exception_pos_out_of_block() { - let data: Vec = (0..128u32) - .map(|i| if i % 4 == 0 { 1u32 << 30 } else { 1 }) - .collect(); - let mut buf = encode_block::<128>(&data); - let start = meta_byte_start(&buf); - let bytes: &mut [u8] = cast_slice_mut(&mut buf); - if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { - if bytes[mb_off].wrapping_sub(bytes[bb_off]) > 1 && mb_off + 1 < bytes.len() { - bytes[mb_off + 1] = 200; // position 200 >= block_size 128 - } - } - assert!(try_decode::<128>(&buf).is_err()); - } - /// `decode_blocks` with `expected_len: None` and header=0 returns `Ok` with empty output. #[test] fn decode_blocks_header_only_input() { @@ -989,29 +779,4 @@ mod tests { .unwrap(); assert!(out.is_empty()); } - - #[test] - fn decode_exception_unpopulated_data_to_be_packed() { - // Hand-crafted compressed stream: out_length=256, where_meta=9, - // 8 packed zero words (bits=1), bytesize=4, - // meta=[bits=1, cexcept=1, maxbits=3, pos=0], bitmap=0. - // The exception buffer is never filled, so decoding must error. - let compressed: Vec = [ - 256u32, // out_length - 9, // where_meta - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, // 8 packed words - 4, // bytesize = 4 bytes - u32::from_le_bytes([1, 1, 3, 0]), // meta: bits=1, cexcept=1, maxbits=3, pos=0 - 0, // bitmap=0 - ] - .into(); - assert!(try_decode::<256>(&compressed).is_err()); - } } diff --git a/tests/decode_validation.rs b/tests/decode_validation.rs index edab2b6..56cf719 100644 --- a/tests/decode_validation.rs +++ b/tests/decode_validation.rs @@ -1,21 +1,77 @@ -//! Integration tests for **untrusted** compressed input: decoding must reject malformed -//! streams with [`fastpfor::FastPForResult::Err`], not panic. +//! Integration tests: malformed compressed input must be rejected via +//! 
[`fastpfor::FastPForResult::Err`] through the public [`fastpfor::AnyLenCodec`] API +//! ([`fastpfor::FastPFor128`] only). +//! +//! Error cases that previously lived in `fastpfor.rs` unit tests (`try_decode` / +//! `decode_blocks`) are exercised here via `assert_fails` and `AnyLenCodec::decode`. + #![cfg(feature = "rust")] -use fastpfor::{AnyLenCodec, FastPFor128, FastPFor256}; +use bytemuck::{cast_slice, cast_slice_mut}; +use fastpfor::{AnyLenCodec, BlockCodec, FastPFor128, FastPForBlock128}; + +/// Matches `DEFAULT_PAGE_SIZE` in `fastpfor` (64 Ki integers per page). +const DEFAULT_PAGE_SIZE: u32 = 65536; -/// `compressed` must not be a valid stream for `codec`. Decoding must return `Err`. -fn assert_fails(compressed: &[u32]) { +/// `compressed` must not decode successfully. Use `Some(128)` for a single full 128-block +/// stream; `None` for arbitrary garbage. +fn assert_fails(compressed: &[u32], expected_len: Option) { let mut codec = C::default(); let mut out = Vec::new(); assert!( - codec.decode(compressed, &mut out, None).is_err(), + codec.decode(compressed, &mut out, expected_len).is_err(), "expected decode to fail with Err, but it succeeded" ); } +fn encode(data: &[u32]) -> Vec { + assert_eq!(data.len() % C::size(), 0); + let blocks: &[C::Block] = cast_slice(data); + let mut out = Vec::new(); + C::default() + .encode_blocks(blocks, &mut out) + .expect("encode one or more blocks"); + out +} + +fn compressed_with_exceptions() -> Vec { + let data: Vec = (0..128u32) + .map(|i| if i % 2 == 0 { 1u32 << 30 } else { 3 }) + .collect(); + encode::(&data) +} + +fn compressed_with_index1_exceptions() -> Vec { + let mut data = vec![1u32; 128]; + data[0] = 3; + encode::(&data) +} + +fn meta_byte_start(compressed: &[u32]) -> usize { + let where_meta = compressed[1] as usize; + (1 + where_meta + 1) * 4 +} + +fn bitmap_idx(compressed: &[u32]) -> usize { + let where_meta = compressed[1] as usize; + let bytesize_idx = 1 + where_meta; + let bytesize = compressed[bytesize_idx] 
as usize; + bytesize_idx + 1 + bytesize.div_ceil(4) +} + +fn find_exception_block(bytes: &[u8], meta_start: usize) -> Option<(usize, usize, usize)> { + let mut pos = meta_start; + while pos + 1 < bytes.len() { + if bytes[pos + 1] > 0 { + return Some((pos, pos + 1, pos + 2)); + } + pos += 2; + } + None +} + #[test] -fn test1() { +fn decode_returns_error_for_libfuzzer_arbitrary_words() { let data: &[u32] = &[ 42_926_275, 589_967, @@ -26,14 +82,166 @@ fn test1() { u32::MAX, 36, ]; - assert_fails::(data); - assert_fails::(data); + assert_fails::(data, None); +} + +#[test] +fn decode_returns_error_for_minimal_three_word_garbage() { + assert_fails::(&[0x200, 0, 1], None); +} + +#[test] +fn decode_returns_error_when_block_stream_truncated() { + let compressed = encode::(&[42u32; 128]); + for truncated_len in [1, 2, compressed.len() / 2, compressed.len() - 1] { + assert_fails::(&compressed[..truncated_len], Some(128)); + } +} + +#[test] +fn decode_returns_error_when_where_meta_word_points_past_buffer() { + let mut compressed = encode::(&[1u32; 128]); + compressed[1] = u32::MAX; + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn decode_returns_error_when_only_out_length_word_present() { + assert_fails::(&[128u32], Some(128)); +} + +#[test] +fn decode_returns_error_when_where_meta_out_of_bounds_on_exception_stream() { + let mut compressed = compressed_with_exceptions(); + compressed[1] = u32::MAX; + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn decode_returns_error_when_bytesize_points_past_end() { + let mut compressed = compressed_with_exceptions(); + compressed[1] = compressed.len() as u32 - 1; + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn decode_returns_error_when_bytesize_overflows_length() { + let mut compressed = compressed_with_exceptions(); + let bytesize_idx = 1 + compressed[1] as usize; + compressed[bytesize_idx] = u32::MAX - 3; + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn 
decode_returns_error_when_bitmap_reads_past_end() { + let mut compressed = compressed_with_exceptions(); + let bytesize_idx = 1 + compressed[1] as usize; + let remaining = (compressed.len() - bytesize_idx - 1) as u32; + compressed[bytesize_idx] = remaining * 4; + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn decode_returns_error_when_exception_group_size_exceeds_page() { + let mut compressed = compressed_with_exceptions(); + let size_idx = bitmap_idx(&compressed) + 1; + compressed[size_idx] = DEFAULT_PAGE_SIZE + 1; + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn decode_returns_error_when_exception_bitstream_truncated() { + let compressed = compressed_with_exceptions(); + assert_fails::(&compressed[..compressed.len() - 2], Some(128)); +} + +#[test] +fn decode_returns_error_when_packed_bit_width_byte_too_large() { + let mut compressed = compressed_with_exceptions(); + let start = meta_byte_start(&compressed); + cast_slice_mut::<_, u8>(&mut compressed)[start] = 33; + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn decode_returns_error_when_packed_region_truncated_before_metadata() { + let compressed = compressed_with_exceptions(); + let where_meta = compressed[1] as usize; + assert_fails::(&compressed[..where_meta], Some(128)); +} + +#[test] +fn decode_returns_error_when_exception_maxbits_too_large() { + let mut compressed = compressed_with_exceptions(); + let start = meta_byte_start(&compressed); + let bytes: &mut [u8] = cast_slice_mut(&mut compressed); + if let Some((_, _, mb_off)) = find_exception_block(bytes, start) { + bytes[mb_off] = 33; + } + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn decode_returns_error_when_exception_index_underflows_optimal_bits() { + let mut compressed = compressed_with_exceptions(); + let start = meta_byte_start(&compressed); + let bytes: &mut [u8] = cast_slice_mut(&mut compressed); + if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { + bytes[mb_off] = 
bytes[bb_off].saturating_sub(1); + } + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn decode_returns_error_when_exception_index_equals_optimal_bits() { + let mut compressed = compressed_with_exceptions(); + let start = meta_byte_start(&compressed); + let bytes: &mut [u8] = cast_slice_mut(&mut compressed); + if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { + bytes[mb_off] = bytes[bb_off]; + } + assert_fails::(&compressed, Some(128)); +} + +#[test] +fn decode_returns_error_when_index1_exception_position_byte_truncated() { + let compressed = compressed_with_index1_exceptions(); + assert_fails::(&compressed[..compressed.len() - 1], Some(128)); +} + +#[test] +fn decode_returns_error_when_exception_position_byte_truncated() { + let compressed = compressed_with_exceptions(); + assert_fails::(&compressed[..compressed.len() - 1], Some(128)); +} + +#[test] +fn decode_returns_error_when_index1_exception_position_out_of_block() { + let mut data = vec![1u32; 128]; + data[0] = 3; + let mut buf = encode::(&data); + let start = meta_byte_start(&buf); + let bytes: &mut [u8] = cast_slice_mut(&mut buf); + if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { + if bytes[mb_off].wrapping_sub(bytes[bb_off]) == 1 && mb_off + 1 < bytes.len() { + bytes[mb_off + 1] = 200; + } + } + assert_fails::(&buf, Some(128)); } -/// Minimal garbage: tiny slice that cannot be a well-formed composite block stream. 
#[test] -fn test2() { - let data = &[0x200, 0, 1]; - assert_fails::(data); - assert_fails::(data); +fn decode_returns_error_when_exception_position_out_of_block() { + let data: Vec = (0..128u32) + .map(|i| if i % 4 == 0 { 1u32 << 30 } else { 1 }) + .collect(); + let mut buf = encode::(&data); + let start = meta_byte_start(&buf); + let bytes: &mut [u8] = cast_slice_mut(&mut buf); + if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { + if bytes[mb_off].wrapping_sub(bytes[bb_off]) > 1 && mb_off + 1 < bytes.len() { + bytes[mb_off + 1] = 200; + } + } + assert_fails::(&buf, Some(128)); } From dafe742b7fadfac578115712543d702f77d5db3c Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 20:51:13 -0400 Subject: [PATCH 07/26] broken --- benches/bench_utils.rs | 208 +++++++++++++---------- benches/fastpfor_benchmark.rs | 20 +-- src/codec.rs | 5 +- src/rust/integer_compression/fastpfor.rs | 2 +- tests/basic_tests.rs | 76 ++------- tests/benchmark_smoke.rs | 97 +++++------ tests/encode_paths.rs | 56 ++---- 7 files changed, 202 insertions(+), 262 deletions(-) diff --git a/benches/bench_utils.rs b/benches/bench_utils.rs index 2cb2b2c..73a6a27 100644 --- a/benches/bench_utils.rs +++ b/benches/bench_utils.rs @@ -1,24 +1,103 @@ //! Shared data generators, codec helpers, and pre-computed fixtures used by -//! both the Criterion benchmark (`fastpfor_benchmark.rs`) and the smoke-test -//! suite (`tests/benchmark_smoke.rs`). +//! the Criterion benchmark (`fastpfor_benchmark.rs`), smoke tests +//! (`tests/benchmark_smoke.rs`), and targeted integration tests +//! (`tests/encode_paths.rs`). //! -//! Loaded as a module via `#[path]` in both consumers, so every item consumed -//! from outside must be `pub`. +//! Loaded as a module via `#[path]`, so every item consumed from outside must +//! be `pub`. Each consumer uses a different subset, so dead-code is allowed +//! at module scope. 
// This is an internal dev-only module; doc-comments on every field would add // noise without benefit. -#![allow(missing_docs)] +#![allow(dead_code, missing_docs)] use core::ops::Range; use std::marker::PhantomData; #[allow(unused_imports)] use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; +use fastpfor::{ + FastPFor128, FastPFor256, FastPForBlock128, FastPForBlock256, JustCopy, VariableByte, +}; use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng}; const SEED: u64 = 456; +// --------------------------------------------------------------------------- +// Generic codec helpers +// --------------------------------------------------------------------------- + +pub fn roundtrip(data: &[u32]) { + let compressed = compress::(data); + let decompressed = decompress::(&compressed, Some(data.len() as u32)); + assert_eq!(decompressed, data); +} + +pub fn compress(data: &[u32]) -> Vec { + let mut codec = C::default(); + let mut compressed = Vec::new(); + codec.encode(data, &mut compressed).unwrap(); + compressed +} + +pub fn decompress(compressed: &Vec, expected_len: Option) -> Vec { + let mut codec = C::default(); + let mut decompressed = Vec::new(); + codec + .decode(&compressed, &mut decompressed, expected_len) + .unwrap(); + decompressed +} + +pub fn block_roundtrip(data: &[u32]) { + let compressed = block_compress::(data); + let decompressed = block_decompress::(&compressed, Some(data.len() as u32)); + assert_eq!(decompressed, data); +} + +pub fn block_compress(data: &[u32]) -> Vec { + let mut codec = C::default(); + let (blocks, remainder) = slice_to_blocks::(data); + assert_eq!( + remainder.len(), + 0, + "data length must be a multiple of block size" + ); + let mut out = Vec::new(); + codec.encode_blocks(blocks, &mut out).unwrap(); + out +} + +pub fn block_decompress(compressed: &[u32], expected_len: Option) -> Vec { + let mut codec = C::default(); + let mut out = Vec::new(); + codec + .decode_blocks(compressed, expected_len, &mut out) + .unwrap(); + 
out +} + +/// Interpret `data` as little-endian `u32` words (length must be a multiple of 4) and +/// run [`roundtrip`] for every any-length codec covered here. +pub fn roundtrip_all(data: &[u32]) { + roundtrip::(data); + roundtrip::(data); + roundtrip::(data); + roundtrip::(data); + + #[cfg(feature = "cpp")] + { + use fastpfor::cpp::*; + roundtrip::(data); + } +} + +pub fn block_roundtrip_all(data: &[u32]) { + block_roundtrip::(data); + block_roundtrip::(data); +} + // --------------------------------------------------------------------------- // Data generators (private — only used to build fixtures) // --------------------------------------------------------------------------- @@ -110,61 +189,6 @@ const ALL_PATTERNS: &[(&str, DataGeneratorFn)] = &[ ("geometric", generate_geometric_data), ]; -// --------------------------------------------------------------------------- -// Generic codec helpers -// --------------------------------------------------------------------------- - -/// Compress `data` with codec `C`, appending to `out` (which is cleared first). -/// -/// Only the block-aligned prefix of `data` is compressed; any sub-block -/// remainder is silently dropped, matching what the benchmarks measure. -pub fn compress(data: &[u32], out: &mut Vec) { - let mut codec = C::default(); - let (blocks, _remainder) = slice_to_blocks::(data); - out.clear(); - codec.encode_blocks(blocks, out).unwrap(); -} - -/// Decompress `n_blocks` blocks of codec `C` from `compressed` into `out` -/// (cleared first), returning the number of elements written. 
-#[allow(dead_code)] // used by smoke tests; benches use codec directly -pub fn decompress( - compressed: &[u32], - n_blocks: usize, - out: &mut Vec, -) -> usize { - let mut codec = C::default(); - out.clear(); - let expected_values = n_blocks * C::size(); - codec - .decode_blocks( - compressed, - Some(u32::try_from(expected_values).expect("expected_values fits in u32")), - out, - ) - .unwrap(); - out.len() -} - -/// Decompress with any-length codec `C`, using `expected_len` for validation/pre-allocation. -#[allow(dead_code)] // used by smoke_cpp_vs_rust -pub fn decompress_anylen( - compressed: &[u32], - expected_len: usize, - out: &mut Vec, -) -> usize { - let mut codec = C::default(); - out.clear(); - codec - .decode( - compressed, - out, - Some(u32::try_from(expected_len).expect("expected_len fits in u32")), - ) - .unwrap(); - out.len() -} - // --------------------------------------------------------------------------- // Pre-computed fixtures // --------------------------------------------------------------------------- @@ -176,7 +200,7 @@ pub fn decompress_anylen( pub struct CompressFixture { pub name: &'static str, /// Block-aligned uncompressed data (exactly `n_blocks * C::elements_per_block()` elements). - pub data: Vec, + pub original: Vec, /// Pre-compressed form, ready for decompression benchmarks. pub compressed: Vec, /// Number of blocks in `data`. @@ -184,16 +208,36 @@ pub struct CompressFixture { _codec: PhantomData, } -impl CompressFixture { +/// One row for the block-size comparison benchmark. +/// +/// Parameterised by `C: BlockCodec` — create one per codec to compare. 
+/// FIXME: deduplicate these two structs if possible +pub struct BlockSizeFixture { + pub compressed: Vec, + pub original: Vec, + pub n_blocks: usize, + _codec: PhantomData, +} + +impl CompressFixture { fn new(name: &'static str, generator: DataGeneratorFn, block_count: usize) -> Self { - let data = generator(block_count * C::size()); - // Data is already exactly block_count * blen elements; no trimming needed. - let mut compressed = Vec::new(); - compress::(&data, &mut compressed); + let original = generator(block_count * C::size()); Self { name, - data, - compressed, + compressed: block_compress::(&original), + original, + n_blocks: block_count, + _codec: PhantomData, + } + } +} + +impl BlockSizeFixture { + pub fn new(block_count: usize) -> Self { + let original = generate_uniform_data_small_value_distribution(block_count * C::size()); + Self { + compressed: block_compress::(&original), + original, n_blocks: block_count, _codec: PhantomData, } @@ -201,7 +245,7 @@ impl CompressFixture { } /// Build fixtures for every `COMPRESS_PATTERNS × block_counts` combination. -pub fn compress_fixtures( +pub fn compress_fixtures( block_counts: &[usize], ) -> Vec<(usize, CompressFixture)> { block_counts @@ -215,33 +259,9 @@ pub fn compress_fixtures( } /// Build fixtures for every `ALL_PATTERNS` at a single block count. -pub fn ratio_fixtures(block_count: usize) -> Vec> { +pub fn ratio_fixtures(block_count: usize) -> Vec> { ALL_PATTERNS .iter() .map(|&(name, generator)| CompressFixture::::new(name, generator, block_count)) .collect() } - -/// One row for the block-size comparison benchmark. -/// -/// Parameterised by `C: BlockCodec` — create one per codec to compare. 
-pub struct BlockSizeFixture { - pub data: Vec, - pub compressed: Vec, - pub n_blocks: usize, - _codec: PhantomData, -} - -impl BlockSizeFixture { - pub fn new(block_count: usize) -> Self { - let data = generate_uniform_data_small_value_distribution(block_count * C::size()); - let mut compressed = Vec::new(); - compress::(&data, &mut compressed); - Self { - data, - compressed, - n_blocks: block_count, - _codec: PhantomData, - } - } -} diff --git a/benches/fastpfor_benchmark.rs b/benches/fastpfor_benchmark.rs index 2e8f341..762057f 100644 --- a/benches/fastpfor_benchmark.rs +++ b/benches/fastpfor_benchmark.rs @@ -23,9 +23,9 @@ const BLOCK_COUNTS: &[usize] = &[8, 32]; fn benchmark_compression(c: &mut Criterion) { let mut group = c.benchmark_group("compression"); for (bc, fix) in compress_fixtures::(BLOCK_COUNTS) { - let n_elem = fix.data.len(); + let n_elem = fix.original.len(); group.throughput(Throughput::Elements(n_elem as u64)); - group.bench_with_input(BenchmarkId::new(fix.name, bc), &fix.data, |b, data| { + group.bench_with_input(BenchmarkId::new(fix.name, bc), &fix.original, |b, data| { let mut codec = FastPForBlock128::default(); let (blocks, _) = slice_to_blocks::(data); let mut out = Vec::new(); @@ -42,7 +42,7 @@ fn benchmark_compression(c: &mut Criterion) { fn benchmark_decompression(c: &mut Criterion) { let mut group = c.benchmark_group("decompression"); for (bc, fix) in compress_fixtures::(BLOCK_COUNTS) { - let n_elem = fix.data.len(); + let n_elem = fix.original.len(); group.throughput(Throughput::Elements(n_elem as u64)); group.bench_with_input(BenchmarkId::new(fix.name, bc), &fix, |b, fix| { let mut codec = FastPForBlock128::default(); @@ -113,14 +113,14 @@ fn benchmark_block_sizes(c: &mut Criterion) { for (label, data, compressed, n_blocks, is_256) in [ ( "128", - &fix128.data, + &fix128.original, &fix128.compressed, fix128.n_blocks, false, ), ( "256", - &fix256.data, + &fix256.original, &fix256.compressed, fix256.n_blocks, true, @@ -185,7 +185,7 @@ 
fn benchmark_compression_ratio(c: &mut Criterion) { for fix in ratio_fixtures::(bc) { group.bench_function(fix.name, |b| { let mut codec = FastPForBlock128::default(); - let (blocks, _) = slice_to_blocks::(&fix.data); + let (blocks, _) = slice_to_blocks::(&fix.original); let mut out = Vec::new(); b.iter(|| { out.clear(); @@ -194,7 +194,7 @@ fn benchmark_compression_ratio(c: &mut Criterion) { clippy::cast_precision_loss, reason = "Loss of precision is acceptable for compression ratio calculation" )] - black_box(fix.data.len() as f64 / out.len() as f64) + black_box(fix.original.len() as f64 / out.len() as f64) }); }); } @@ -207,11 +207,11 @@ fn benchmark_compression_ratio(c: &mut Criterion) { fn benchmark_cpp_vs_rust(c: &mut Criterion) { let mut group = c.benchmark_group("cpp_vs_rust/encode"); for (bc, fix) in compress_fixtures::(BLOCK_COUNTS) { - let n_elem = fix.data.len(); + let n_elem = fix.original.len(); group.throughput(Throughput::Elements(n_elem as u64)); group.bench_with_input( BenchmarkId::new(format!("cpp/{}", fix.name), bc), - &fix.data, + &fix.original, |b, data| { let mut codec = CppFastPFor128::default(); let mut out = Vec::new(); @@ -224,7 +224,7 @@ fn benchmark_cpp_vs_rust(c: &mut Criterion) { ); group.bench_with_input( BenchmarkId::new(format!("rust/{}", fix.name), bc), - &fix.data, + &fix.original, |b, data| { let mut codec = FastPForBlock128::default(); let (blocks, _) = slice_to_blocks::(data); diff --git a/src/codec.rs b/src/codec.rs index 53fc425..9f55563 100644 --- a/src/codec.rs +++ b/src/codec.rs @@ -25,6 +25,7 @@ pub(crate) fn default_max_decoded_len(compressed_words: usize) -> usize { /// /// ``` /// # use fastpfor::{BlockCodec, FastPForResult}; +/// #[derive(Default)] /// struct MyCodec; /// impl BlockCodec for MyCodec { /// type Block = [u32; 256]; @@ -34,7 +35,7 @@ pub(crate) fn default_max_decoded_len(compressed_words: usize) -> usize { /// out: &mut Vec) -> FastPForResult { todo!() } /// } /// ``` -pub trait BlockCodec { +pub trait 
BlockCodec: Default { /// The fixed-size block type. Must be plain-old-data (`Pod`). /// In practice this will be `[u32; 128]` or `[u32; 256]`. type Block: Pod; @@ -112,7 +113,7 @@ pub trait BlockCodec64 { /// variable-length codecs (e.g. `VariableByte`, `JustCopy`) implement this /// trait directly. Block-oriented codecs are wrapped in `CompositeCodec` /// to produce an `AnyLenCodec`. -pub trait AnyLenCodec { +pub trait AnyLenCodec: Default { /// Compress an arbitrary-length slice of `u32` values. fn encode(&mut self, input: &[u32], out: &mut Vec) -> FastPForResult<()>; diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index a4dfa1b..8c18b5e 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -503,7 +503,7 @@ where { type Block = [u32; N]; - fn encode_blocks(&mut self, blocks: &[[u32; N]], out: &mut Vec) -> FastPForResult<()> { + fn encode_blocks(&mut self, blocks: &[Self::Block], out: &mut Vec) -> FastPForResult<()> { let n_values = (blocks.len() * N) as u32; if blocks.is_empty() { out.push(n_values); diff --git a/tests/basic_tests.rs b/tests/basic_tests.rs index a0f3301..6f3ac8f 100644 --- a/tests/basic_tests.rs +++ b/tests/basic_tests.rs @@ -2,65 +2,23 @@ #![cfg(feature = "rust")] -use fastpfor::{ - AnyLenCodec, BlockCodec, FastPFor128, FastPFor256, FastPForBlock128, FastPForBlock256, - JustCopy, VariableByte, slice_to_blocks, -}; +#[path = "../benches/bench_utils.rs"] +mod bench_utils; + +use fastpfor::{BlockCodec, FastPForBlock128, FastPForBlock256, slice_to_blocks}; use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng}; -mod common; - -// ── Generic helpers ─────────────────────────────────────────────────────────── - -fn anylen_roundtrip(codec: &mut C, data: &[u32]) { - let mut compressed = Vec::new(); - codec - .encode(data, &mut compressed) - .unwrap_or_else(|e| panic!("encode failed: {e:?}")); - let mut decoded = Vec::new(); - codec - 
.decode(&compressed, &mut decoded, None) - .unwrap_or_else(|e| panic!("decode failed: {e:?}")); - assert_eq!(decoded, data); -} +use crate::bench_utils::{block_roundtrip_all, roundtrip_all}; -fn block_roundtrip(data: &[u32]) { - let mut codec = C::default(); - let (blocks, _) = slice_to_blocks::(data); - let mut compressed = Vec::new(); - codec.encode_blocks(blocks, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - let expected_values = blocks.len() * C::size(); - codec - .decode_blocks( - &compressed, - Some(u32::try_from(expected_values).expect("expected_values fits in u32")), - &mut decoded, - ) - .unwrap(); - assert_eq!(decoded, &data[..expected_values]); -} +mod common; // ── Tests ───────────────────────────────────────────────────────────────────── #[test] #[cfg(feature = "cpp")] fn saul_test() { - use fastpfor::cpp::CppFastPFor128; - // Block codecs + tail for any-length. C++ block codecs are already any-length; use directly. - let mut codecs: Vec<(&str, Box)> = vec![ - ("JustCopy", Box::new(JustCopy)), - ("FastPFor256", Box::new(FastPFor256::default())), - ("FastPFor128", Box::new(FastPFor128::default())), - ("CppFastPFor128", Box::new(CppFastPFor128::default())), - ]; - let input = vec![2u32, 3, 4, 5]; - for (name, codec) in &mut codecs { - anylen_roundtrip(codec.as_mut(), &input); - // silence unused-variable warning when cpp feature is off - let _ = name; - } + roundtrip_all(&[2u32, 3, 4, 5]); } /// Sub-block-sized inputs produce no output via `BlockCodec`. @@ -82,17 +40,13 @@ fn spurious_out_test() { /// `AnyLenCodec` round-trips empty input correctly. 
#[test] fn zero_in_zero_out_test() { - anylen_roundtrip(&mut VariableByte::new(), &[]); - anylen_roundtrip(&mut JustCopy::new(), &[]); - anylen_roundtrip(&mut FastPFor256::default(), &[]); - anylen_roundtrip(&mut FastPFor128::default(), &[]); + roundtrip_all(&[]); } #[test] fn test_increasing_sequence() { let data: Vec = (0..256u32).collect(); - anylen_roundtrip(&mut FastPFor256::default(), &data); - anylen_roundtrip(&mut FastPFor128::default(), &data); + roundtrip_all(&data); } #[test] @@ -100,24 +54,22 @@ fn test_random_numbers() { let data: Vec = (0..65536) .map(|_| StdRng::seed_from_u64(123456).random()) .collect(); - anylen_roundtrip(&mut FastPFor256::default(), &data); - anylen_roundtrip(&mut FastPFor128::default(), &data); + roundtrip_all(&data); } /// `BlockCodec` round-trip using `slice_to_blocks` to split aligned input. #[test] fn block_codec_roundtrip() { - block_roundtrip::(&(0u32..512).collect::>()); - block_roundtrip::(&(0u32..512).collect::>()); + let data: Vec = (0u32..512).collect(); + block_roundtrip_all(&data); } /// `AnyLenCodec` round-trip with random values at various lengths. 
#[test] -fn anylen_random_roundtrip() { +fn random_roundtrip() { let mut rng = rand::rng(); for n in [128usize, 300, 512, 1000, 4096] { let data: Vec = (0..n).map(|_| rng.random()).collect(); - anylen_roundtrip(&mut FastPFor256::default(), &data); - anylen_roundtrip(&mut FastPFor128::default(), &data); + roundtrip_all(&data); } } diff --git a/tests/benchmark_smoke.rs b/tests/benchmark_smoke.rs index 782d5a1..ae5de15 100644 --- a/tests/benchmark_smoke.rs +++ b/tests/benchmark_smoke.rs @@ -9,24 +9,26 @@ #[path = "../benches/bench_utils.rs"] mod bench_utils; -#[cfg(feature = "cpp")] -use bench_utils::decompress_anylen; -use bench_utils::{BlockSizeFixture, compress, compress_fixtures, decompress, ratio_fixtures}; #[cfg(feature = "cpp")] use fastpfor::BlockCodec; #[cfg(feature = "cpp")] use fastpfor::cpp::CppFastPFor128; use fastpfor::{FastPForBlock128, FastPForBlock256}; +#[cfg(feature = "cpp")] +use crate::bench_utils::decompress; +use crate::bench_utils::{ + BlockSizeFixture, block_compress, block_decompress, block_roundtrip, compress_fixtures, + ratio_fixtures, +}; + const SMOKE_BLOCK_COUNT: usize = 2; #[test] fn smoke_compression() { for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { - let mut out = Vec::new(); - compress::(&fix.data, &mut out); assert!( - !out.is_empty(), + !fix.original.is_empty(), "{}: compressed output must be non-empty", fix.name ); @@ -36,15 +38,19 @@ fn smoke_compression() { #[test] fn smoke_decompression() { for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { - let mut decompressed = Vec::new(); - let n = decompress::(&fix.compressed, fix.n_blocks, &mut decompressed); + let decompressed = + block_decompress::(&fix.compressed, Some(fix.original.len() as u32)); assert_eq!( - n, - fix.data.len(), + decompressed.len(), + fix.original.len(), "{}: decompressed length mismatch", fix.name ); - assert_eq!(decompressed, fix.data, "{}: roundtrip mismatch", fix.name); + assert_eq!( + decompressed, fix.original, + "{}: roundtrip 
mismatch", + fix.name + ); } } @@ -52,12 +58,7 @@ fn smoke_decompression() { #[test] fn smoke_roundtrip() { for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { - let mut compressed = Vec::new(); - compress::(&fix.data, &mut compressed); - let mut decompressed = Vec::new(); - let n = decompress::(&compressed, fix.n_blocks, &mut decompressed); - assert_eq!(n, fix.data.len(), "{}: roundtrip length mismatch", fix.name); - assert_eq!(decompressed, fix.data, "{}: roundtrip mismatch", fix.name); + block_roundtrip::(&fix.original); } } @@ -68,34 +69,43 @@ fn smoke_block_sizes() { // 128-element blocks { - let mut compressed = Vec::new(); - compress::(&fix128.data, &mut compressed); + let compressed = block_compress::(&fix128.original); assert_eq!( compressed, fix128.compressed, "128: compress output mismatch" ); - let mut decompressed = Vec::new(); - let n = decompress::(&compressed, fix128.n_blocks, &mut decompressed); - assert_eq!(n, fix128.data.len(), "128: decompressed length mismatch"); - assert_eq!(decompressed, fix128.data, "128: roundtrip mismatch"); + let decompressed = + block_decompress::(&compressed, Some(fix128.original.len() as u32)); + assert_eq!( + decompressed.len(), + fix128.original.len(), + "128: decompressed length mismatch" + ); + assert_eq!(decompressed, fix128.original, "128: roundtrip mismatch"); } // 256-element blocks { - let mut compressed = Vec::new(); - compress::(&fix256.data, &mut compressed); - let mut decompressed = Vec::new(); - let n = decompress::(&compressed, fix256.n_blocks, &mut decompressed); - assert_eq!(n, fix256.data.len(), "256: decompressed length mismatch"); - assert_eq!(decompressed, fix256.data, "256: roundtrip mismatch"); + let compressed = block_compress::(&fix256.original); + assert_eq!( + compressed, fix256.compressed, + "256: compress output mismatch" + ); + let decompressed = + block_decompress::(&compressed, Some(fix256.original.len() as u32)); + assert_eq!( + decompressed.len(), + fix256.original.len(), + 
"256: decompressed length mismatch" + ); + assert_eq!(decompressed, fix256.original, "256: roundtrip mismatch"); } } #[test] fn smoke_compression_ratio() { for fix in ratio_fixtures::(SMOKE_BLOCK_COUNT) { - let mut out = Vec::new(); - compress::(&fix.data, &mut out); + let out = block_compress::(&fix.original); assert!( !out.is_empty(), "{}: compressed output must be non-empty", @@ -105,7 +115,7 @@ fn smoke_compression_ratio() { clippy::cast_precision_loss, reason = "Loss of precision is acceptable for compression ratio calculation" )] - let ratio = fix.data.len() as f64 / out.len() as f64; + let ratio = fix.original.len() as f64 / out.len() as f64; assert!( ratio > 0.0, "{}: compression ratio must be positive", @@ -120,24 +130,11 @@ fn smoke_cpp_vs_rust() { for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { let expected_len = fix.n_blocks * FastPForBlock128::size(); - // C++ decode (same wire format as Rust; C++ uses AnyLenCodec) - let mut cpp_out = Vec::new(); - let n = decompress_anylen::(&fix.compressed, expected_len, &mut cpp_out); - assert_eq!( - n, expected_len, - "{}: C++ decoded wrong element count", - fix.name - ); - assert_eq!(cpp_out, fix.data, "{}: C++ roundtrip mismatch", fix.name); + let out = decompress::(&fix.compressed, Some(expected_len as u32)); + assert_eq!(out, fix.original, "{}: Bad C++ roundtrip", fix.name); - // Rust decode - let mut rust_out = Vec::new(); - let n = decompress::(&fix.compressed, fix.n_blocks, &mut rust_out); - assert_eq!( - n, expected_len, - "{}: Rust decoded wrong element count", - fix.name - ); - assert_eq!(rust_out, fix.data, "{}: Rust roundtrip mismatch", fix.name); + let out = + block_decompress::(&fix.compressed, Some(fix.original.len() as u32)); + assert_eq!(out, fix.original, "{}: Bad Rust roundtrip", fix.name); } } diff --git a/tests/encode_paths.rs b/tests/encode_paths.rs index b54a887..b9402e0 100644 --- a/tests/encode_paths.rs +++ b/tests/encode_paths.rs @@ -6,38 +6,11 @@ #![cfg(feature = "rust")] -use 
std::mem::size_of; +#[path = "../benches/bench_utils.rs"] +mod bench_utils; -use fastpfor::{ - AnyLenCodec, BlockCodec, FastPFor128, FastPFor256, FastPForBlock256, JustCopy, VariableByte, - slice_to_blocks, -}; -// ── helpers ─────────────────────────────────────────────────────────────────── - -fn roundtrip(codec: &mut C, data: &[u32]) { - let mut compressed = Vec::new(); - codec.encode(data, &mut compressed).unwrap(); - let mut decompressed = Vec::new(); - codec.decode(&compressed, &mut decompressed, None).unwrap(); - assert_eq!(decompressed, data); -} - -fn block_roundtrip(data: &[u32]) { - let mut codec = C::default(); - let (blocks, _) = slice_to_blocks::(data); - let mut compressed = Vec::new(); - codec.encode_blocks(blocks, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - let expected_values = blocks.len() * (size_of::() / 4); - codec - .decode_blocks( - &compressed, - Some(u32::try_from(expected_values).expect("expected_values fits in u32")), - &mut decoded, - ) - .unwrap(); - assert_eq!(decoded, &data[..expected_values]); -} +use bench_utils::{block_roundtrip, roundtrip}; +use fastpfor::{AnyLenCodec, FastPFor128, FastPFor256, FastPForBlock256, JustCopy, VariableByte}; // ── VariableByte round-trip ─────────────────────────────────────────────────── @@ -45,22 +18,19 @@ fn block_roundtrip(data: &[u32]) { /// (1- through 5-byte encodings). 
#[test] fn variable_byte_roundtrip_all_widths() { - roundtrip( - &mut VariableByte::new(), - &[1u32, 127, 128, 16383, 16384, u32::MAX], - ); + roundtrip::(&[1u32, 127, 128, 16383, 16384, u32::MAX]); } #[test] fn variable_byte_roundtrip_empty() { - roundtrip(&mut VariableByte::new(), &[]); + roundtrip::(&[]); } // ── JustCopy via AnyLenCodec ───────────────────────────────────────────────── #[test] fn justcopy_roundtrip() { - roundtrip(&mut JustCopy::new(), &[1u32, 2, 3, 42, u32::MAX]); + roundtrip::(&[1u32, 2, 3, 42, u32::MAX]); } // ── BlockCodec: FastPForBlock256 — block-exact input ─────────────────────────────── @@ -78,14 +48,14 @@ fn fastpfor256_block_roundtrip() { fn fastpfor_multi_page_encode_decode() { // 65536 (default page size) + 256 (one block) — enough to span two pages let data: Vec = (0..65792u32).map(|i| i % 1024).collect(); - roundtrip(&mut FastPFor256::default(), &data); + roundtrip::(&data); } /// A block of all zeros causes `best_b_from_data` to decrement `optimal_bits` /// all the way to 0 — no packed words are written. #[test] fn fastpfor_encode_all_zeros() { - roundtrip(&mut FastPFor256::default(), &vec![0u32; 256]); + roundtrip::(&vec![0u32; 256]); } /// When the metadata byte count is already a multiple of 4 the padding loop @@ -93,13 +63,13 @@ fn fastpfor_encode_all_zeros() { #[test] fn fastpfor_encode_metadata_already_aligned() { let data = vec![0u32; 32768]; // 128 blocks of 256 zeros - roundtrip(&mut FastPFor256::default(), &data); + roundtrip::(&data); } /// When every value needs all 32 bits. #[test] fn fastpfor_encode_all_max_u32() { - roundtrip(&mut FastPFor256::default(), &vec![u32::MAX; 256]); + roundtrip::(&vec![u32::MAX; 256]); } /// Exception index == 1 branch. @@ -108,7 +78,7 @@ fn fastpfor_encode_exception_index1() { let mut data = vec![1u32; 256]; data[0] = 3; data[128] = 3; - roundtrip(&mut FastPFor256::default(), &data); + roundtrip::(&data); } /// 128-element block size with exceptions. 
@@ -117,7 +87,7 @@ fn fastpfor_encode_128_block_with_exceptions() { let data: Vec = (0..128) .map(|i| if i % 4 == 0 { 1u32 << 28 } else { 1 }) .collect(); - roundtrip(&mut FastPFor128::default(), &data); + roundtrip::(&data); } // ── VariableByte AnyLenCodec edge cases ────────────────────────────────────── From 087f1fc76dba6c00b4c6bbc36428a7768a181e2d Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 22:27:57 -0400 Subject: [PATCH 08/26] simplify tests --- Cargo.toml | 6 - benches/bench_utils.rs | 267 --------------- benches/fastpfor_benchmark.rs | 11 +- fuzz/fuzz_targets/common.rs | 269 ++++++++++++--- src/cpp/codecs.rs | 58 +--- src/cpp/tests.rs | 6 +- src/lib.rs | 7 + src/rust/composite.rs | 83 ++--- src/rust/integer_compression/just_copy.rs | 16 +- src/rust/integer_compression/variable_byte.rs | 23 +- src/test_utils.rs | 323 ++++++++++++++++++ tests/basic_tests.rs | 6 +- tests/benchmark_smoke.rs | 10 +- tests/cpp_compat_tests.rs | 46 +-- tests/encode_paths.rs | 22 +- 15 files changed, 638 insertions(+), 515 deletions(-) delete mode 100644 benches/bench_utils.rs create mode 100644 src/test_utils.rs diff --git a/Cargo.toml b/Cargo.toml index ce87f1c..31f5890 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,12 +21,6 @@ name = "fastpfor_benchmark" required-features = ["rust"] harness = false -[[bench]] -name = "bench_utils" -required-features = ["rust"] -harness = false -bench = false - [features] default = ["rust"] # Used internally for testing and benchmarking. Not intended for public use. diff --git a/benches/bench_utils.rs b/benches/bench_utils.rs deleted file mode 100644 index 73a6a27..0000000 --- a/benches/bench_utils.rs +++ /dev/null @@ -1,267 +0,0 @@ -//! Shared data generators, codec helpers, and pre-computed fixtures used by -//! the Criterion benchmark (`fastpfor_benchmark.rs`), smoke tests -//! (`tests/benchmark_smoke.rs`), and targeted integration tests -//! (`tests/encode_paths.rs`). -//! -//! 
Loaded as a module via `#[path]`, so every item consumed from outside must -//! be `pub`. Each consumer uses a different subset, so dead-code is allowed -//! at module scope. - -// This is an internal dev-only module; doc-comments on every field would add -// noise without benefit. -#![allow(dead_code, missing_docs)] - -use core::ops::Range; -use std::marker::PhantomData; - -#[allow(unused_imports)] -use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; -use fastpfor::{ - FastPFor128, FastPFor256, FastPForBlock128, FastPForBlock256, JustCopy, VariableByte, -}; -use rand::rngs::StdRng; -use rand::{RngExt as _, SeedableRng}; - -const SEED: u64 = 456; - -// --------------------------------------------------------------------------- -// Generic codec helpers -// --------------------------------------------------------------------------- - -pub fn roundtrip(data: &[u32]) { - let compressed = compress::(data); - let decompressed = decompress::(&compressed, Some(data.len() as u32)); - assert_eq!(decompressed, data); -} - -pub fn compress(data: &[u32]) -> Vec { - let mut codec = C::default(); - let mut compressed = Vec::new(); - codec.encode(data, &mut compressed).unwrap(); - compressed -} - -pub fn decompress(compressed: &Vec, expected_len: Option) -> Vec { - let mut codec = C::default(); - let mut decompressed = Vec::new(); - codec - .decode(&compressed, &mut decompressed, expected_len) - .unwrap(); - decompressed -} - -pub fn block_roundtrip(data: &[u32]) { - let compressed = block_compress::(data); - let decompressed = block_decompress::(&compressed, Some(data.len() as u32)); - assert_eq!(decompressed, data); -} - -pub fn block_compress(data: &[u32]) -> Vec { - let mut codec = C::default(); - let (blocks, remainder) = slice_to_blocks::(data); - assert_eq!( - remainder.len(), - 0, - "data length must be a multiple of block size" - ); - let mut out = Vec::new(); - codec.encode_blocks(blocks, &mut out).unwrap(); - out -} - -pub fn block_decompress(compressed: &[u32], 
expected_len: Option) -> Vec { - let mut codec = C::default(); - let mut out = Vec::new(); - codec - .decode_blocks(compressed, expected_len, &mut out) - .unwrap(); - out -} - -/// Interpret `data` as little-endian `u32` words (length must be a multiple of 4) and -/// run [`roundtrip`] for every any-length codec covered here. -pub fn roundtrip_all(data: &[u32]) { - roundtrip::(data); - roundtrip::(data); - roundtrip::(data); - roundtrip::(data); - - #[cfg(feature = "cpp")] - { - use fastpfor::cpp::*; - roundtrip::(data); - } -} - -pub fn block_roundtrip_all(data: &[u32]) { - block_roundtrip::(data); - block_roundtrip::(data); -} - -// --------------------------------------------------------------------------- -// Data generators (private — only used to build fixtures) -// --------------------------------------------------------------------------- - -type DataGeneratorFn = fn(usize) -> Vec; - -fn generate_uniform_data_from_range(size: usize, value_range: Range) -> Vec { - let mut rng = StdRng::seed_from_u64(SEED); - (0..size) - .map(|_| rng.random_range(value_range.clone())) - .collect() -} - -pub fn generate_uniform_data_small_value_distribution(size: usize) -> Vec { - generate_uniform_data_from_range(size, 0..1000) -} - -fn generate_uniform_data_large_value_distribution(size: usize) -> Vec { - generate_uniform_data_from_range(size, 0..u32::MAX) -} - -fn generate_clustered_data(size: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(SEED); - let mut base = 0u32; - (0..size) - .map(|_| { - if rng.random_bool(0.1) { - base = rng.random_range(0..1000); - } - base + rng.random_range(0..10) - }) - .collect() -} - -fn generate_sequential_data(size: usize) -> Vec { - (0..size as u32).collect() -} - -fn generate_sparse_data(size: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(SEED); - (0..size) - .map(|_| { - if rng.random_bool(0.9) { - 0 - } else { - rng.random() - } - }) - .collect() -} - -fn generate_constant_data(size: usize) -> Vec { - vec![SEED as u32; 
size] -} - -fn generate_geometric_data(size: usize) -> Vec { - (0..size).map(|i| 1u32 << (i % 30)).collect() -} - -/// Patterns used by compression / decompression / roundtrip / block-size benchmarks. -const COMPRESS_PATTERNS: &[(&str, DataGeneratorFn)] = &[ - ( - "uniform_small_value_distribution", - generate_uniform_data_small_value_distribution, - ), - ( - "uniform_large_value_distribution", - generate_uniform_data_large_value_distribution, - ), - ("clustered", generate_clustered_data), - ("sequential", generate_sequential_data), - ("sparse", generate_sparse_data), -]; - -/// Superset of `COMPRESS_PATTERNS`, also used by the compression-ratio benchmark. -const ALL_PATTERNS: &[(&str, DataGeneratorFn)] = &[ - ( - "uniform_small_distribution", - generate_uniform_data_small_value_distribution, - ), - ( - "uniform_large_distribution", - generate_uniform_data_large_value_distribution, - ), - ("clustered", generate_clustered_data), - ("sequential", generate_sequential_data), - ("sparse", generate_sparse_data), - ("constant", generate_constant_data), - ("geometric", generate_geometric_data), -]; - -// --------------------------------------------------------------------------- -// Pre-computed fixtures -// --------------------------------------------------------------------------- - -/// One row of pre-computed data for compression / decompression benchmarks. -/// -/// Parameterised by `C: BlockCodec` so the same struct works for both 128- -/// and 256-element block codecs. -pub struct CompressFixture { - pub name: &'static str, - /// Block-aligned uncompressed data (exactly `n_blocks * C::elements_per_block()` elements). - pub original: Vec, - /// Pre-compressed form, ready for decompression benchmarks. - pub compressed: Vec, - /// Number of blocks in `data`. - pub n_blocks: usize, - _codec: PhantomData, -} - -/// One row for the block-size comparison benchmark. -/// -/// Parameterised by `C: BlockCodec` — create one per codec to compare. 
-/// FIXME: deduplicate these two structs if possible -pub struct BlockSizeFixture { - pub compressed: Vec, - pub original: Vec, - pub n_blocks: usize, - _codec: PhantomData, -} - -impl CompressFixture { - fn new(name: &'static str, generator: DataGeneratorFn, block_count: usize) -> Self { - let original = generator(block_count * C::size()); - Self { - name, - compressed: block_compress::(&original), - original, - n_blocks: block_count, - _codec: PhantomData, - } - } -} - -impl BlockSizeFixture { - pub fn new(block_count: usize) -> Self { - let original = generate_uniform_data_small_value_distribution(block_count * C::size()); - Self { - compressed: block_compress::(&original), - original, - n_blocks: block_count, - _codec: PhantomData, - } - } -} - -/// Build fixtures for every `COMPRESS_PATTERNS × block_counts` combination. -pub fn compress_fixtures( - block_counts: &[usize], -) -> Vec<(usize, CompressFixture)> { - block_counts - .iter() - .flat_map(|&bc| { - COMPRESS_PATTERNS - .iter() - .map(move |&(name, generator)| (bc, CompressFixture::::new(name, generator, bc))) - }) - .collect() -} - -/// Build fixtures for every `ALL_PATTERNS` at a single block count. -pub fn ratio_fixtures(block_count: usize) -> Vec> { - ALL_PATTERNS - .iter() - .map(|&(name, generator)| CompressFixture::::new(name, generator, block_count)) - .collect() -} diff --git a/benches/fastpfor_benchmark.rs b/benches/fastpfor_benchmark.rs index 762057f..d0caa0b 100644 --- a/benches/fastpfor_benchmark.rs +++ b/benches/fastpfor_benchmark.rs @@ -7,14 +7,15 @@ use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_m use fastpfor::AnyLenCodec; use fastpfor::{BlockCodec as _, FastPForBlock128, FastPForBlock256, slice_to_blocks}; -#[path = "bench_utils.rs"] -mod bench_utils; -use bench_utils::{ +// Shared helpers live in `src/bench_utils.rs` (library exposes the same file only under `cfg(test)`). 
+#[path = "../src/test_utils.rs"] +mod test_utils; +#[cfg(feature = "cpp")] +use fastpfor::cpp::CppFastPFor128; +use test_utils::{ BlockSizeFixture, compress_fixtures, generate_uniform_data_small_value_distribution, ratio_fixtures, }; -#[cfg(feature = "cpp")] -use fastpfor::cpp::CppFastPFor128; /// Number of blocks per benchmark run. The element count per run is /// `BLOCK_COUNTS[i] * C::elements_per_block()`, e.g. 8 × 128 = 1,024 or 32 × 128 = 4,096. diff --git a/fuzz/fuzz_targets/common.rs b/fuzz/fuzz_targets/common.rs index 9dd2d05..4594f54 100644 --- a/fuzz/fuzz_targets/common.rs +++ b/fuzz/fuzz_targets/common.rs @@ -3,7 +3,8 @@ #![allow(dead_code)] use fastpfor::cpp::*; -use fastpfor::{AnyLenCodec, FastPFor128, FastPFor256, JustCopy, VariableByte}; +use fastpfor::{AnyLenCodec, FastPFor128, FastPFor256, FastPForResult, JustCopy, VariableByte}; + // ── Debug helper ───────────────────────────────────────────────────────────── pub struct HexSlice<'a>(pub &'a [u32]); @@ -36,54 +37,188 @@ pub struct FuzzInput { pub codec: C, } -pub type AnyLen = Box; +// `AnyLenCodec` is not dyn-compatible; carry a closed enum of every fuzzed codec instead. +macro_rules! define_fuzz_any_len { + ($( $variant:ident ( $ty:ty ) ),* $(,)?) 
=> { + pub enum FuzzAnyLen { + $( $variant($ty), )* + } + + impl FuzzAnyLen { + pub fn encode(&mut self, input: &[u32], out: &mut Vec) -> FastPForResult<()> { + match self { + $( Self::$variant(codec) => codec.encode(input, out), )* + } + } + + pub fn decode( + &mut self, + input: &[u32], + out: &mut Vec, + expected_len: Option, + ) -> FastPForResult<()> { + match self { + $( Self::$variant(codec) => codec.decode(input, out, expected_len), )* + } + } + } + }; +} + +define_fuzz_any_len!( + FastPFor256(FastPFor256), + FastPFor128(FastPFor128), + VariableByte(VariableByte), + JustCopy(JustCopy), + CppBP32(CppBP32), + CppCopy(CppCopy), + CppFastBinaryPacking8(CppFastBinaryPacking8), + CppFastPFor128(CppFastPFor128), + CppFastPFor256(CppFastPFor256), + CppFastBinaryPacking16(CppFastBinaryPacking16), + CppFastBinaryPacking32(CppFastBinaryPacking32), + CppMaskedVByte(CppMaskedVByte), + CppNewPFor(CppNewPFor), + CppOptPFor(CppOptPFor), + CppPFor2008(CppPFor2008), + CppPFor(CppPFor), + CppSimdBinaryPacking(CppSimdBinaryPacking), + CppSimdFastPFor128(CppSimdFastPFor128), + CppSimdFastPFor256(CppSimdFastPFor256), + CppSimdGroupSimple(CppSimdGroupSimple), + CppSimdGroupSimpleRingBuf(CppSimdGroupSimpleRingBuf), + CppSimdNewPFor(CppSimdNewPFor), + CppSimdOptPFor(CppSimdOptPFor), + CppSimdPFor(CppSimdPFor), + CppSimdSimplePFor(CppSimdSimplePFor), + CppStreamVByte(CppStreamVByte), + CppVByte(CppVByte), + CppVarInt(CppVarInt), + CppVarIntGb(CppVarIntGb), +); -pub type CodecEntry = (&'static str, fn() -> AnyLen); +pub type CodecEntry = (&'static str, fn() -> FuzzAnyLen); -/// Generates `(name, || Box::new(T::default()))` entries from a list of types. -macro_rules! codec_list { - ($($t:ty),* $(,)?) => { - &[ - $( (stringify!($t), || Box::new(<$t>::default())) ),* - ] +macro_rules! 
codec_ctor_fn { + ($fn_name:ident, $variant:ident, $ty:ty) => { + fn $fn_name() -> FuzzAnyLen { + FuzzAnyLen::$variant(<$ty>::default()) + } }; } +codec_ctor_fn!(make_rust_fastpfor256, FastPFor256, FastPFor256); +codec_ctor_fn!(make_rust_fastpfor128, FastPFor128, FastPFor128); + +fn make_rust_variable_byte() -> FuzzAnyLen { + FuzzAnyLen::VariableByte(VariableByte) +} + +fn make_rust_just_copy() -> FuzzAnyLen { + FuzzAnyLen::JustCopy(JustCopy) +} + /// Rust codecs. Block codecs are wrapped in `CompositeCodec<_, VariableByte>`. -pub static RUST: &[CodecEntry] = codec_list!(FastPFor256, FastPFor128, VariableByte, JustCopy,); +pub static RUST: &[CodecEntry] = &[ + ("FastPFor256", make_rust_fastpfor256), + ("FastPFor128", make_rust_fastpfor128), + ("VariableByte", make_rust_variable_byte), + ("JustCopy", make_rust_just_copy), +]; -/// C++ codecs (any-length; block codecs are already composites in the C++ library). -pub static CPP: &[CodecEntry] = codec_list!( - CppBP32, - CppCopy, +codec_ctor_fn!(make_cpp_bp32, CppBP32, CppBP32); +codec_ctor_fn!(make_cpp_copy, CppCopy, CppCopy); +codec_ctor_fn!( + make_cpp_fast_binary_packing8, CppFastBinaryPacking8, - CppFastPFor128, - CppFastPFor256, + CppFastBinaryPacking8 +); +codec_ctor_fn!(make_cpp_fastpfor128, CppFastPFor128, CppFastPFor128); +codec_ctor_fn!(make_cpp_fastpfor256, CppFastPFor256, CppFastPFor256); +codec_ctor_fn!( + make_cpp_fast_binary_packing16, CppFastBinaryPacking16, + CppFastBinaryPacking16 +); +codec_ctor_fn!( + make_cpp_fast_binary_packing32, CppFastBinaryPacking32, - CppMaskedVByte, - CppNewPFor, - CppOptPFor, - CppPFor2008, - CppPFor, + CppFastBinaryPacking32 +); +codec_ctor_fn!(make_cpp_masked_vbyte, CppMaskedVByte, CppMaskedVByte); +codec_ctor_fn!(make_cpp_new_pfor, CppNewPFor, CppNewPFor); +codec_ctor_fn!(make_cpp_opt_pfor, CppOptPFor, CppOptPFor); +codec_ctor_fn!(make_cpp_pfor2008, CppPFor2008, CppPFor2008); +codec_ctor_fn!(make_cpp_pfor, CppPFor, CppPFor); +codec_ctor_fn!( + 
make_cpp_simd_binary_packing, CppSimdBinaryPacking, + CppSimdBinaryPacking +); +codec_ctor_fn!( + make_cpp_simd_fastpfor128, CppSimdFastPFor128, + CppSimdFastPFor128 +); +codec_ctor_fn!( + make_cpp_simd_fastpfor256, CppSimdFastPFor256, + CppSimdFastPFor256 +); +codec_ctor_fn!( + make_cpp_simd_group_simple, CppSimdGroupSimple, + CppSimdGroupSimple +); +codec_ctor_fn!( + make_cpp_simd_group_simple_ring_buf, CppSimdGroupSimpleRingBuf, - CppSimdNewPFor, - CppSimdOptPFor, - CppSimdPFor, + CppSimdGroupSimpleRingBuf +); +codec_ctor_fn!(make_cpp_simd_new_pfor, CppSimdNewPFor, CppSimdNewPFor); +codec_ctor_fn!(make_cpp_simd_opt_pfor, CppSimdOptPFor, CppSimdOptPFor); +codec_ctor_fn!(make_cpp_simd_pfor, CppSimdPFor, CppSimdPFor); +codec_ctor_fn!( + make_cpp_simd_simple_pfor, CppSimdSimplePFor, - // Simple16 / Simple8b / Simple8bRle / Simple9 / Simple9Rle / SimplePFor: - // cannot encode arbitrary u32 values. - // Snappy / VarIntG8iu: conditional #ifdef in C++. - // VsEncoding: leaks memory. - CppStreamVByte, - CppVByte, - CppVarInt, - CppVarIntGb, + CppSimdSimplePFor ); +codec_ctor_fn!(make_cpp_stream_vbyte, CppStreamVByte, CppStreamVByte); +codec_ctor_fn!(make_cpp_vbyte, CppVByte, CppVByte); +codec_ctor_fn!(make_cpp_var_int, CppVarInt, CppVarInt); +codec_ctor_fn!(make_cpp_var_int_gb, CppVarIntGb, CppVarIntGb); + +/// C++ codecs (any-length; block codecs are already composites in the C++ library). 
+pub static CPP: &[CodecEntry] = &[ + ("CppBP32", make_cpp_bp32), + ("CppCopy", make_cpp_copy), + ("CppFastBinaryPacking8", make_cpp_fast_binary_packing8), + ("CppFastPFor128", make_cpp_fastpfor128), + ("CppFastPFor256", make_cpp_fastpfor256), + ("CppFastBinaryPacking16", make_cpp_fast_binary_packing16), + ("CppFastBinaryPacking32", make_cpp_fast_binary_packing32), + ("CppMaskedVByte", make_cpp_masked_vbyte), + ("CppNewPFor", make_cpp_new_pfor), + ("CppOptPFor", make_cpp_opt_pfor), + ("CppPFor2008", make_cpp_pfor2008), + ("CppPFor", make_cpp_pfor), + ("CppSimdBinaryPacking", make_cpp_simd_binary_packing), + ("CppSimdFastPFor128", make_cpp_simd_fastpfor128), + ("CppSimdFastPFor256", make_cpp_simd_fastpfor256), + ("CppSimdGroupSimple", make_cpp_simd_group_simple), + ( + "CppSimdGroupSimpleRingBuf", + make_cpp_simd_group_simple_ring_buf, + ), + ("CppSimdNewPFor", make_cpp_simd_new_pfor), + ("CppSimdOptPFor", make_cpp_simd_opt_pfor), + ("CppSimdPFor", make_cpp_simd_pfor), + ("CppSimdSimplePFor", make_cpp_simd_simple_pfor), + ("CppStreamVByte", make_cpp_stream_vbyte), + ("CppVByte", make_cpp_vbyte), + ("CppVarInt", make_cpp_var_int), + ("CppVarIntGb", make_cpp_var_int_gb), +]; // ── Codec selector (Arbitrary) ───────────────────────────────────────────────── @@ -96,7 +231,7 @@ pub struct AnyLenSelector { } /// Instantiate a codec, returning `(name, codec)`. -pub fn instantiate_anylen_codec(sel: AnyLenSelector) -> (&'static str, AnyLen) { +pub fn instantiate_anylen_codec(sel: AnyLenSelector) -> (&'static str, FuzzAnyLen) { let list = if sel.use_cpp { CPP } else { RUST }; let (name, make) = list[sel.idx as usize % list.len()]; (name, make()) @@ -109,33 +244,57 @@ pub fn instantiate_anylen_codec(sel: AnyLenSelector) -> (&'static str, AnyLen) { #[derive(Clone, Copy)] pub struct CodecPair { pub name: &'static str, - pub make_rust: fn() -> AnyLen, - pub make_cpp: fn() -> AnyLen, + pub make_rust: fn() -> FuzzAnyLen, + pub make_cpp: fn() -> FuzzAnyLen, } -macro_rules! 
codec_pair { - ($name:expr, $rust:ty, $cpp:ty) => { - CodecPair { - name: $name, - make_rust: || Box::new(<$rust>::default()), - make_cpp: || Box::new(<$cpp>::default()), - } - }; - ($name:expr, $rust:ty, $cpp:ty, $cpp_alt:ty) => { - CodecPair { - name: $name, - make_rust: || Box::new(<$rust>::default()), - make_cpp: || Box::new(<$cpp>::default()), - } - }; +fn pair_rust_fastpfor128() -> FuzzAnyLen { + FuzzAnyLen::FastPFor128(FastPFor128::default()) +} +fn pair_cpp_fastpfor128() -> FuzzAnyLen { + FuzzAnyLen::CppFastPFor128(CppFastPFor128::default()) +} +fn pair_rust_fastpfor256() -> FuzzAnyLen { + FuzzAnyLen::FastPFor256(FastPFor256::default()) +} +fn pair_cpp_fastpfor256() -> FuzzAnyLen { + FuzzAnyLen::CppFastPFor256(CppFastPFor256::default()) +} +fn pair_rust_variable_byte() -> FuzzAnyLen { + FuzzAnyLen::VariableByte(VariableByte) +} +fn pair_cpp_var_int() -> FuzzAnyLen { + FuzzAnyLen::CppVarInt(CppVarInt::default()) +} +fn pair_rust_just_copy() -> FuzzAnyLen { + FuzzAnyLen::JustCopy(JustCopy) +} +fn pair_cpp_copy() -> FuzzAnyLen { + FuzzAnyLen::CppCopy(CppCopy::default()) } /// Pairs of Rust and C++ codecs expected to produce bit-identical output. pub static ENCODE_COMPARE_PAIRS: &[CodecPair] = &[ - codec_pair!("FastPFor128", FastPFor128, CppFastPFor128), - codec_pair!("FastPFor256", FastPFor256, CppFastPFor256), - codec_pair!("VariableByte", VariableByte, CppVarInt), - codec_pair!("JustCopy", JustCopy, CppCopy), + CodecPair { + name: "FastPFor128", + make_rust: pair_rust_fastpfor128, + make_cpp: pair_cpp_fastpfor128, + }, + CodecPair { + name: "FastPFor256", + make_rust: pair_rust_fastpfor256, + make_cpp: pair_cpp_fastpfor256, + }, + CodecPair { + name: "VariableByte", + make_rust: pair_rust_variable_byte, + make_cpp: pair_cpp_var_int, + }, + CodecPair { + name: "JustCopy", + make_rust: pair_rust_just_copy, + make_cpp: pair_cpp_copy, + }, ]; /// Optional pair filter: if set, only the named pair is tested. 
@@ -161,7 +320,7 @@ pub fn resolve_encode_compare_pair(idx: u8) -> Option { } /// Instantiate both codecs for a pair, using the alternative C++ when requested. -pub fn instantiate_pair(pair: CodecPair) -> (AnyLen, AnyLen) { +pub fn instantiate_pair(pair: CodecPair) -> (FuzzAnyLen, FuzzAnyLen) { let rust_codec = (pair.make_rust)(); let cpp_codec = (pair.make_cpp)(); (rust_codec, cpp_codec) diff --git a/src/cpp/codecs.rs b/src/cpp/codecs.rs index 2def24a..ed099df 100644 --- a/src/cpp/codecs.rs +++ b/src/cpp/codecs.rs @@ -180,98 +180,64 @@ implement_cpp_codecs_64! { #[cfg(test)] pub(crate) mod tests { - use crate::codec::{AnyLenCodec, BlockCodec64}; use crate::cpp::codecs::{CppFastPFor128, CppFastPFor256, CppVByte, CppVarInt}; - - pub fn roundtrip_32(codec: &mut (impl AnyLenCodec + ?Sized), input: &[u32]) { - let mut compressed = Vec::new(); - codec.encode(input, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - codec.decode(&compressed, &mut decoded, None).unwrap(); - assert_eq!(decoded, input); - } + use crate::test_utils::{decompress, decompress64, roundtrip, roundtrip64}; /// C++ `fastpfor256_codec` returns `CompositeCodec, VariableByte>` — already /// any-length. Use it directly; do not wrap in Rust `CompositeCodec`. 
#[test] fn test_cpp_fastpfor256_composite_anylen() { - let mut codec = CppFastPFor256::new(); - roundtrip_32(&mut codec, &[1, 2, 3, 4, 5]); + roundtrip::(&[1, 2, 3, 4, 5]); let data: Vec = (0..600).collect(); - roundtrip_32(&mut codec, &data); + roundtrip::(&data); } #[test] fn test_fastpfor128_anylen() { let data: Vec = (0..128).collect(); - roundtrip_32(&mut CppFastPFor128::new(), &data); + roundtrip::(&data); } #[test] fn test_fastpfor256_anylen() { let data: Vec = (0..256).collect(); - roundtrip_32(&mut CppFastPFor256::new(), &data); + roundtrip::(&data); } #[test] fn test_fastpfor256_u64() { let input: Vec = (0..256).collect(); - let mut codec = CppFastPFor256::new(); - let mut compressed = Vec::new(); - codec.encode64(&input, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - codec.decode64(&compressed, &mut decoded).unwrap(); - assert_eq!(decoded, input); + roundtrip64::(&input); } #[test] fn test_varint_u64() { let input = vec![1u64, 2, 3, 4, 5]; - let mut codec = CppVarInt::new(); - let mut compressed = Vec::new(); - codec.encode64(&input, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - codec.decode64(&compressed, &mut decoded).unwrap(); - assert_eq!(decoded, input); + roundtrip64::(&input); } #[test] fn test_decode32_empty_input() { - let mut codec = CppVByte::new(); - let mut out = Vec::new(); - codec.decode(&[], &mut out, None).unwrap(); - assert!(out.is_empty()); + assert!(decompress::(&[], None).is_empty()); } #[test] fn test_decode32_cpp_empty_format() { - let mut codec = CppFastPFor128::new(); - let mut out = Vec::new(); - codec.decode(&[0u32], &mut out, Some(0)).unwrap(); - assert!(out.is_empty()); + assert!(decompress::(&[0u32], Some(0)).is_empty()); } #[test] fn test_decode64_empty_input() { - let mut codec = CppFastPFor256::new(); - let mut out: Vec = Vec::new(); - codec.decode64(&[], &mut out).unwrap(); - assert!(out.is_empty()); + assert!(decompress64::(&[]).is_empty()); } #[test] fn test_decode64_empty_format() { - let 
mut codec = CppVarInt::new(); - let mut out: Vec = Vec::new(); - codec.decode64(&[], &mut out).unwrap(); - assert!(out.is_empty()); + assert!(decompress64::(&[]).is_empty()); } #[test] fn test_decode_empty_input() { - let mut codec = CppFastPFor128::new(); - let mut out = Vec::new(); - codec.decode(&[], &mut out, None).unwrap(); - assert!(out.is_empty()); + assert!(decompress::(&[], None).is_empty()); } } diff --git a/src/cpp/tests.rs b/src/cpp/tests.rs index 4845c0a..fa8d3e8 100644 --- a/src/cpp/tests.rs +++ b/src/cpp/tests.rs @@ -1,4 +1,4 @@ -use crate::cpp::codecs::tests::roundtrip_32; +use crate::test_utils::roundtrip_with; /// Test all codecs compile and do a basic 32-bit roundtrip macro_rules! test_anylen { @@ -7,7 +7,7 @@ macro_rules! test_anylen { #[test] #[allow(non_snake_case)] fn $name() { - roundtrip_32(&mut $crate::cpp::$name::new(), &[1u32, 2, 3, 4, 5]); + roundtrip_with(&mut $crate::cpp::$name::default(), &[1u32, 2, 3, 4, 5]); } )* }; @@ -51,7 +51,7 @@ macro_rules! test_anylen_128 { #[allow(non_snake_case)] fn $name() { let input: Vec = (1..=128).collect(); - roundtrip_32(&mut $crate::cpp::$name::new(), &input); + roundtrip_with(&mut $crate::cpp::$name::default(), &input); } )* }; diff --git a/src/lib.rs b/src/lib.rs index 22bf842..a62c545 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,3 +33,10 @@ pub use rust::{ CompositeCodec, FastPFor, FastPFor128, FastPFor256, FastPForBlock128, FastPForBlock256, JustCopy, VariableByte, }; + +// `src/test_utils.rs` uses `fastpfor::...`; alias this crate for unit tests only. 
+#[cfg(test)] +extern crate self as fastpfor; + +#[cfg(test)] +pub(crate) mod test_utils; diff --git a/src/rust/composite.rs b/src/rust/composite.rs index a633694..912a453 100644 --- a/src/rust/composite.rs +++ b/src/rust/composite.rs @@ -45,11 +45,7 @@ pub struct CompositeCodec { tail: Tail, } -impl Default for CompositeCodec -where - Blocks: BlockCodec + Default, - Tail: AnyLenCodec + Default, -{ +impl Default for CompositeCodec { fn default() -> Self { Self::new(Blocks::default(), Tail::default()) } @@ -114,73 +110,50 @@ mod tests { use super::*; use crate::FastPForError; use crate::rust::{FastPForBlock128, FastPForBlock256, JustCopy, VariableByte}; + use crate::test_utils::{compress, decompress, roundtrip_composite}; - fn roundtrip(codec: &mut C, data: &[u32]) { - let mut encoded = Vec::new(); - codec.encode(data, &mut encoded).unwrap(); - let mut decoded = Vec::new(); - codec.decode(&encoded, &mut decoded, None).unwrap(); - assert_eq!(decoded, data); - } + type Comp256Vb = CompositeCodec; #[test] fn test_fastpfor256_vbyte_exact_two_blocks() { let data: Vec = (0..512).collect(); - roundtrip( - &mut CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()), - &data, - ); + roundtrip_composite::(&data); } #[test] fn test_fastpfor256_vbyte_with_remainder() { let data: Vec = (0..600).collect(); - roundtrip( - &mut CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()), - &data, - ); + roundtrip_composite::(&data); } #[test] fn test_fastpfor128_justcopy_with_remainder() { let data: Vec = (0..300).collect(); - roundtrip( - &mut CompositeCodec::new(FastPForBlock128::default(), JustCopy::new()), - &data, - ); + roundtrip_composite::(&data); } #[test] fn test_empty_input() { - roundtrip( - &mut CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()), - &[], - ); + roundtrip_composite::(&[]); } #[test] fn test_decode_truly_empty_input() { // Decoding a zero-length slice (not even a header word) must succeed with empty output. 
- let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); - let mut out = Vec::new(); - codec.decode(&[], &mut out, None).unwrap(); - assert!(out.is_empty()); + assert!(decompress::(&[], None).is_empty()); } #[test] fn test_decode_empty_input_with_expected_zero() { // Empty input with expected_len=0 must succeed. - let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); - let mut out = Vec::new(); - codec.decode(&[], &mut out, Some(0)).unwrap(); - assert!(out.is_empty()); + assert!(decompress::(&[], Some(0)).is_empty()); } #[test] fn test_decode_empty_input_with_nonzero_expected_errors() { // Empty input: max_decompressed_len(0) == 0, so any expected_len > 0 fails // with ExpectedCountExceedsMax before decoding begins. - let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); + let mut codec = CompositeCodec::::default(); let err = codec.decode(&[], &mut Vec::new(), Some(5)).unwrap_err(); assert!(matches!( err, @@ -197,7 +170,7 @@ mod tests { // than attempting a multi-gigabyte allocation. // Regression: fuzzer found bytes [0x04, 0x35, 0x19] → u32 LE 0x00193504 = 1_651_460 // fed to FastPFor256.decode caused an OOM via a ~2.5 GB Vec::resize. 
- let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); + let mut codec = CompositeCodec::::default(); let mut out = Vec::new(); let input = [0x0019_3504u32]; // n_blocks = 1_651_460, rest is empty assert!(codec.decode(&input, &mut out, None).is_err()); @@ -207,31 +180,25 @@ mod tests { #[test] fn test_sub_block_only() { let data: Vec = (0..10).collect(); - roundtrip( - &mut CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()), - &data, - ); + roundtrip_composite::(&data); } #[test] fn test_decode_with_expected_len() { let data: Vec = (0..600).collect(); - let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); - let mut encoded = Vec::new(); - codec.encode(&data, &mut encoded).unwrap(); - let mut decoded = Vec::new(); - codec.decode(&encoded, &mut decoded, Some(600)).unwrap(); + let encoded = compress::(&data); + let decoded = decompress::(&encoded, Some(600)); assert_eq!(decoded, data); } #[test] fn test_decode_expected_len_mismatch_errors() { let data: Vec = (0..100).collect(); - let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); - let mut encoded = Vec::new(); - codec.encode(&data, &mut encoded).unwrap(); - let mut decoded = Vec::new(); - let err = codec.decode(&encoded, &mut decoded, Some(50)).unwrap_err(); + let encoded = compress::(&data); + let mut codec = Comp256Vb::default(); + let err = codec + .decode(&encoded, &mut Vec::new(), Some(50)) + .unwrap_err(); assert!(matches!( err, FastPForError::DecodedCountMismatch { @@ -244,15 +211,11 @@ mod tests { #[test] fn test_decode_expected_len_exceeds_max_errors() { let data: Vec = (0..10).collect(); - let mut codec = CompositeCodec::new(FastPForBlock256::default(), VariableByte::new()); - let mut encoded = Vec::new(); - codec.encode(&data, &mut encoded).unwrap(); - let mut decoded = Vec::new(); - let huge = - (CompositeCodec::::max_decompressed_len(encoded.len()) - + 1) as u32; + let encoded = 
compress::(&data); + let huge = (Comp256Vb::max_decompressed_len(encoded.len()) + 1) as u32; + let mut codec = Comp256Vb::default(); let err = codec - .decode(&encoded, &mut decoded, Some(huge)) + .decode(&encoded, &mut Vec::new(), Some(huge)) .unwrap_err(); assert!(matches!(err, FastPForError::ExpectedCountExceedsMax { .. })); } diff --git a/src/rust/integer_compression/just_copy.rs b/src/rust/integer_compression/just_copy.rs index 228b9b3..2476aa2 100644 --- a/src/rust/integer_compression/just_copy.rs +++ b/src/rust/integer_compression/just_copy.rs @@ -47,31 +47,25 @@ impl AnyLenCodec for JustCopy { mod tests { use super::*; use crate::FastPForError; + use crate::test_utils::{decompress, roundtrip}; #[test] fn justcopy_default_and_roundtrip() { - // Exercise the Default impl explicitly. - let mut codec = ::default(); - let data = vec![1u32, 2, 3]; - let mut compressed = Vec::new(); - codec.encode(&data, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - codec.decode(&compressed, &mut decoded, None).unwrap(); - assert_eq!(decoded, data); + roundtrip::(&[1u32, 2, 3]); } #[test] fn justcopy_decode_with_expected_len_ok() { let data = vec![1u32, 2, 3]; - let mut out = Vec::new(); - JustCopy::new().decode(&data, &mut out, Some(3)).unwrap(); + let out = decompress::(&data, Some(3)); assert_eq!(out, data); } #[test] + #[expect(clippy::default_constructed_unit_structs)] fn justcopy_decode_expected_len_mismatch_errors() { let data = vec![1u32, 2, 3]; - let err = JustCopy::new() + let err = JustCopy::default() .decode(&data, &mut Vec::new(), Some(2)) .unwrap_err(); assert!(matches!( diff --git a/src/rust/integer_compression/variable_byte.rs b/src/rust/integer_compression/variable_byte.rs index 020697a..a2bd260 100644 --- a/src/rust/integer_compression/variable_byte.rs +++ b/src/rust/integer_compression/variable_byte.rs @@ -365,6 +365,8 @@ mod tests { use super::*; + use crate::test_utils::{compress, decompress, roundtrip}; + fn verify_u32_roundtrip(input: &[u32]) 
{ let mut encoded: Vec = vec![0; input.len() * 2 + 1]; let mut input_offset = Cursor::new(0); @@ -582,13 +584,8 @@ mod tests { #[test] fn test_variable_byte_default() { - let mut codec = ::default(); let data = vec![1u32, 2, 3]; - let mut out = Vec::new(); - codec.encode(&data, &mut out).unwrap(); - let mut decoded = Vec::new(); - codec.decode(&out, &mut decoded, None).unwrap(); - assert_eq!(decoded, data); + roundtrip::(&data); } /// `decompress_from_u32_slice` returns `OutputBufferTooSmall` when the @@ -661,23 +658,19 @@ mod tests { #[test] fn test_anylen_decode_with_expected_len_ok() { let data = vec![1u32, 2, 3]; - let mut encoded = Vec::new(); - VariableByte::new().encode(&data, &mut encoded).unwrap(); - let mut decoded = Vec::new(); - VariableByte::new() - .decode(&encoded, &mut decoded, Some(3)) - .unwrap(); + let encoded = compress::(&data); + let decoded = decompress::(&encoded, Some(3)); assert_eq!(decoded, data); } #[test] + #[expect(clippy::default_constructed_unit_structs)] fn test_anylen_decode_expected_len_mismatch_errors() { // expected_len must be >= actual to avoid OutputBufferTooSmall; use a larger // value to exercise the is_decoded_mismatch path. let data = vec![1u32, 2, 3]; - let mut encoded = Vec::new(); - VariableByte::new().encode(&data, &mut encoded).unwrap(); - let err = VariableByte::new() + let encoded = compress::(&data); + let err = VariableByte::default() .decode(&encoded, &mut Vec::new(), Some(10)) .unwrap_err(); assert!(matches!( diff --git a/src/test_utils.rs b/src/test_utils.rs new file mode 100644 index 0000000..7d974cc --- /dev/null +++ b/src/test_utils.rs @@ -0,0 +1,323 @@ +//! Shared data generators, codec helpers, and pre-computed fixtures used by +//! Criterion benchmarks, integration tests, and `#[cfg(test)]` unit tests in the +//! `fastpfor` crate. +//! +//! - **Library unit tests:** `crate::test_utils` via `#[cfg(test)] mod bench_utils` in `lib.rs` +//! 
and `extern crate self as fastpfor` so this file can `use fastpfor::...`. +//! - **Integration tests:** `#[cfg(test)] #[path = "../src/test_utils.rs"] mod test_utils`. +//! - **Criterion benchmarks:** `#[path = "../src/test_utils.rs"] mod test_utils` (`cfg(test)` is not +//! enabled for bench targets, so the module is included unconditionally there). +//! +//! Loaded as a module via `#[path]` or as a normal child module, so every item +//! consumed from outside must be `pub`. Each consumer uses a different subset, +//! so dead-code is allowed at module scope. + +// This is an internal dev-only module; doc-comments on every field would add +// noise without benefit. +#![allow(dead_code, missing_docs)] + +#[cfg(feature = "cpp")] +use fastpfor::BlockCodec64; +#[allow(unused_imports)] +use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; +#[cfg(feature = "rust")] +use fastpfor::{ + FastPFor128, FastPFor256, FastPForBlock128, FastPForBlock256, JustCopy, VariableByte, +}; + +const SEED: u64 = 456; + +// --------------------------------------------------------------------------- +// Generic codec helpers +// --------------------------------------------------------------------------- + +/// Encode `data` with a caller-owned codec, decode with `expected_len: None`, assert round-trip.
+pub fn roundtrip_with(codec: &mut C, data: &[u32]) { + let mut compressed = Vec::new(); + codec.encode(data, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + codec.decode(&compressed, &mut decoded, None).unwrap(); + assert_eq!(decoded, data); +} + +pub fn roundtrip(data: &[u32]) { + let mut codec = C::default(); + roundtrip_with(&mut codec, data); +} + +pub fn compress(data: &[u32]) -> Vec { + let mut codec = C::default(); + let mut compressed = Vec::new(); + codec.encode(data, &mut compressed).unwrap(); + compressed +} + +pub fn decompress(compressed: &[u32], expected_len: Option) -> Vec { + let mut codec = C::default(); + let mut decompressed = Vec::new(); + codec + .decode(compressed, &mut decompressed, expected_len) + .unwrap(); + decompressed +} + +pub fn block_roundtrip(data: &[u32]) { + let compressed = block_compress::(data); + let decompressed = block_decompress::(&compressed, Some(data.len() as u32)); + assert_eq!(decompressed, data); +} + +pub fn block_compress(data: &[u32]) -> Vec { + let mut codec = C::default(); + let (blocks, remainder) = slice_to_blocks::(data); + assert_eq!( + remainder.len(), + 0, + "data length must be a multiple of block size" + ); + let mut out = Vec::new(); + codec.encode_blocks(blocks, &mut out).unwrap(); + out +} + +pub fn block_decompress(compressed: &[u32], expected_len: Option) -> Vec { + let mut codec = C::default(); + let mut out = Vec::new(); + codec + .decode_blocks(compressed, expected_len, &mut out) + .unwrap(); + out +} + +#[cfg(feature = "cpp")] +pub fn roundtrip64(data: &[u64]) { + let mut codec = C::default(); + let mut compressed = Vec::new(); + codec.encode64(data, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + codec.decode64(&compressed, &mut decoded).unwrap(); + assert_eq!(decoded, data); +} + +#[cfg(feature = "cpp")] +pub fn compress64(data: &[u64]) -> Vec { + let mut codec = C::default(); + let mut compressed = Vec::new(); + codec.encode64(data, &mut compressed).unwrap(); + 
compressed +} + +#[cfg(feature = "cpp")] +pub fn decompress64(compressed: &[u32]) -> Vec { + let mut codec = C::default(); + let mut out = Vec::new(); + codec.decode64(compressed, &mut out).unwrap(); + out +} + +/// Run [`roundtrip`] for every pure-Rust any-length codec covered here (and optionally C++). +#[cfg(feature = "rust")] +pub fn roundtrip_all(data: &[u32]) { + roundtrip::(data); + roundtrip::(data); + roundtrip::(data); + roundtrip::(data); + + #[cfg(feature = "cpp")] + { + use fastpfor::cpp::CppFastPFor128; + roundtrip::(data); + } +} + +#[cfg(feature = "rust")] +pub fn block_roundtrip_all(data: &[u32]) { + block_roundtrip::(data); + block_roundtrip::(data); +} + +/// Encode/decode round-trip using `CompositeCodec` built from `B::default()` and `T::default()`. +/// +/// `B` is the block codec; `T` is the any-length tail codec. +#[cfg(feature = "rust")] +pub fn roundtrip_composite(data: &[u32]) +where + B: BlockCodec + Default, + T: AnyLenCodec + Default, +{ + let mut codec = fastpfor::CompositeCodec::::default(); + roundtrip_with(&mut codec, data); +} + +// --------------------------------------------------------------------------- +// Data generators + fixtures (Rust block codecs; benchmarks / smoke tests) +// --------------------------------------------------------------------------- + +#[cfg(feature = "rust")] +mod rust_bench { + use core::ops::Range; + use std::marker::PhantomData; + + use rand::rngs::StdRng; + use rand::{RngExt as _, SeedableRng}; + + use super::{BlockCodec, block_compress}; + + type DataGeneratorFn = fn(usize) -> Vec; + + fn generate_uniform_data_from_range(size: usize, value_range: Range) -> Vec { + let mut rng = StdRng::seed_from_u64(super::SEED); + (0..size) + .map(|_| rng.random_range(value_range.clone())) + .collect() + } + + pub fn generate_uniform_data_small_value_distribution(size: usize) -> Vec { + generate_uniform_data_from_range(size, 0..1000) + } + + fn generate_uniform_data_large_value_distribution(size: usize) -> Vec { 
+ generate_uniform_data_from_range(size, 0..u32::MAX) + } + + fn generate_clustered_data(size: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(super::SEED); + let mut base = 0u32; + (0..size) + .map(|_| { + if rng.random_bool(0.1) { + base = rng.random_range(0..1000); + } + base + rng.random_range(0..10) + }) + .collect() + } + + fn generate_sequential_data(size: usize) -> Vec { + (0..size as u32).collect() + } + + fn generate_sparse_data(size: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(super::SEED); + (0..size) + .map(|_| { + if rng.random_bool(0.9) { + 0 + } else { + rng.random() + } + }) + .collect() + } + + fn generate_constant_data(size: usize) -> Vec { + vec![super::SEED as u32; size] + } + + fn generate_geometric_data(size: usize) -> Vec { + (0..size).map(|i| 1u32 << (i % 30)).collect() + } + + const COMPRESS_PATTERNS: &[(&str, DataGeneratorFn)] = &[ + ( + "uniform_small_value_distribution", + generate_uniform_data_small_value_distribution, + ), + ( + "uniform_large_value_distribution", + generate_uniform_data_large_value_distribution, + ), + ("clustered", generate_clustered_data), + ("sequential", generate_sequential_data), + ("sparse", generate_sparse_data), + ]; + + const ALL_PATTERNS: &[(&str, DataGeneratorFn)] = &[ + ( + "uniform_small_distribution", + generate_uniform_data_small_value_distribution, + ), + ( + "uniform_large_distribution", + generate_uniform_data_large_value_distribution, + ), + ("clustered", generate_clustered_data), + ("sequential", generate_sequential_data), + ("sparse", generate_sparse_data), + ("constant", generate_constant_data), + ("geometric", generate_geometric_data), + ]; + + /// One row of pre-computed data for compression / decompression benchmarks. + pub struct CompressFixture { + pub name: &'static str, + /// Block-aligned uncompressed data (exactly `n_blocks * C::elements_per_block()` elements). + pub original: Vec, + /// Pre-compressed form, ready for decompression benchmarks. 
+ pub compressed: Vec, + /// Number of blocks in `data`. + pub n_blocks: usize, + _codec: PhantomData, + } + + /// One row for the block-size comparison benchmark. + pub struct BlockSizeFixture { + pub compressed: Vec, + pub original: Vec, + pub n_blocks: usize, + _codec: PhantomData, + } + + impl CompressFixture { + fn new(name: &'static str, generator: DataGeneratorFn, block_count: usize) -> Self { + let original = generator(block_count * C::size()); + Self { + name, + compressed: block_compress::(&original), + original, + n_blocks: block_count, + _codec: PhantomData, + } + } + } + + impl BlockSizeFixture { + pub fn new(block_count: usize) -> Self { + let original = generate_uniform_data_small_value_distribution(block_count * C::size()); + Self { + compressed: block_compress::(&original), + original, + n_blocks: block_count, + _codec: PhantomData, + } + } + } + + pub fn compress_fixtures( + block_counts: &[usize], + ) -> Vec<(usize, CompressFixture)> { + block_counts + .iter() + .flat_map(|&bc| { + COMPRESS_PATTERNS.iter().map(move |&(name, generator)| { + (bc, CompressFixture::::new(name, generator, bc)) + }) + }) + .collect() + } + + pub fn ratio_fixtures(block_count: usize) -> Vec> { + ALL_PATTERNS + .iter() + .map(|&(name, generator)| CompressFixture::::new(name, generator, block_count)) + .collect() + } +} + +#[cfg(feature = "rust")] +#[allow(unused_imports)] +// Re-exports for benches/integration tests; not every `#[path]` site uses all items. 
+pub use rust_bench::{ BlockSizeFixture, CompressFixture, compress_fixtures, generate_uniform_data_small_value_distribution, ratio_fixtures, }; diff --git a/tests/basic_tests.rs b/tests/basic_tests.rs index 6f3ac8f..97b2c39 100644 --- a/tests/basic_tests.rs +++ b/tests/basic_tests.rs @@ -2,14 +2,14 @@ #![cfg(feature = "rust")] -#[path = "../benches/bench_utils.rs"] -mod bench_utils; +#[path = "../src/test_utils.rs"] +mod test_utils; use fastpfor::{BlockCodec, FastPForBlock128, FastPForBlock256, slice_to_blocks}; use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng}; -use crate::bench_utils::{block_roundtrip_all, roundtrip_all}; +use crate::test_utils::{block_roundtrip_all, roundtrip_all}; mod common; diff --git a/tests/benchmark_smoke.rs b/tests/benchmark_smoke.rs index ae5de15..73eb88c 100644 --- a/tests/benchmark_smoke.rs +++ b/tests/benchmark_smoke.rs @@ -1,13 +1,13 @@ //! Smoke tests that execute every benchmark code path exactly once. //! -//! All logic lives in `benches/bench_utils.rs`; this file just drives the +//! All logic lives in `src/test_utils.rs`; this file just drives the
#![cfg(feature = "rust")] -#[path = "../benches/bench_utils.rs"] -mod bench_utils; +#[path = "../src/test_utils.rs"] +mod test_utils; #[cfg(feature = "cpp")] use fastpfor::BlockCodec; @@ -16,8 +16,8 @@ use fastpfor::cpp::CppFastPFor128; use fastpfor::{FastPForBlock128, FastPForBlock256}; #[cfg(feature = "cpp")] -use crate::bench_utils::decompress; -use crate::bench_utils::{ +use crate::test_utils::decompress; +use crate::test_utils::{ BlockSizeFixture, block_compress, block_decompress, block_roundtrip, compress_fixtures, ratio_fixtures, }; diff --git a/tests/cpp_compat_tests.rs b/tests/cpp_compat_tests.rs index 1e70845..3299ed7 100644 --- a/tests/cpp_compat_tests.rs +++ b/tests/cpp_compat_tests.rs @@ -5,9 +5,14 @@ #![cfg(all(feature = "rust", feature = "cpp"))] +#[path = "../src/test_utils.rs"] +mod test_utils; + use fastpfor::{FastPFor128, FastPFor256, FastPForBlock128}; +use test_utils::{compress, decompress}; mod common; +use crate::test_utils::{block_compress, roundtrip}; use common::{get_test_cases, test_input_sizes}; use fastpfor::cpp::CppFastPFor128; use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; @@ -15,7 +20,7 @@ use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; /// C++ `AnyLenCodec` encode → Rust `BlockCodec` decode (same wire format for block-aligned data). #[test] fn test_rust_decompresses_cpp_encoded_data() { - let mut codec_cpp = CppFastPFor128::new(); + let mut codec_cpp = CppFastPFor128::default(); let mut codec_rs = FastPForBlock128::default(); for n in test_input_sizes() { @@ -50,7 +55,7 @@ fn test_rust_decompresses_cpp_encoded_data() { /// Rust `BlockCodec` encode → C++ `AnyLenCodec` decode (same wire format). 
#[test] fn test_cpp_decompresses_rust_block_encoded_data() { - let mut codec_cpp = CppFastPFor128::new(); + let mut codec_cpp = CppFastPFor128::default(); let mut codec_rs = FastPForBlock128::default(); for n in test_input_sizes() { @@ -87,30 +92,30 @@ fn test_cpp_decompresses_rust_block_encoded_data() { /// Cross-check: Rust block encode and C++ any-length encode produce identical bytes for block-aligned input. #[test] fn test_rust_and_cpp_compression_matches() { - let mut codec_cpp = CppFastPFor128::new(); - let mut codec_rs = FastPForBlock128::default(); - for n in test_input_sizes() { for input in get_test_cases(n + 128) { if input.len() % 128 != 0 || input.is_empty() { continue; } - let (blocks_rs, _) = slice_to_blocks::(&input); - - let mut cpp_compressed = Vec::new(); - codec_cpp.encode(&input, &mut cpp_compressed).unwrap(); - - let mut rs_compressed = Vec::new(); - codec_rs - .encode_blocks(blocks_rs, &mut rs_compressed) - .unwrap(); - + let compressed = compress::(&input); assert_eq!( - cpp_compressed, - rs_compressed, + compressed, + block_compress::(&input), "Compressed bytes differ for input len {}", input.len() ); + assert_eq!( + decompress::(&compressed, None), + input, + "Rust→C++ roundtrip mismatch for len {}", + input.len() + ); + assert_eq!( + decompress::(&compressed, None), + input, + "Rust→C++ roundtrip mismatch for len {}", + input.len() + ); } } } @@ -134,13 +139,8 @@ fn test_rust_anylen_roundtrip() { #[test] fn test_rust_anylen_128_roundtrip() { for n in test_input_sizes() { - let mut codec = FastPFor128::default(); for input in get_test_cases(n) { - let mut compressed = Vec::new(); - codec.encode(&input, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - codec.decode(&compressed, &mut decoded, None).unwrap(); - assert_eq!(decoded, input, "Rust AnyLenCodec 128 round-trip failed"); + roundtrip::(&input); } } } diff --git a/tests/encode_paths.rs b/tests/encode_paths.rs index b9402e0..ffffda0 100644 --- a/tests/encode_paths.rs +++ 
b/tests/encode_paths.rs @@ -6,11 +6,11 @@ #![cfg(feature = "rust")] -#[path = "../benches/bench_utils.rs"] -mod bench_utils; +#[path = "../src/test_utils.rs"] +mod test_utils; -use bench_utils::{block_roundtrip, roundtrip}; -use fastpfor::{AnyLenCodec, FastPFor128, FastPFor256, FastPForBlock256, JustCopy, VariableByte}; +use fastpfor::{FastPFor128, FastPFor256, FastPForBlock256, JustCopy, VariableByte}; +use test_utils::{block_roundtrip, decompress, roundtrip}; // ── VariableByte round-trip ─────────────────────────────────────────────────── @@ -95,10 +95,7 @@ fn fastpfor_encode_128_block_with_exceptions() { /// Decompressing an empty stream succeeds with empty output. #[test] fn variable_byte_anylen_decompress_short_input() { - let mut codec = VariableByte::new(); - let mut out = Vec::new(); - let result = codec.decode(&[], &mut out, None); - assert!(result.is_ok()); + let out = decompress::(&[], None); assert!(out.is_empty()); } @@ -106,12 +103,5 @@ fn variable_byte_anylen_decompress_short_input() { #[test] fn variable_byte_anylen_decompress_into_small_vec() { let data: Vec = (1..=20).collect(); - let mut compressed = Vec::new(); - VariableByte::new().encode(&data, &mut compressed).unwrap(); - - let mut out = Vec::new(); - VariableByte::new() - .decode(&compressed, &mut out, None) - .unwrap(); - assert_eq!(out, data); + roundtrip::(&data); } From 2cabce181593ca36819db051e4e17cff663b97fc Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 23:12:21 -0400 Subject: [PATCH 09/26] fixes --- src/cpp/codecs.rs | 14 ++-- src/cpp/tests.rs | 6 +- src/rust/composite.rs | 32 +++---- src/rust/integer_compression/just_copy.rs | 24 ++---- src/rust/integer_compression/variable_byte.rs | 20 +---- src/test_utils.rs | 75 ++++++++--------- tests/benchmark_smoke.rs | 20 +++-- tests/cpp_compat_tests.rs | 11 ++- tests/decode_validation.rs | 83 ++++++++----------- tests/encode_paths.rs | 2 +- 10 files changed, 116 insertions(+), 171 deletions(-) diff --git 
a/src/cpp/codecs.rs b/src/cpp/codecs.rs index ed099df..300b340 100644 --- a/src/cpp/codecs.rs +++ b/src/cpp/codecs.rs @@ -212,32 +212,32 @@ pub(crate) mod tests { #[test] fn test_varint_u64() { - let input = vec![1u64, 2, 3, 4, 5]; - roundtrip64::(&input); + roundtrip64::(&[1u64, 2, 3, 4, 5]); } #[test] fn test_decode32_empty_input() { - assert!(decompress::(&[], None).is_empty()); + assert!(decompress::(&[], None).unwrap().is_empty()); } #[test] fn test_decode32_cpp_empty_format() { - assert!(decompress::(&[0u32], Some(0)).is_empty()); + let result = decompress::(&[0u32], Some(0)).unwrap(); + assert!(result.is_empty()); } #[test] fn test_decode64_empty_input() { - assert!(decompress64::(&[]).is_empty()); + assert!(decompress64::(&[]).unwrap().is_empty()); } #[test] fn test_decode64_empty_format() { - assert!(decompress64::(&[]).is_empty()); + assert!(decompress64::(&[]).unwrap().is_empty()); } #[test] fn test_decode_empty_input() { - assert!(decompress::(&[], None).is_empty()); + assert!(decompress::(&[], None).unwrap().is_empty()); } } diff --git a/src/cpp/tests.rs b/src/cpp/tests.rs index fa8d3e8..597dbff 100644 --- a/src/cpp/tests.rs +++ b/src/cpp/tests.rs @@ -1,4 +1,4 @@ -use crate::test_utils::roundtrip_with; +use crate::test_utils::roundtrip; /// Test all codecs compile and do a basic 32-bit roundtrip macro_rules! test_anylen { @@ -7,7 +7,7 @@ macro_rules! test_anylen { #[test] #[allow(non_snake_case)] fn $name() { - roundtrip_with(&mut $crate::cpp::$name::default(), &[1u32, 2, 3, 4, 5]); + roundtrip::<$crate::cpp::$name>(&[1u32, 2, 3, 4, 5]); } )* }; @@ -51,7 +51,7 @@ macro_rules! 
test_anylen_128 { #[allow(non_snake_case)] fn $name() { let input: Vec = (1..=128).collect(); - roundtrip_with(&mut $crate::cpp::$name::default(), &input); + roundtrip::<$crate::cpp::$name>(&input); } )* }; diff --git a/src/rust/composite.rs b/src/rust/composite.rs index 912a453..c495ee6 100644 --- a/src/rust/composite.rs +++ b/src/rust/composite.rs @@ -140,28 +140,20 @@ mod tests { #[test] fn test_decode_truly_empty_input() { // Decoding a zero-length slice (not even a header word) must succeed with empty output. - assert!(decompress::(&[], None).is_empty()); + assert!(decompress::(&[], None).unwrap().is_empty()); } #[test] fn test_decode_empty_input_with_expected_zero() { // Empty input with expected_len=0 must succeed. - assert!(decompress::(&[], Some(0)).is_empty()); + assert!(decompress::(&[], Some(0)).unwrap().is_empty()); } #[test] fn test_decode_empty_input_with_nonzero_expected_errors() { // Empty input: max_decompressed_len(0) == 0, so any expected_len > 0 fails // with ExpectedCountExceedsMax before decoding begins. - let mut codec = CompositeCodec::::default(); - let err = codec.decode(&[], &mut Vec::new(), Some(5)).unwrap_err(); - assert!(matches!( - err, - FastPForError::ExpectedCountExceedsMax { - expected: 5, - max: 0 - } - )); + decompress::>(&[], Some(5)).unwrap_err(); } #[test] @@ -170,11 +162,8 @@ mod tests { // than attempting a multi-gigabyte allocation. // Regression: fuzzer found bytes [0x04, 0x35, 0x19] → u32 LE 0x00193504 = 1_651_460 // fed to FastPFor256.decode caused an OOM via a ~2.5 GB Vec::resize. 
- let mut codec = CompositeCodec::::default(); - let mut out = Vec::new(); - let input = [0x0019_3504u32]; // n_blocks = 1_651_460, rest is empty - assert!(codec.decode(&input, &mut out, None).is_err()); - assert!(out.is_empty()); + let input = &[0x0019_3504u32]; // n_blocks = 1_651_460, rest is empty + decompress::>(input, None).unwrap_err(); } #[test] @@ -186,15 +175,15 @@ mod tests { #[test] fn test_decode_with_expected_len() { let data: Vec = (0..600).collect(); - let encoded = compress::(&data); - let decoded = decompress::(&encoded, Some(600)); + let encoded = compress::(&data).unwrap(); + let decoded = decompress::(&encoded, Some(600)).unwrap(); assert_eq!(decoded, data); } #[test] fn test_decode_expected_len_mismatch_errors() { let data: Vec = (0..100).collect(); - let encoded = compress::(&data); + let encoded = compress::(&data).unwrap(); let mut codec = Comp256Vb::default(); let err = codec .decode(&encoded, &mut Vec::new(), Some(50)) @@ -211,10 +200,9 @@ mod tests { #[test] fn test_decode_expected_len_exceeds_max_errors() { let data: Vec = (0..10).collect(); - let encoded = compress::(&data); + let encoded = compress::(&data).unwrap(); let huge = (Comp256Vb::max_decompressed_len(encoded.len()) + 1) as u32; - let mut codec = Comp256Vb::default(); - let err = codec + let err = Comp256Vb::default() .decode(&encoded, &mut Vec::new(), Some(huge)) .unwrap_err(); assert!(matches!(err, FastPForError::ExpectedCountExceedsMax { .. 
})); diff --git a/src/rust/integer_compression/just_copy.rs b/src/rust/integer_compression/just_copy.rs index 2476aa2..ba911f0 100644 --- a/src/rust/integer_compression/just_copy.rs +++ b/src/rust/integer_compression/just_copy.rs @@ -46,34 +46,20 @@ impl AnyLenCodec for JustCopy { #[cfg(test)] mod tests { use super::*; - use crate::FastPForError; - use crate::test_utils::{decompress, roundtrip}; + use crate::test_utils::{decompress, roundtrip, roundtrip_expected}; #[test] - fn justcopy_default_and_roundtrip() { + fn justcopy_roundtrip() { roundtrip::(&[1u32, 2, 3]); } #[test] - fn justcopy_decode_with_expected_len_ok() { - let data = vec![1u32, 2, 3]; - let out = decompress::(&data, Some(3)); - assert_eq!(out, data); + fn justcopy_roundtrip_with_expected_len_none() { + roundtrip_expected::(&[1u32, 2, 3], None); } #[test] - #[expect(clippy::default_constructed_unit_structs)] fn justcopy_decode_expected_len_mismatch_errors() { - let data = vec![1u32, 2, 3]; - let err = JustCopy::default() - .decode(&data, &mut Vec::new(), Some(2)) - .unwrap_err(); - assert!(matches!( - err, - FastPForError::DecodedCountMismatch { - actual: 3, - expected: 2 - } - )); + decompress::(&[1u32, 2, 3], Some(2)).unwrap_err(); } } diff --git a/src/rust/integer_compression/variable_byte.rs b/src/rust/integer_compression/variable_byte.rs index a2bd260..4550d04 100644 --- a/src/rust/integer_compression/variable_byte.rs +++ b/src/rust/integer_compression/variable_byte.rs @@ -364,7 +364,6 @@ mod tests { use std::hash::{BuildHasher, Hasher}; use super::*; - use crate::test_utils::{compress, decompress, roundtrip}; fn verify_u32_roundtrip(input: &[u32]) { @@ -658,27 +657,16 @@ mod tests { #[test] fn test_anylen_decode_with_expected_len_ok() { let data = vec![1u32, 2, 3]; - let encoded = compress::(&data); - let decoded = decompress::(&encoded, Some(3)); + let encoded = compress::(&data).unwrap(); + let decoded = decompress::(&encoded, Some(3)).unwrap(); assert_eq!(decoded, data); } #[test] - 
#[expect(clippy::default_constructed_unit_structs)] fn test_anylen_decode_expected_len_mismatch_errors() { // expected_len must be >= actual to avoid OutputBufferTooSmall; use a larger // value to exercise the is_decoded_mismatch path. - let data = vec![1u32, 2, 3]; - let encoded = compress::(&data); - let err = VariableByte::default() - .decode(&encoded, &mut Vec::new(), Some(10)) - .unwrap_err(); - assert!(matches!( - err, - FastPForError::DecodedCountMismatch { - actual: 3, - expected: 10 - } - )); + let encoded = compress::(&[1u32, 2, 3]).unwrap(); + decompress::(&encoded, Some(10)).unwrap_err(); } } diff --git a/src/test_utils.rs b/src/test_utils.rs index 7d974cc..808956a 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -19,7 +19,7 @@ #[cfg(feature = "cpp")] use fastpfor::BlockCodec64; #[allow(unused_imports)] -use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; +use fastpfor::{AnyLenCodec, BlockCodec, FastPForResult, slice_to_blocks}; #[cfg(feature = "rust")] use fastpfor::{ FastPFor128, FastPFor256, FastPForBlock128, FastPForBlock256, JustCopy, VariableByte, @@ -32,43 +32,43 @@ const SEED: u64 = 456; // --------------------------------------------------------------------------- /// Encode `data` with a caller-owned codec, decode with `expected_len: None`, assert round-trip. 
-pub fn roundtrip_with(codec: &mut C, data: &[u32]) { +pub fn roundtrip_expected(data: &[u32], expected_len: Option) { + let mut codec = C::default(); let mut compressed = Vec::new(); codec.encode(data, &mut compressed).unwrap(); let mut decoded = Vec::new(); - codec.decode(&compressed, &mut decoded, None).unwrap(); + codec + .decode(&compressed, &mut decoded, expected_len) + .unwrap(); assert_eq!(decoded, data); } pub fn roundtrip(data: &[u32]) { - let mut codec = C::default(); - roundtrip_with(&mut codec, data); + roundtrip_expected::(data, Some(data.len().try_into().unwrap())); } -pub fn compress(data: &[u32]) -> Vec { - let mut codec = C::default(); +pub fn compress(data: &[u32]) -> FastPForResult> { let mut compressed = Vec::new(); - codec.encode(data, &mut compressed).unwrap(); - compressed + C::default().encode(data, &mut compressed)?; + Ok(compressed) } -pub fn decompress(compressed: &[u32], expected_len: Option) -> Vec { - let mut codec = C::default(); +pub fn decompress( + compressed: &[u32], + expected_len: Option, +) -> FastPForResult> { let mut decompressed = Vec::new(); - codec - .decode(compressed, &mut decompressed, expected_len) - .unwrap(); - decompressed + C::default().decode(compressed, &mut decompressed, expected_len)?; + Ok(decompressed) } pub fn block_roundtrip(data: &[u32]) { - let compressed = block_compress::(data); - let decompressed = block_decompress::(&compressed, Some(data.len() as u32)); + let compressed = block_compress::(data).unwrap(); + let decompressed = block_decompress::(&compressed, Some(data.len() as u32)).unwrap(); assert_eq!(decompressed, data); } -pub fn block_compress(data: &[u32]) -> Vec { - let mut codec = C::default(); +pub fn block_compress(data: &[u32]) -> FastPForResult> { let (blocks, remainder) = slice_to_blocks::(data); assert_eq!( remainder.len(), @@ -76,17 +76,17 @@ pub fn block_compress(data: &[u32]) -> Vec { "data length must be a multiple of block size" ); let mut out = Vec::new(); - 
codec.encode_blocks(blocks, &mut out).unwrap(); - out + C::default().encode_blocks(blocks, &mut out)?; + Ok(out) } -pub fn block_decompress(compressed: &[u32], expected_len: Option) -> Vec { - let mut codec = C::default(); +pub fn block_decompress( + compressed: &[u32], + expected_len: Option, +) -> FastPForResult> { let mut out = Vec::new(); - codec - .decode_blocks(compressed, expected_len, &mut out) - .unwrap(); - out + C::default().decode_blocks(compressed, expected_len, &mut out)?; + Ok(out) } #[cfg(feature = "cpp")] @@ -100,19 +100,17 @@ pub fn roundtrip64(data: &[u64]) { } #[cfg(feature = "cpp")] -pub fn compress64(data: &[u64]) -> Vec { - let mut codec = C::default(); +pub fn compress64(data: &[u64]) -> FastPForResult> { let mut compressed = Vec::new(); - codec.encode64(data, &mut compressed).unwrap(); - compressed + C::default().encode64(data, &mut compressed)?; + Ok(compressed) } #[cfg(feature = "cpp")] -pub fn decompress64(compressed: &[u32]) -> Vec { - let mut codec = C::default(); +pub fn decompress64(compressed: &[u32]) -> FastPForResult> { let mut out = Vec::new(); - codec.decode64(compressed, &mut out).unwrap(); - out + C::default().decode64(compressed, &mut out)?; + Ok(out) } /// Run [`roundtrip`] for every pure-Rust any-length codec covered here (and optionally C++). 
@@ -145,8 +143,7 @@ where B: BlockCodec + Default, T: AnyLenCodec + Default, { - let mut codec = fastpfor::CompositeCodec::::default(); - roundtrip_with(&mut codec, data); + roundtrip::>(data); } // --------------------------------------------------------------------------- @@ -273,7 +270,7 @@ mod rust_bench { let original = generator(block_count * C::size()); Self { name, - compressed: block_compress::(&original), + compressed: block_compress::(&original).unwrap(), original, n_blocks: block_count, _codec: PhantomData, @@ -285,7 +282,7 @@ mod rust_bench { pub fn new(block_count: usize) -> Self { let original = generate_uniform_data_small_value_distribution(block_count * C::size()); Self { - compressed: block_compress::(&original), + compressed: block_compress::(&original).unwrap(), original, n_blocks: block_count, _codec: PhantomData, diff --git a/tests/benchmark_smoke.rs b/tests/benchmark_smoke.rs index 73eb88c..b17bc49 100644 --- a/tests/benchmark_smoke.rs +++ b/tests/benchmark_smoke.rs @@ -39,7 +39,8 @@ fn smoke_compression() { fn smoke_decompression() { for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { let decompressed = - block_decompress::(&fix.compressed, Some(fix.original.len() as u32)); + block_decompress::(&fix.compressed, Some(fix.original.len() as u32)) + .unwrap(); assert_eq!( decompressed.len(), fix.original.len(), @@ -69,13 +70,14 @@ fn smoke_block_sizes() { // 128-element blocks { - let compressed = block_compress::(&fix128.original); + let compressed = block_compress::(&fix128.original).unwrap(); assert_eq!( compressed, fix128.compressed, "128: compress output mismatch" ); let decompressed = - block_decompress::(&compressed, Some(fix128.original.len() as u32)); + block_decompress::(&compressed, Some(fix128.original.len() as u32)) + .unwrap(); assert_eq!( decompressed.len(), fix128.original.len(), @@ -86,13 +88,14 @@ fn smoke_block_sizes() { // 256-element blocks { - let compressed = block_compress::(&fix256.original); + let compressed = 
block_compress::(&fix256.original).unwrap(); assert_eq!( compressed, fix256.compressed, "256: compress output mismatch" ); let decompressed = - block_decompress::(&compressed, Some(fix256.original.len() as u32)); + block_decompress::(&compressed, Some(fix256.original.len() as u32)) + .unwrap(); assert_eq!( decompressed.len(), fix256.original.len(), @@ -105,7 +108,7 @@ fn smoke_block_sizes() { #[test] fn smoke_compression_ratio() { for fix in ratio_fixtures::(SMOKE_BLOCK_COUNT) { - let out = block_compress::(&fix.original); + let out = block_compress::(&fix.original).unwrap(); assert!( !out.is_empty(), "{}: compressed output must be non-empty", @@ -130,11 +133,12 @@ fn smoke_cpp_vs_rust() { for (_, fix) in compress_fixtures::(&[SMOKE_BLOCK_COUNT]) { let expected_len = fix.n_blocks * FastPForBlock128::size(); - let out = decompress::(&fix.compressed, Some(expected_len as u32)); + let out = decompress::(&fix.compressed, Some(expected_len as u32)).unwrap(); assert_eq!(out, fix.original, "{}: Bad C++ roundtrip", fix.name); let out = - block_decompress::(&fix.compressed, Some(fix.original.len() as u32)); + block_decompress::(&fix.compressed, Some(fix.original.len() as u32)) + .unwrap(); assert_eq!(out, fix.original, "{}: Bad Rust roundtrip", fix.name); } } diff --git a/tests/cpp_compat_tests.rs b/tests/cpp_compat_tests.rs index 3299ed7..1072ed8 100644 --- a/tests/cpp_compat_tests.rs +++ b/tests/cpp_compat_tests.rs @@ -9,10 +9,9 @@ mod test_utils; use fastpfor::{FastPFor128, FastPFor256, FastPForBlock128}; -use test_utils::{compress, decompress}; +use test_utils::{block_compress, compress, decompress, roundtrip}; mod common; -use crate::test_utils::{block_compress, roundtrip}; use common::{get_test_cases, test_input_sizes}; use fastpfor::cpp::CppFastPFor128; use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; @@ -97,21 +96,21 @@ fn test_rust_and_cpp_compression_matches() { if input.len() % 128 != 0 || input.is_empty() { continue; } - let compressed = 
compress::(&input); + let compressed = compress::(&input).unwrap(); assert_eq!( compressed, - block_compress::(&input), + block_compress::(&input).unwrap(), "Compressed bytes differ for input len {}", input.len() ); assert_eq!( - decompress::(&compressed, None), + decompress::(&compressed, None).unwrap(), input, "Rust→C++ roundtrip mismatch for len {}", input.len() ); assert_eq!( - decompress::(&compressed, None), + decompress::(&compressed, None).unwrap(), input, "Rust→C++ roundtrip mismatch for len {}", input.len() diff --git a/tests/decode_validation.rs b/tests/decode_validation.rs index 56cf719..8d33527 100644 --- a/tests/decode_validation.rs +++ b/tests/decode_validation.rs @@ -3,48 +3,31 @@ //! ([`fastpfor::FastPFor128`] only). //! //! Error cases that previously lived in `fastpfor.rs` unit tests (`try_decode` / -//! `decode_blocks`) are exercised here via `assert_fails` and `AnyLenCodec::decode`. +//! `decode_blocks`) are exercised here via `decompress` and `AnyLenCodec::decode`. #![cfg(feature = "rust")] -use bytemuck::{cast_slice, cast_slice_mut}; -use fastpfor::{AnyLenCodec, BlockCodec, FastPFor128, FastPForBlock128}; +#[path = "../src/test_utils.rs"] +mod test_utils; + +use crate::test_utils::{block_compress, decompress}; +use bytemuck::cast_slice_mut; +use fastpfor::{FastPFor128, FastPForBlock128}; /// Matches `DEFAULT_PAGE_SIZE` in `fastpfor` (64 Ki integers per page). const DEFAULT_PAGE_SIZE: u32 = 65536; -/// `compressed` must not decode successfully. Use `Some(128)` for a single full 128-block -/// stream; `None` for arbitrary garbage. 
-fn assert_fails(compressed: &[u32], expected_len: Option) { - let mut codec = C::default(); - let mut out = Vec::new(); - assert!( - codec.decode(compressed, &mut out, expected_len).is_err(), - "expected decode to fail with Err, but it succeeded" - ); -} - -fn encode(data: &[u32]) -> Vec { - assert_eq!(data.len() % C::size(), 0); - let blocks: &[C::Block] = cast_slice(data); - let mut out = Vec::new(); - C::default() - .encode_blocks(blocks, &mut out) - .expect("encode one or more blocks"); - out -} - fn compressed_with_exceptions() -> Vec { let data: Vec = (0..128u32) .map(|i| if i % 2 == 0 { 1u32 << 30 } else { 3 }) .collect(); - encode::(&data) + block_compress::(&data).unwrap() } fn compressed_with_index1_exceptions() -> Vec { let mut data = vec![1u32; 128]; data[0] = 3; - encode::(&data) + block_compress::(&data).unwrap() } fn meta_byte_start(compressed: &[u32]) -> usize { @@ -82,46 +65,46 @@ fn decode_returns_error_for_libfuzzer_arbitrary_words() { u32::MAX, 36, ]; - assert_fails::(data, None); + decompress::(data, None).unwrap_err(); } #[test] fn decode_returns_error_for_minimal_three_word_garbage() { - assert_fails::(&[0x200, 0, 1], None); + decompress::(&[0x200, 0, 1], None).unwrap_err(); } #[test] fn decode_returns_error_when_block_stream_truncated() { - let compressed = encode::(&[42u32; 128]); + let compressed = block_compress::(&[42u32; 128]).unwrap(); for truncated_len in [1, 2, compressed.len() / 2, compressed.len() - 1] { - assert_fails::(&compressed[..truncated_len], Some(128)); + decompress::(&compressed[..truncated_len], Some(128)).unwrap_err(); } } #[test] fn decode_returns_error_when_where_meta_word_points_past_buffer() { - let mut compressed = encode::(&[1u32; 128]); + let mut compressed = block_compress::(&[1u32; 128]).unwrap(); compressed[1] = u32::MAX; - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] fn decode_returns_error_when_only_out_length_word_present() { - 
assert_fails::(&[128u32], Some(128)); + decompress::(&[128u32], Some(128)).unwrap_err(); } #[test] fn decode_returns_error_when_where_meta_out_of_bounds_on_exception_stream() { let mut compressed = compressed_with_exceptions(); compressed[1] = u32::MAX; - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] fn decode_returns_error_when_bytesize_points_past_end() { let mut compressed = compressed_with_exceptions(); compressed[1] = compressed.len() as u32 - 1; - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] @@ -129,7 +112,7 @@ fn decode_returns_error_when_bytesize_overflows_length() { let mut compressed = compressed_with_exceptions(); let bytesize_idx = 1 + compressed[1] as usize; compressed[bytesize_idx] = u32::MAX - 3; - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] @@ -138,7 +121,7 @@ fn decode_returns_error_when_bitmap_reads_past_end() { let bytesize_idx = 1 + compressed[1] as usize; let remaining = (compressed.len() - bytesize_idx - 1) as u32; compressed[bytesize_idx] = remaining * 4; - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] @@ -146,13 +129,13 @@ fn decode_returns_error_when_exception_group_size_exceeds_page() { let mut compressed = compressed_with_exceptions(); let size_idx = bitmap_idx(&compressed) + 1; compressed[size_idx] = DEFAULT_PAGE_SIZE + 1; - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] fn decode_returns_error_when_exception_bitstream_truncated() { let compressed = compressed_with_exceptions(); - assert_fails::(&compressed[..compressed.len() - 2], Some(128)); + decompress::(&compressed[..compressed.len() - 2], Some(128)).unwrap_err(); } #[test] @@ -160,14 +143,14 @@ fn decode_returns_error_when_packed_bit_width_byte_too_large() { let mut compressed = 
compressed_with_exceptions(); let start = meta_byte_start(&compressed); cast_slice_mut::<_, u8>(&mut compressed)[start] = 33; - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] fn decode_returns_error_when_packed_region_truncated_before_metadata() { let compressed = compressed_with_exceptions(); let where_meta = compressed[1] as usize; - assert_fails::(&compressed[..where_meta], Some(128)); + decompress::(&compressed[..where_meta], Some(128)).unwrap_err(); } #[test] @@ -178,7 +161,7 @@ fn decode_returns_error_when_exception_maxbits_too_large() { if let Some((_, _, mb_off)) = find_exception_block(bytes, start) { bytes[mb_off] = 33; } - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] @@ -189,7 +172,7 @@ fn decode_returns_error_when_exception_index_underflows_optimal_bits() { if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { bytes[mb_off] = bytes[bb_off].saturating_sub(1); } - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] @@ -200,26 +183,26 @@ fn decode_returns_error_when_exception_index_equals_optimal_bits() { if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { bytes[mb_off] = bytes[bb_off]; } - assert_fails::(&compressed, Some(128)); + decompress::(&compressed, Some(128)).unwrap_err(); } #[test] fn decode_returns_error_when_index1_exception_position_byte_truncated() { let compressed = compressed_with_index1_exceptions(); - assert_fails::(&compressed[..compressed.len() - 1], Some(128)); + decompress::(&compressed[..compressed.len() - 1], Some(128)).unwrap_err(); } #[test] fn decode_returns_error_when_exception_position_byte_truncated() { let compressed = compressed_with_exceptions(); - assert_fails::(&compressed[..compressed.len() - 1], Some(128)); + decompress::(&compressed[..compressed.len() - 1], Some(128)).unwrap_err(); } #[test] fn 
decode_returns_error_when_index1_exception_position_out_of_block() { let mut data = vec![1u32; 128]; data[0] = 3; - let mut buf = encode::(&data); + let mut buf = block_compress::(&data).unwrap(); let start = meta_byte_start(&buf); let bytes: &mut [u8] = cast_slice_mut(&mut buf); if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { @@ -227,7 +210,7 @@ fn decode_returns_error_when_index1_exception_position_out_of_block() { bytes[mb_off + 1] = 200; } } - assert_fails::(&buf, Some(128)); + decompress::(&buf, Some(128)).unwrap_err(); } #[test] @@ -235,7 +218,7 @@ fn decode_returns_error_when_exception_position_out_of_block() { let data: Vec = (0..128u32) .map(|i| if i % 4 == 0 { 1u32 << 30 } else { 1 }) .collect(); - let mut buf = encode::(&data); + let mut buf = block_compress::(&data).unwrap(); let start = meta_byte_start(&buf); let bytes: &mut [u8] = cast_slice_mut(&mut buf); if let Some((bb_off, _, mb_off)) = find_exception_block(bytes, start) { @@ -243,5 +226,5 @@ fn decode_returns_error_when_exception_position_out_of_block() { bytes[mb_off + 1] = 200; } } - assert_fails::(&buf, Some(128)); + decompress::(&buf, Some(128)).unwrap_err(); } diff --git a/tests/encode_paths.rs b/tests/encode_paths.rs index ffffda0..cf2848a 100644 --- a/tests/encode_paths.rs +++ b/tests/encode_paths.rs @@ -95,7 +95,7 @@ fn fastpfor_encode_128_block_with_exceptions() { /// Decompressing an empty stream succeeds with empty output. 
#[test] fn variable_byte_anylen_decompress_short_input() { - let out = decompress::(&[], None); + let out = decompress::(&[], None).unwrap(); assert!(out.is_empty()); } From 18e8fc8ef8d3a7e45f833c6775165001de4a6203 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 23:21:45 -0400 Subject: [PATCH 10/26] cleanup --- src/rust/integer_compression/fastpfor.rs | 6 +++++- tests/cpp_compat_tests.rs | 11 +++++++---- tests/decode_validation.rs | 3 ++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index 8c18b5e..fbad95f 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -605,7 +605,11 @@ mod tests { .unwrap(); let mut decoded = Vec::new(); FastPFor::::default() - .decode_blocks(&compressed, Some((blocks.len() * N) as u32), &mut decoded) + .decode_blocks( + &compressed, + Some((blocks.len() * N).try_into().unwrap()), + &mut decoded, + ) .unwrap(); decoded } diff --git a/tests/cpp_compat_tests.rs b/tests/cpp_compat_tests.rs index 1072ed8..c2945a8 100644 --- a/tests/cpp_compat_tests.rs +++ b/tests/cpp_compat_tests.rs @@ -21,6 +21,7 @@ use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; fn test_rust_decompresses_cpp_encoded_data() { let mut codec_cpp = CppFastPFor128::default(); let mut codec_rs = FastPForBlock128::default(); + let mut cpp_compressed = Vec::new(); for n in test_input_sizes() { for input in get_test_cases(n + 128) { @@ -29,7 +30,7 @@ fn test_rust_decompresses_cpp_encoded_data() { } let n_blocks = input.len() / 128; - let mut cpp_compressed = Vec::new(); + cpp_compressed.truncate(0); codec_cpp.encode(&input, &mut cpp_compressed).unwrap(); let mut rust_decoded = Vec::new(); @@ -122,12 +123,14 @@ fn test_rust_and_cpp_compression_matches() { /// Rust `AnyLenCodec` (`CompositeCodec`) encoder → round-trip. 
#[test] fn test_rust_anylen_roundtrip() { + let mut codec = FastPFor256::default(); + let mut compressed = Vec::new(); + let mut decoded = Vec::new(); for n in test_input_sizes() { - let mut codec = FastPFor256::default(); for input in get_test_cases(n) { - let mut compressed = Vec::new(); + compressed.truncate(0); + decoded.truncate(0); codec.encode(&input, &mut compressed).unwrap(); - let mut decoded = Vec::new(); codec.decode(&compressed, &mut decoded, None).unwrap(); assert_eq!(decoded, input, "Rust AnyLenCodec round-trip failed"); } diff --git a/tests/decode_validation.rs b/tests/decode_validation.rs index 8d33527..a9844cc 100644 --- a/tests/decode_validation.rs +++ b/tests/decode_validation.rs @@ -10,10 +10,11 @@ #[path = "../src/test_utils.rs"] mod test_utils; -use crate::test_utils::{block_compress, decompress}; use bytemuck::cast_slice_mut; use fastpfor::{FastPFor128, FastPForBlock128}; +use crate::test_utils::{block_compress, decompress}; + /// Matches `DEFAULT_PAGE_SIZE` in `fastpfor` (64 Ki integers per page). const DEFAULT_PAGE_SIZE: u32 = 65536; From b15a7837332cb676ce4c446add82f6affde5330f Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 23:27:02 -0400 Subject: [PATCH 11/26] roundtrip --- src/rust/integer_compression/fastpfor.rs | 39 ++++++------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index fbad95f..cf5030d 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -589,31 +589,10 @@ where #[cfg(test)] mod tests { use super::*; + use crate::test_utils::block_roundtrip; // ── Generic helpers ─────────────────────────────────────────────────────── - /// Encode `data` with `FastPFor`, decode it back, and return the result. 
- fn roundtrip(data: &[u32]) -> Vec - where - FastPFor: BlockCodec, - [u32; N]: bytemuck::Pod, - { - let blocks: &[[u32; N]] = cast_slice(data); - let mut compressed = Vec::new(); - FastPFor::::default() - .encode_blocks(blocks, &mut compressed) - .unwrap(); - let mut decoded = Vec::new(); - FastPFor::::default() - .decode_blocks( - &compressed, - Some((blocks.len() * N).try_into().unwrap()), - &mut decoded, - ) - .unwrap(); - decoded - } - /// Encode `data` as a single batch of `[u32; N]` blocks and return the compressed words. fn encode_block(data: &[u32]) -> Vec where @@ -649,14 +628,14 @@ mod tests { fn fastpfor_test() { let mut data = vec![0u32; 256]; data[126] = u32::MAX; - assert_eq!(roundtrip::<256>(&data), data); + block_roundtrip::(&data); } #[test] fn fastpfor_test_128() { let mut data = vec![0u32; 128]; data[126] = u32::MAX; - assert_eq!(roundtrip::<128>(&data), data); + block_roundtrip::(&data); } #[test] @@ -677,31 +656,31 @@ mod tests { // Tests ported from C++ #[test] fn test_constant_sequence() { - assert_eq!(roundtrip::<128>(&vec![42u32; 65536]), vec![42u32; 65536]); + block_roundtrip::(&vec![42u32; 65536]); } #[test] fn test_alternating_sequence() { let data: Vec<_> = (0..65536u32).map(|i| u32::from(i % 2 != 0)).collect(); - assert_eq!(roundtrip::<128>(&data), data); + block_roundtrip::(&data); } #[test] fn test_large_numbers() { let data: Vec = (0..65536u32).map(|i| i + (1u32 << 30)).collect(); - assert_eq!(roundtrip::<128>(&data), data); + block_roundtrip::(&data); } #[test] fn cursor_api_roundtrip() { - assert_eq!(roundtrip::<256>(&vec![42u32; 256]), vec![42u32; 256]); + block_roundtrip::(&vec![42u32; 256]); } #[test] fn headless_compress_unfit_pagesize() { // 640 values with 128-block codec spans two pages (512 + 128), exercising the loop. 
let input: Vec = (0..640u32).collect(); - assert_eq!(roundtrip::<128>(&input), input); + block_roundtrip::(&input); } #[test] @@ -710,7 +689,7 @@ mod tests { let input: Vec = (0..1024u32) .map(|i| if i % 2 == 0 { 1 << 30 } else { 3 }) .collect(); - assert_eq!(roundtrip::<128>(&input), input); + block_roundtrip::(&input); } // ── Error / edge tests not covered by `tests/decode_validation.rs` ───── From bbea95e545a3721741b3dd49474c45a7433ceba8 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 23:38:33 -0400 Subject: [PATCH 12/26] cleanup --- src/rust/integer_compression/fastpfor.rs | 63 ++++++------------------ tests/decode_validation.rs | 10 ++-- 2 files changed, 17 insertions(+), 56 deletions(-) diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index cf5030d..494cd18 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -589,39 +589,10 @@ where #[cfg(test)] mod tests { use super::*; - use crate::test_utils::block_roundtrip; + use crate::test_utils::{block_compress, block_decompress, block_roundtrip}; // ── Generic helpers ─────────────────────────────────────────────────────── - /// Encode `data` as a single batch of `[u32; N]` blocks and return the compressed words. - fn encode_block(data: &[u32]) -> Vec - where - FastPFor: BlockCodec, - [u32; N]: bytemuck::Pod, - { - let mut out = Vec::new(); - FastPFor::::default() - .encode_blocks(cast_slice(data), &mut out) - .expect("compression must succeed"); - out - } - - /// Compressed data containing at least one non-trivial exception group. - fn compressed_with_exceptions() -> (Vec, Vec) { - let data: Vec = (0..256u32) - .map(|i| if i % 2 == 0 { 1u32 << 30 } else { 3 }) - .collect(); - (encode_block::<256>(&data), data) - } - - /// Compressed data whose exception group uses bit-width difference == 1 - /// (`maxbits - optimal_bits == 1`), triggering the `index == 1` branch. 
- fn compressed_with_index1_exceptions() -> (Vec, Vec) { - let mut data = vec![1u32; 256]; - data[0] = 3; // needs 2 bits → encoder picks optimal_bits=1, maxbits=2, index=1 - (encode_block::<256>(&data), data) - } - // ── Round-trip tests ────────────────────────────────────────────────────── #[test] @@ -641,16 +612,10 @@ mod tests { #[test] fn test_empty_blocks_ok() { // Empty input encodes to length header [0] (matches C++ FastPFor) and decodes cleanly. - let mut enc = Vec::new(); - FastPForBlock256::default() - .encode_blocks(&[], &mut enc) - .unwrap(); + let enc = block_compress::(&[]).unwrap(); assert_eq!(enc, [0]); - let mut dec = Vec::new(); - FastPForBlock256::default() - .decode_blocks(&enc, Some(0), &mut dec) - .unwrap(); - assert_eq!(dec, []); + let dec = block_decompress::(&enc, Some(0)).unwrap(); + assert!(dec.is_empty()); } // Tests ported from C++ @@ -723,7 +688,11 @@ mod tests { #[test] fn decode_where_meta_overflow() { // `decode_headless_blocks` only: no `AnyLenCodec` entry point passes this layout. - let (compressed, _) = compressed_with_exceptions(); + let data: Vec = (0..256u32) + .map(|i| if i % 2 == 0 { 1u32 << 30 } else { 3 }) + .collect(); + let compressed = block_compress::(&data).unwrap(); + let mut padded = vec![0u32]; padded.extend_from_slice(&compressed); padded[2] = u32::MAX; @@ -743,11 +712,10 @@ mod tests { #[test] fn decode_index1_branch_valid() { - let (compressed, data) = compressed_with_index1_exceptions(); - let mut out = Vec::new(); - FastPForBlock256::default() - .decode_blocks(&compressed, Some(256), &mut out) - .expect("decompression of index-1 data must succeed"); + let mut data = vec![1u32; 256]; + data[0] = 3; + let compressed = block_compress::(&data).unwrap(); + let out = block_decompress::(&compressed, Some(256)).unwrap(); assert_eq!(out, data); } @@ -756,10 +724,7 @@ mod tests { fn decode_blocks_header_only_input() { // Input with just the length header [0]: no blocks to decode. 
let input = vec![0u32]; - let mut out = Vec::new(); - FastPForBlock256::default() - .decode_blocks(&input, None, &mut out) - .unwrap(); + let out = block_decompress::(&input, None).unwrap(); assert!(out.is_empty()); } } diff --git a/tests/decode_validation.rs b/tests/decode_validation.rs index a9844cc..f403c46 100644 --- a/tests/decode_validation.rs +++ b/tests/decode_validation.rs @@ -25,12 +25,6 @@ fn compressed_with_exceptions() -> Vec { block_compress::(&data).unwrap() } -fn compressed_with_index1_exceptions() -> Vec { - let mut data = vec![1u32; 128]; - data[0] = 3; - block_compress::(&data).unwrap() -} - fn meta_byte_start(compressed: &[u32]) -> usize { let where_meta = compressed[1] as usize; (1 + where_meta + 1) * 4 @@ -189,7 +183,9 @@ fn decode_returns_error_when_exception_index_equals_optimal_bits() { #[test] fn decode_returns_error_when_index1_exception_position_byte_truncated() { - let compressed = compressed_with_index1_exceptions(); + let mut data = vec![1u32; 128]; + data[0] = 3; + let compressed = block_compress::(&data).unwrap(); decompress::(&compressed[..compressed.len() - 1], Some(128)).unwrap_err(); } From 29b4a0320dbaa481bbd42a83e85090fd1a7cefd1 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 22 Mar 2026 23:48:52 -0400 Subject: [PATCH 13/26] cleanup --- src/rust/integer_compression/fastpfor.rs | 6 +----- tests/basic_tests.rs | 6 +++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index 494cd18..676e57d 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -665,11 +665,7 @@ mod tests { #[test] fn uncompress_zero_input_length_err() { // Truly empty input (no header word at all) is invalid — C++ would crash reading *in. 
- assert!( - FastPForBlock256::default() - .decode_blocks(&[], None, &mut Vec::new()) - .is_err() - ); + block_decompress::(&[], None).unwrap_err(); } #[test] diff --git a/tests/basic_tests.rs b/tests/basic_tests.rs index 97b2c39..bbaa848 100644 --- a/tests/basic_tests.rs +++ b/tests/basic_tests.rs @@ -5,11 +5,12 @@ #[path = "../src/test_utils.rs"] mod test_utils; +use bytemuck::cast_slice; use fastpfor::{BlockCodec, FastPForBlock128, FastPForBlock256, slice_to_blocks}; use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng}; -use crate::test_utils::{block_roundtrip_all, roundtrip_all}; +use crate::test_utils::{block_compress, block_roundtrip_all, roundtrip_all}; mod common; @@ -27,8 +28,7 @@ fn spurious_out_test() { fn check(len: usize) { let x = vec![0u32; 1024]; let (blocks, _) = slice_to_blocks::(&x[..len]); - let mut out = Vec::new(); - C::default().encode_blocks(blocks, &mut out).unwrap(); + let out = block_compress::(cast_slice(blocks)).unwrap(); assert!(out.is_empty() || blocks.is_empty()); } for len in 0..32usize { From ee9568beedb05f3d1c2222dfc5ff26f50fd8b4c5 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 00:00:15 -0400 Subject: [PATCH 14/26] cleanup --- src/rust/composite.rs | 41 +++++++----------------- src/rust/integer_compression/fastpfor.rs | 4 --- src/test_utils.rs | 4 +-- 3 files changed, 14 insertions(+), 35 deletions(-) diff --git a/src/rust/composite.rs b/src/rust/composite.rs index c495ee6..31187a9 100644 --- a/src/rust/composite.rs +++ b/src/rust/composite.rs @@ -108,11 +108,9 @@ impl AnyLenCodec for CompositeCodec; + use crate::test_utils::{compress, decompress, roundtrip_composite, roundtrip_expected}; #[test] fn test_fastpfor256_vbyte_exact_two_blocks() { @@ -140,20 +138,20 @@ mod tests { #[test] fn test_decode_truly_empty_input() { // Decoding a zero-length slice (not even a header word) must succeed with empty output. 
- assert!(decompress::(&[], None).unwrap().is_empty()); + assert!(decompress::(&[], None).unwrap().is_empty()); } #[test] fn test_decode_empty_input_with_expected_zero() { // Empty input with expected_len=0 must succeed. - assert!(decompress::(&[], Some(0)).unwrap().is_empty()); + assert!(decompress::(&[], Some(0)).unwrap().is_empty()); } #[test] fn test_decode_empty_input_with_nonzero_expected_errors() { // Empty input: max_decompressed_len(0) == 0, so any expected_len > 0 fails // with ExpectedCountExceedsMax before decoding begins. - decompress::>(&[], Some(5)).unwrap_err(); + decompress::(&[], Some(5)).unwrap_err(); } #[test] @@ -163,7 +161,7 @@ mod tests { // Regression: fuzzer found bytes [0x04, 0x35, 0x19] → u32 LE 0x00193504 = 1_651_460 // fed to FastPFor256.decode caused an OOM via a ~2.5 GB Vec::resize. let input = &[0x0019_3504u32]; // n_blocks = 1_651_460, rest is empty - decompress::>(input, None).unwrap_err(); + decompress::(input, None).unwrap_err(); } #[test] @@ -175,36 +173,21 @@ mod tests { #[test] fn test_decode_with_expected_len() { let data: Vec = (0..600).collect(); - let encoded = compress::(&data).unwrap(); - let decoded = decompress::(&encoded, Some(600)).unwrap(); - assert_eq!(decoded, data); + roundtrip_expected::(&data, Some(600)); } #[test] fn test_decode_expected_len_mismatch_errors() { let data: Vec = (0..100).collect(); - let encoded = compress::(&data).unwrap(); - let mut codec = Comp256Vb::default(); - let err = codec - .decode(&encoded, &mut Vec::new(), Some(50)) - .unwrap_err(); - assert!(matches!( - err, - FastPForError::DecodedCountMismatch { - actual: 100, - expected: 50 - } - )); + let encoded = compress::(&data).unwrap(); + decompress::(&encoded, Some(50)).unwrap_err(); } #[test] fn test_decode_expected_len_exceeds_max_errors() { let data: Vec = (0..10).collect(); - let encoded = compress::(&data).unwrap(); - let huge = (Comp256Vb::max_decompressed_len(encoded.len()) + 1) as u32; - let err = Comp256Vb::default() - 
.decode(&encoded, &mut Vec::new(), Some(huge)) - .unwrap_err(); - assert!(matches!(err, FastPForError::ExpectedCountExceedsMax { .. })); + let encoded = compress::(&data).unwrap(); + let huge = (FastPFor256::max_decompressed_len(encoded.len()) + 1) as u32; + decompress::(&encoded, Some(huge)).unwrap_err(); } } diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index 676e57d..b1859e8 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -591,10 +591,6 @@ mod tests { use super::*; use crate::test_utils::{block_compress, block_decompress, block_roundtrip}; - // ── Generic helpers ─────────────────────────────────────────────────────── - - // ── Round-trip tests ────────────────────────────────────────────────────── - #[test] fn fastpfor_test() { let mut data = vec![0u32; 256]; diff --git a/src/test_utils.rs b/src/test_utils.rs index 808956a..d9f21ff 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -140,8 +140,8 @@ pub fn block_roundtrip_all(data: &[u32]) { #[cfg(feature = "rust")] pub fn roundtrip_composite(data: &[u32]) where - B: BlockCodec + Default, - T: AnyLenCodec + Default, + B: BlockCodec, + T: AnyLenCodec, { roundtrip::>(data); } From f8733d2ca2640ebbfd3bb4a4c23a74fd46a07e21 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 00:21:08 -0400 Subject: [PATCH 15/26] cleanup --- src/rust/integer_compression/fastpfor.rs | 4 +- src/test_utils.rs | 53 +++++++++++--------- tests/cpp_compat_tests.rs | 63 +++++------------------- 3 files changed, 42 insertions(+), 78 deletions(-) diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index b1859e8..d270e11 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -706,9 +706,7 @@ mod tests { fn decode_index1_branch_valid() { let mut data = vec![1u32; 256]; data[0] = 3; - let compressed = 
block_compress::(&data).unwrap(); - let out = block_decompress::(&compressed, Some(256)).unwrap(); - assert_eq!(out, data); + block_roundtrip::(&data); } /// `decode_blocks` with `expected_len: None` and header=0 returns `Ok` with empty output. diff --git a/src/test_utils.rs b/src/test_utils.rs index d9f21ff..c4f3d2a 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -31,20 +31,43 @@ const SEED: u64 = 456; // Generic codec helpers // --------------------------------------------------------------------------- +pub fn roundtrip(data: &[u32]) { + roundtrip_expected::(data, Some(data.len().try_into().unwrap())); +} + /// Encode `data` with a caller-owned codec, decode with `expected_len: None`, assert round-trip. -pub fn roundtrip_expected(data: &[u32], expected_len: Option) { +pub fn roundtrip_expected(data: &[u32], expected_len: Option) { + roundtrip_full::(data, expected_len); +} + +/// Encode `data` with a caller-owned codec, decode with `expected_len: None`, assert round-trip. +pub fn roundtrip_full(data: &[u32], expected_len: Option) { + let mut encoder = E::default(); + let mut compressed = Vec::new(); + encoder.encode(data, &mut compressed).unwrap(); + + let mut decoder = D::default(); + let mut decompressed = Vec::new(); + decoder + .decode(&compressed, &mut decompressed, expected_len) + .unwrap(); + assert_eq!(decompressed, data); +} + +#[cfg(feature = "cpp")] +pub fn roundtrip64(data: &[u64]) { let mut codec = C::default(); let mut compressed = Vec::new(); - codec.encode(data, &mut compressed).unwrap(); + codec.encode64(data, &mut compressed).unwrap(); let mut decoded = Vec::new(); - codec - .decode(&compressed, &mut decoded, expected_len) - .unwrap(); + codec.decode64(&compressed, &mut decoded).unwrap(); assert_eq!(decoded, data); } -pub fn roundtrip(data: &[u32]) { - roundtrip_expected::(data, Some(data.len().try_into().unwrap())); +pub fn block_roundtrip(data: &[u32]) { + let compressed = block_compress::(data).unwrap(); + let decompressed = 
block_decompress::(&compressed, Some(data.len() as u32)).unwrap(); + assert_eq!(decompressed, data); } pub fn compress(data: &[u32]) -> FastPForResult> { @@ -62,12 +85,6 @@ pub fn decompress( Ok(decompressed) } -pub fn block_roundtrip(data: &[u32]) { - let compressed = block_compress::(data).unwrap(); - let decompressed = block_decompress::(&compressed, Some(data.len() as u32)).unwrap(); - assert_eq!(decompressed, data); -} - pub fn block_compress(data: &[u32]) -> FastPForResult> { let (blocks, remainder) = slice_to_blocks::(data); assert_eq!( @@ -89,16 +106,6 @@ pub fn block_decompress( Ok(out) } -#[cfg(feature = "cpp")] -pub fn roundtrip64(data: &[u64]) { - let mut codec = C::default(); - let mut compressed = Vec::new(); - codec.encode64(data, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - codec.decode64(&compressed, &mut decoded).unwrap(); - assert_eq!(decoded, data); -} - #[cfg(feature = "cpp")] pub fn compress64(data: &[u64]) -> FastPForResult> { let mut compressed = Vec::new(); diff --git a/tests/cpp_compat_tests.rs b/tests/cpp_compat_tests.rs index c2945a8..b5c09e5 100644 --- a/tests/cpp_compat_tests.rs +++ b/tests/cpp_compat_tests.rs @@ -9,39 +9,26 @@ mod test_utils; use fastpfor::{FastPFor128, FastPFor256, FastPForBlock128}; -use test_utils::{block_compress, compress, decompress, roundtrip}; +use test_utils::{block_compress, block_decompress, compress, decompress, roundtrip}; mod common; use common::{get_test_cases, test_input_sizes}; use fastpfor::cpp::CppFastPFor128; -use fastpfor::{AnyLenCodec, BlockCodec, slice_to_blocks}; + +use crate::test_utils::roundtrip_full; /// C++ `AnyLenCodec` encode → Rust `BlockCodec` decode (same wire format for block-aligned data). 
#[test] fn test_rust_decompresses_cpp_encoded_data() { - let mut codec_cpp = CppFastPFor128::default(); - let mut codec_rs = FastPForBlock128::default(); - let mut cpp_compressed = Vec::new(); - for n in test_input_sizes() { for input in get_test_cases(n + 128) { if input.len() % 128 != 0 || input.is_empty() { continue; } - let n_blocks = input.len() / 128; - - cpp_compressed.truncate(0); - codec_cpp.encode(&input, &mut cpp_compressed).unwrap(); - - let mut rust_decoded = Vec::new(); - codec_rs - .decode_blocks( - &cpp_compressed, - Some(u32::try_from(n_blocks * 128).expect("block count fits in u32")), - &mut rust_decoded, - ) - .unwrap_or_else(|e| panic!("Rust decompress of C++ data failed: {e:?}")); - + let cpp_compressed = compress::(&input).unwrap(); + let rust_decoded = + block_decompress::(&cpp_compressed, Some(input.len() as u32)) + .unwrap_or_else(|e| panic!("Rust decompress of C++ data failed: {e:?}")); assert_eq!( rust_decoded, input, @@ -55,35 +42,14 @@ fn test_rust_decompresses_cpp_encoded_data() { /// Rust `BlockCodec` encode → C++ `AnyLenCodec` decode (same wire format). 
#[test] fn test_cpp_decompresses_rust_block_encoded_data() { - let mut codec_cpp = CppFastPFor128::default(); - let mut codec_rs = FastPForBlock128::default(); - for n in test_input_sizes() { for input in get_test_cases(n + 128) { if input.len() % 128 != 0 || input.is_empty() { continue; } - let (blocks, _) = slice_to_blocks::(&input); - let n_blocks = blocks.len(); - let expected_len = n_blocks * 128; - - let mut rs_compressed = Vec::new(); - codec_rs.encode_blocks(blocks, &mut rs_compressed).unwrap(); - - let mut cpp_decoded = Vec::new(); - codec_cpp - .decode( - &rs_compressed, - &mut cpp_decoded, - Some(u32::try_from(expected_len).expect("expected len fits in u32")), - ) - .unwrap_or_else(|e| panic!("C++ decompress of Rust data failed: {e:?}")); - - assert_eq!( - cpp_decoded, - input, - "Rust→C++ roundtrip mismatch for len {}", - input.len() + roundtrip_full::( + &input, + Some(input.len().try_into().unwrap()), ); } } @@ -123,16 +89,9 @@ fn test_rust_and_cpp_compression_matches() { /// Rust `AnyLenCodec` (`CompositeCodec`) encoder → round-trip. 
#[test] fn test_rust_anylen_roundtrip() { - let mut codec = FastPFor256::default(); - let mut compressed = Vec::new(); - let mut decoded = Vec::new(); for n in test_input_sizes() { for input in get_test_cases(n) { - compressed.truncate(0); - decoded.truncate(0); - codec.encode(&input, &mut compressed).unwrap(); - codec.decode(&compressed, &mut decoded, None).unwrap(); - assert_eq!(decoded, input, "Rust AnyLenCodec round-trip failed"); + roundtrip::(&input); } } } From a3b3f16192a760f9fa3ff256aab92f9d2843c7a2 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 00:29:15 -0400 Subject: [PATCH 16/26] cleanup --- README.md | 32 ++++++++++++++++---------------- tests/cpp_compat_tests.rs | 16 +++++++--------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 11abee1..2f3cc20 100644 --- a/README.md +++ b/README.md @@ -75,12 +75,12 @@ Create one instance per thread or synchronize access externally. ## Crate Features -| Feature | Default | Description | -|---------|---------|-------------| -| `rust` | **yes** | Pure-Rust implementation — no `unsafe`, no build dependencies | -| `cpp` | no | C++ wrapper via CXX — requires a C++14 compiler with SIMD support | -| `cpp_portable` | no | Enables `cpp`, compiles C++ with SSE4.2 baseline (runs on any x86-64 from ~2008+) | -| `cpp_native` | no | Enables `cpp`, compiles C++ with `-march=native` for maximum throughput on the build machine | +| Feature | Default | Description | +|----------------|---------|----------------------------------------------------------------------------------------------| +| `rust` | **yes** | Pure-Rust implementation — no `unsafe`, no build dependencies | +| `cpp` | no | C++ wrapper via CXX — requires a C++14 compiler with SIMD support | +| `cpp_portable` | no | Enables `cpp`, compiles C++ with SSE4.2 baseline (runs on any x86-64 from ~2008+) | +| `cpp_native` | no | Enables `cpp`, compiles C++ with `-march=native` for maximum throughput on the build 
machine | The `FASTPFOR_SIMD_MODE` environment variable (`portable` or `native`) can override the SIMD mode at build time. @@ -92,14 +92,14 @@ The `FASTPFOR_SIMD_MODE` environment variable (`portable` or `native`) can overr Rust block codecs require block-aligned input. `CompositeCodec` chains a block codec with a tail codec (e.g. `VariableByte`) to handle arbitrary-length input. `FastPFor256` and `FastPFor128` are type aliases for such composites. -| Codec | Description | -|----------------------------|-------------------------------------------------------------------------------------------| -| `FastPFor256` | `CompositeCodec` of `FastPForBlock256` + `VariableByte`. **Recommended for general use.** | -| `FastPFor128` | `CompositeCodec` of `FastPForBlock128` + `VariableByte` | -| `VariableByte` | Variable-byte encoding only; good for small integers | -| `JustCopy` | No compression; useful as a baseline | -| `FastPForBlock256` (block) | `FastPFor` with 256-element blocks; block-aligned input only | -| `FastPForBlock128` (block) | `FastPFor` with 128-element blocks; block-aligned input only | +| Codec | Description | +|--------------------|--------------------------------------------------------------| +| `FastPFor256` | `CompositeCodec` of `FastPForBlock256` + `VariableByte` | +| `FastPFor128` | `CompositeCodec` of `FastPForBlock128` + `VariableByte` | +| `VariableByte` | Variable-byte encoding, MSB is opposite to protobuf's varint | +| `JustCopy` | No compression; useful as a baseline | +| `FastPForBlock256` | `FastPFor` with 256-element blocks; block-aligned input only | +| `FastPForBlock128` | `FastPFor` with 128-element blocks; block-aligned input only | ### C++ (`cpp` feature) @@ -108,8 +108,8 @@ All C++ codecs are composite (any-length) and implement `AnyLenCodec` only. 
| Codec | Notes | |-----------------------------|------------------------------------------------------------------------| -| `CppFastPFor128` | `FastPFor + VByte` composite, 128-element blocks. Also supports `u64`. | -| `CppFastPFor256` | `FastPFor + VByte` composite, 256-element blocks. Also supports `u64`. | +| `CppFastPFor128` | `FastPFor + VByte` composite, 128-element blocks. Also supports `u64`. | +| `CppFastPFor256` | `FastPFor + VByte` composite, 256-element blocks. Also supports `u64`. | | `CppSimdFastPFor128` | SIMD-optimized 128-element variant | | `CppSimdFastPFor256` | SIMD-optimized 256-element variant | | `CppBP32` | Binary packing, 32-bit blocks | diff --git a/tests/cpp_compat_tests.rs b/tests/cpp_compat_tests.rs index b5c09e5..153b8bf 100644 --- a/tests/cpp_compat_tests.rs +++ b/tests/cpp_compat_tests.rs @@ -9,13 +9,13 @@ mod test_utils; use fastpfor::{FastPFor128, FastPFor256, FastPForBlock128}; -use test_utils::{block_compress, block_decompress, compress, decompress, roundtrip}; +use test_utils::{block_compress, block_decompress, compress, roundtrip, roundtrip_full}; mod common; use common::{get_test_cases, test_input_sizes}; use fastpfor::cpp::CppFastPFor128; -use crate::test_utils::roundtrip_full; +use crate::test_utils::decompress; /// C++ `AnyLenCodec` encode → Rust `BlockCodec` decode (same wire format for block-aligned data). 
#[test] @@ -60,27 +60,25 @@ fn test_cpp_decompresses_rust_block_encoded_data() { fn test_rust_and_cpp_compression_matches() { for n in test_input_sizes() { for input in get_test_cases(n + 128) { - if input.len() % 128 != 0 || input.is_empty() { + let len = input.len(); + if len % 128 != 0 || input.is_empty() { continue; } let compressed = compress::(&input).unwrap(); assert_eq!( compressed, block_compress::(&input).unwrap(), - "Compressed bytes differ for input len {}", - input.len() + "Compressed bytes differ for input len {len}", ); assert_eq!( decompress::(&compressed, None).unwrap(), input, - "Rust→C++ roundtrip mismatch for len {}", - input.len() + "Rust→C++ roundtrip mismatch for len {len}", ); assert_eq!( decompress::(&compressed, None).unwrap(), input, - "Rust→C++ roundtrip mismatch for len {}", - input.len() + "Rust→C++ roundtrip mismatch for len {len}", ); } } From 6b1ef41cba2c82fab587010776d83312a8cd020c Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 01:31:17 -0400 Subject: [PATCH 17/26] revert cpp testing --- src/cpp/codecs.rs | 60 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/src/cpp/codecs.rs b/src/cpp/codecs.rs index 300b340..74dbb3c 100644 --- a/src/cpp/codecs.rs +++ b/src/cpp/codecs.rs @@ -180,64 +180,98 @@ implement_cpp_codecs_64! { #[cfg(test)] pub(crate) mod tests { + use crate::codec::{AnyLenCodec, BlockCodec64}; use crate::cpp::codecs::{CppFastPFor128, CppFastPFor256, CppVByte, CppVarInt}; - use crate::test_utils::{decompress, decompress64, roundtrip, roundtrip64}; + + pub fn roundtrip_32(codec: &mut impl AnyLenCodec, input: &[u32]) { + let mut compressed = Vec::new(); + codec.encode(input, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + codec.decode(&compressed, &mut decoded, None).unwrap(); + assert_eq!(decoded, input); + } /// C++ `fastpfor256_codec` returns `CompositeCodec, VariableByte>` — already /// any-length. 
Use it directly; do not wrap in Rust `CompositeCodec`. #[test] fn test_cpp_fastpfor256_composite_anylen() { - roundtrip::(&[1, 2, 3, 4, 5]); + let mut codec = CppFastPFor256::new(); + roundtrip_32(&mut codec, &[1, 2, 3, 4, 5]); let data: Vec = (0..600).collect(); - roundtrip::(&data); + roundtrip_32(&mut codec, &data); } #[test] fn test_fastpfor128_anylen() { let data: Vec = (0..128).collect(); - roundtrip::(&data); + roundtrip_32(&mut CppFastPFor128::new(), &data); } #[test] fn test_fastpfor256_anylen() { let data: Vec = (0..256).collect(); - roundtrip::(&data); + roundtrip_32(&mut CppFastPFor256::new(), &data); } #[test] fn test_fastpfor256_u64() { let input: Vec = (0..256).collect(); - roundtrip64::(&input); + let mut codec = CppFastPFor256::new(); + let mut compressed = Vec::new(); + codec.encode64(&input, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + codec.decode64(&compressed, &mut decoded).unwrap(); + assert_eq!(decoded, input); } #[test] fn test_varint_u64() { - roundtrip64::(&[1u64, 2, 3, 4, 5]); + let input = vec![1u64, 2, 3, 4, 5]; + let mut codec = CppVarInt::new(); + let mut compressed = Vec::new(); + codec.encode64(&input, &mut compressed).unwrap(); + let mut decoded = Vec::new(); + codec.decode64(&compressed, &mut decoded).unwrap(); + assert_eq!(decoded, input); } #[test] fn test_decode32_empty_input() { - assert!(decompress::(&[], None).unwrap().is_empty()); + let mut codec = CppVByte::new(); + let mut out = Vec::new(); + codec.decode(&[], &mut out, None).unwrap(); + assert!(out.is_empty()); } #[test] fn test_decode32_cpp_empty_format() { - let result = decompress::(&[0u32], Some(0)).unwrap(); - assert!(result.is_empty()); + let mut codec = CppFastPFor128::new(); + let mut out = Vec::new(); + codec.decode(&[0u32], &mut out, Some(0)).unwrap(); + assert!(out.is_empty()); } #[test] fn test_decode64_empty_input() { - assert!(decompress64::(&[]).unwrap().is_empty()); + let mut codec = CppFastPFor256::new(); + let mut out: Vec = Vec::new(); 
+ codec.decode64(&[], &mut out).unwrap(); + assert!(out.is_empty()); } #[test] fn test_decode64_empty_format() { - assert!(decompress64::(&[]).unwrap().is_empty()); + let mut codec = CppVarInt::new(); + let mut out: Vec = Vec::new(); + codec.decode64(&[], &mut out).unwrap(); + assert!(out.is_empty()); } #[test] fn test_decode_empty_input() { - assert!(decompress::(&[], None).unwrap().is_empty()); + let mut codec = CppFastPFor128::new(); + let mut out = Vec::new(); + codec.decode(&[], &mut out, None).unwrap(); + assert!(out.is_empty()); } } From d01b333b4eb40b3bc50e13d85a23af56ba05eb47 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 02:14:37 -0400 Subject: [PATCH 18/26] Revert "revert cpp testing" This reverts commit 6b1ef41cba2c82fab587010776d83312a8cd020c. --- src/cpp/codecs.rs | 60 ++++++++++------------------------------------- 1 file changed, 13 insertions(+), 47 deletions(-) diff --git a/src/cpp/codecs.rs b/src/cpp/codecs.rs index 74dbb3c..300b340 100644 --- a/src/cpp/codecs.rs +++ b/src/cpp/codecs.rs @@ -180,98 +180,64 @@ implement_cpp_codecs_64! { #[cfg(test)] pub(crate) mod tests { - use crate::codec::{AnyLenCodec, BlockCodec64}; use crate::cpp::codecs::{CppFastPFor128, CppFastPFor256, CppVByte, CppVarInt}; - - pub fn roundtrip_32(codec: &mut impl AnyLenCodec, input: &[u32]) { - let mut compressed = Vec::new(); - codec.encode(input, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - codec.decode(&compressed, &mut decoded, None).unwrap(); - assert_eq!(decoded, input); - } + use crate::test_utils::{decompress, decompress64, roundtrip, roundtrip64}; /// C++ `fastpfor256_codec` returns `CompositeCodec, VariableByte>` — already /// any-length. Use it directly; do not wrap in Rust `CompositeCodec`. 
#[test] fn test_cpp_fastpfor256_composite_anylen() { - let mut codec = CppFastPFor256::new(); - roundtrip_32(&mut codec, &[1, 2, 3, 4, 5]); + roundtrip::(&[1, 2, 3, 4, 5]); let data: Vec = (0..600).collect(); - roundtrip_32(&mut codec, &data); + roundtrip::(&data); } #[test] fn test_fastpfor128_anylen() { let data: Vec = (0..128).collect(); - roundtrip_32(&mut CppFastPFor128::new(), &data); + roundtrip::(&data); } #[test] fn test_fastpfor256_anylen() { let data: Vec = (0..256).collect(); - roundtrip_32(&mut CppFastPFor256::new(), &data); + roundtrip::(&data); } #[test] fn test_fastpfor256_u64() { let input: Vec = (0..256).collect(); - let mut codec = CppFastPFor256::new(); - let mut compressed = Vec::new(); - codec.encode64(&input, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - codec.decode64(&compressed, &mut decoded).unwrap(); - assert_eq!(decoded, input); + roundtrip64::(&input); } #[test] fn test_varint_u64() { - let input = vec![1u64, 2, 3, 4, 5]; - let mut codec = CppVarInt::new(); - let mut compressed = Vec::new(); - codec.encode64(&input, &mut compressed).unwrap(); - let mut decoded = Vec::new(); - codec.decode64(&compressed, &mut decoded).unwrap(); - assert_eq!(decoded, input); + roundtrip64::(&[1u64, 2, 3, 4, 5]); } #[test] fn test_decode32_empty_input() { - let mut codec = CppVByte::new(); - let mut out = Vec::new(); - codec.decode(&[], &mut out, None).unwrap(); - assert!(out.is_empty()); + assert!(decompress::(&[], None).unwrap().is_empty()); } #[test] fn test_decode32_cpp_empty_format() { - let mut codec = CppFastPFor128::new(); - let mut out = Vec::new(); - codec.decode(&[0u32], &mut out, Some(0)).unwrap(); - assert!(out.is_empty()); + let result = decompress::(&[0u32], Some(0)).unwrap(); + assert!(result.is_empty()); } #[test] fn test_decode64_empty_input() { - let mut codec = CppFastPFor256::new(); - let mut out: Vec = Vec::new(); - codec.decode64(&[], &mut out).unwrap(); - assert!(out.is_empty()); + 
assert!(decompress64::(&[]).unwrap().is_empty()); } #[test] fn test_decode64_empty_format() { - let mut codec = CppVarInt::new(); - let mut out: Vec = Vec::new(); - codec.decode64(&[], &mut out).unwrap(); - assert!(out.is_empty()); + assert!(decompress64::(&[]).unwrap().is_empty()); } #[test] fn test_decode_empty_input() { - let mut codec = CppFastPFor128::new(); - let mut out = Vec::new(); - codec.decode(&[], &mut out, None).unwrap(); - assert!(out.is_empty()); + assert!(decompress::(&[], None).unwrap().is_empty()); } } From 69523996371b45cade96d95a3841c18965b6f6c8 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 02:14:43 -0400 Subject: [PATCH 19/26] cleanup --- src/cpp/codecs.rs | 22 ++++++++++++ src/cpp/tests.rs | 92 ----------------------------------------------- 2 files changed, 22 insertions(+), 92 deletions(-) diff --git a/src/cpp/codecs.rs b/src/cpp/codecs.rs index 300b340..158f5e3 100644 --- a/src/cpp/codecs.rs +++ b/src/cpp/codecs.rs @@ -51,6 +51,28 @@ macro_rules! implement_cpp_codecs { } } )* + + #[cfg(test)] + mod cpp_default { + $( + #[test] + #[allow(non_snake_case)] + fn $name() { + let _codec = $crate::cpp::$name::default(); + } + )* + } + + #[cfg(test)] + mod cpp_roundtrip { + $( + #[test] + #[allow(non_snake_case)] + fn $name() { + $crate::test_utils::roundtrip::<$crate::cpp::$name>(&[1u32, 2, 3, 4, 5]); + } + )* + } }; } diff --git a/src/cpp/tests.rs b/src/cpp/tests.rs index 597dbff..c9d5a1e 100644 --- a/src/cpp/tests.rs +++ b/src/cpp/tests.rs @@ -1,47 +1,5 @@ use crate::test_utils::roundtrip; -/// Test all codecs compile and do a basic 32-bit roundtrip -macro_rules! test_anylen { - ($($name:ident),* $(,)?) 
=> { - $( - #[test] - #[allow(non_snake_case)] - fn $name() { - roundtrip::<$crate::cpp::$name>(&[1u32, 2, 3, 4, 5]); - } - )* - }; -} - -test_anylen!( - CppBP32, - CppCopy, - CppFastBinaryPacking16, - CppFastBinaryPacking32, - CppFastBinaryPacking8, - CppFastPFor128, - CppFastPFor256, - CppMaskedVByte, - CppNewPFor, - CppOptPFor, - CppPFor, - CppPFor2008, - CppSimdBinaryPacking, - CppSimdFastPFor128, - CppSimdFastPFor256, - CppSimdGroupSimple, - CppSimdGroupSimpleRingBuf, - CppSimdNewPFor, - CppSimdOptPFor, - CppSimdPFor, - CppSimdSimplePFor, - CppSimplePFor, - CppStreamVByte, - CppVByte, - CppVarInt, - CppVarIntGb, -); - /// Simple-9/16/8b codecs require values that fit in small bit widths and a /// block-aligned count; test them separately with 128 small values. macro_rules! test_anylen_128 { @@ -59,53 +17,3 @@ macro_rules! test_anylen_128 { // Note: Simple9Rle crashes with heap corruption on various inputs; skip everywhere. test_anylen_128!(CppSimple16, CppSimple8b, CppSimple9, CppSimple8bRle); - -// Verify Default impl routes through new() for all generated codec types. -macro_rules! test_default { - ($($name:ident),* $(,)?) => { - $( - #[test] - #[allow(non_snake_case)] - fn $name() { - let _codec = $crate::cpp::$name::default(); - } - )* - }; -} - -/// Use a distinct prefix to avoid name collisions with `test_anylen` tests. 
-mod default_impls { - test_default!( - CppBP32, - CppCopy, - CppFastBinaryPacking16, - CppFastBinaryPacking32, - CppFastBinaryPacking8, - CppFastPFor128, - CppFastPFor256, - CppMaskedVByte, - CppNewPFor, - CppOptPFor, - CppPFor, - CppPFor2008, - CppSimdBinaryPacking, - CppSimdFastPFor128, - CppSimdFastPFor256, - CppSimdGroupSimple, - CppSimdGroupSimpleRingBuf, - CppSimdNewPFor, - CppSimdOptPFor, - CppSimdPFor, - CppSimdSimplePFor, - CppSimple16, - CppSimple8b, - CppSimple8bRle, - CppSimple9, - CppSimple9Rle, - CppSimplePFor, - CppStreamVByte, - CppVByte, - CppVarInt, - CppVarIntGb, - ); -} From b7c231463ce60d4aaebd1cc90a7e3cdcf8ef235b Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 02:29:16 -0400 Subject: [PATCH 20/26] fix handling short values --- src/cpp/wrappers.rs | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/src/cpp/wrappers.rs b/src/cpp/wrappers.rs index 90a58ad..11b4015 100644 --- a/src/cpp/wrappers.rs +++ b/src/cpp/wrappers.rs @@ -23,21 +23,6 @@ pub fn encode32_to_vec_ffi( Ok(()) } -fn decode32_to_vec_ffi( - codec: &UniquePtr, - input: &[u32], - out: &mut Vec, - capacity: usize, -) -> FastPForResult<()> { - if !input.is_empty() { - let start = out.len(); - out.resize(start + capacity, 0); - let n = ffi::codec_decode32(codec, input, &mut out[start..])?; - out.truncate(start + n); - } - Ok(()) -} - pub fn decode32_anylen_ffi( codec: &UniquePtr, input: &[u32], @@ -53,7 +38,18 @@ pub fn decode32_anylen_ffi( max }; let start = out.len(); - decode32_to_vec_ffi(codec, input, out, capacity)?; + if !input.is_empty() { + // Simple16/Simple9/Simple8b unpack functions always write a fixed number of + // values (up to 28) per word regardless of how many values remain. When the + // last word is decoded the function can write up to 27 elements past `nvalue`. + // Allocate extra padding so those writes land in owned memory rather than + // corrupting adjacent heap allocations. 
The excess elements are discarded by + // the truncate below. + const DECODE_OVERFLOW_PADDING: usize = 32; + out.resize(start + capacity + DECODE_OVERFLOW_PADDING, 0); + let n = ffi::codec_decode32(codec, input, &mut out[start..])?; + out.truncate(start + n); + } if let Some(n) = expected_len { (out.len() - start).is_decoded_mismatch(n)?; } From b4aceb95a60c4622dbc7411a8eaebf2b0af2af63 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 02:33:13 -0400 Subject: [PATCH 21/26] simplify cpp tests --- src/cpp/codecs.rs | 18 +++++++++++++++--- src/cpp/mod.rs | 2 -- src/cpp/tests.rs | 19 ------------------- 3 files changed, 15 insertions(+), 24 deletions(-) delete mode 100644 src/cpp/tests.rs diff --git a/src/cpp/codecs.rs b/src/cpp/codecs.rs index 158f5e3..be18dcd 100644 --- a/src/cpp/codecs.rs +++ b/src/cpp/codecs.rs @@ -53,10 +53,10 @@ macro_rules! implement_cpp_codecs { )* #[cfg(test)] + #[allow(non_snake_case)] mod cpp_default { $( #[test] - #[allow(non_snake_case)] fn $name() { let _codec = $crate::cpp::$name::default(); } @@ -64,15 +64,27 @@ macro_rules! implement_cpp_codecs { } #[cfg(test)] - mod cpp_roundtrip { + #[allow(non_snake_case)] + mod cpp_short_roundtrip { $( #[test] - #[allow(non_snake_case)] fn $name() { $crate::test_utils::roundtrip::<$crate::cpp::$name>(&[1u32, 2, 3, 4, 5]); } )* } + + #[cfg(test)] + #[allow(non_snake_case)] + mod cpp_128bit_roundtrip { + $( + #[test] + fn $name() { + let input: Vec = (1..=128).collect(); + $crate::test_utils::roundtrip::<$crate::cpp::$name>(&input); + } + )* + } }; } diff --git a/src/cpp/mod.rs b/src/cpp/mod.rs index 8aae284..c0cd0af 100644 --- a/src/cpp/mod.rs +++ b/src/cpp/mod.rs @@ -6,8 +6,6 @@ //! **Thread safety:** instances have internal state and are not thread-safe. Use one per thread. 
mod codecs; -#[cfg(test)] -mod tests; mod wrappers; pub use codecs::*; diff --git a/src/cpp/tests.rs b/src/cpp/tests.rs deleted file mode 100644 index c9d5a1e..0000000 --- a/src/cpp/tests.rs +++ /dev/null @@ -1,19 +0,0 @@ -use crate::test_utils::roundtrip; - -/// Simple-9/16/8b codecs require values that fit in small bit widths and a -/// block-aligned count; test them separately with 128 small values. -macro_rules! test_anylen_128 { - ($($name:ident),* $(,)?) => { - $( - #[test] - #[allow(non_snake_case)] - fn $name() { - let input: Vec = (1..=128).collect(); - roundtrip::<$crate::cpp::$name>(&input); - } - )* - }; - } - -// Note: Simple9Rle crashes with heap corruption on various inputs; skip everywhere. -test_anylen_128!(CppSimple16, CppSimple8b, CppSimple9, CppSimple8bRle); From de430c79b5473dc08a416ae1ba1c338d3ae592f7 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 14:39:37 -0400 Subject: [PATCH 22/26] asserts --- src/cpp/wrappers.rs | 10 ++++++++++ src/rust/composite.rs | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/cpp/wrappers.rs b/src/cpp/wrappers.rs index 11b4015..f41a19f 100644 --- a/src/cpp/wrappers.rs +++ b/src/cpp/wrappers.rs @@ -19,6 +19,11 @@ pub fn encode32_to_vec_ffi( let start = out.len(); out.resize(start + capacity, 0); let n = ffi::codec_encode32(codec, input, &mut out[start..])?; + // SAFETY: It is better to panic than to have UB + assert!( + n <= capacity, + "C++ codec encoded more than the allocated capacity" + ); out.truncate(start + n); Ok(()) } @@ -48,6 +53,11 @@ pub fn decode32_anylen_ffi( const DECODE_OVERFLOW_PADDING: usize = 32; out.resize(start + capacity + DECODE_OVERFLOW_PADDING, 0); let n = ffi::codec_decode32(codec, input, &mut out[start..])?; + // SAFETY: It is better to panic than to have UB + assert!( + n < capacity + DECODE_OVERFLOW_PADDING, + "C++ codec decoded more than the allocated capacity + padding" + ); out.truncate(start + n); } if 
let Some(n) = expected_len { diff --git a/src/rust/composite.rs b/src/rust/composite.rs index 31187a9..adea3c1 100644 --- a/src/rust/composite.rs +++ b/src/rust/composite.rs @@ -108,9 +108,9 @@ impl AnyLenCodec for CompositeCodec(&[], Some(0)).unwrap().is_empty()); } + /// Encoding empty input produces a single `[0]` header word — and Rust matches C++ exactly. + #[test] + fn test_empty_input_encodes_to_one_zero_word() { + let rust128 = compress::(&[]).unwrap(); + assert_eq!( + rust128, + [0u32], + "FastPFor128: empty input must produce [0], got {rust128:?}" + ); + + let rust256 = compress::(&[]).unwrap(); + assert_eq!( + rust256, + [0u32], + "FastPFor256: empty input must produce [0], got {rust256:?}" + ); + + // Verify C++ produces identical output — both codecs agree on [0] for empty. + #[cfg(feature = "cpp")] + { + use crate::cpp::{CppFastPFor128, CppFastPFor256}; + + let cpp128 = compress::(&[]).unwrap(); + assert_eq!( + cpp128, rust128, + "CppFastPFor128 and FastPFor128 must agree on empty encoding" + ); + + let cpp256 = compress::(&[]).unwrap(); + assert_eq!( + cpp256, rust256, + "CppFastPFor256 and FastPFor256 must agree on empty encoding" + ); + } + } + #[test] fn test_decode_empty_input_with_nonzero_expected_errors() { // Empty input: max_decompressed_len(0) == 0, so any expected_len > 0 fails From 7ec91dbd3e52d87e620a91f2c5686b1f0ec47397 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 15:27:58 -0400 Subject: [PATCH 23/26] clean up rnd seed --- src/test_utils.rs | 12 ++++++------ tests/basic_tests.rs | 7 +++---- tests/common.rs | 6 +++++- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/test_utils.rs b/src/test_utils.rs index c4f3d2a..3626ed5 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -25,7 +25,7 @@ use fastpfor::{ FastPFor128, FastPFor256, FastPForBlock128, FastPForBlock256, JustCopy, VariableByte, }; -const SEED: u64 = 456; +pub const RNG_SEED: u64 = 456; // 
--------------------------------------------------------------------------- // Generic codec helpers @@ -165,12 +165,12 @@ mod rust_bench { use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng}; - use super::{BlockCodec, block_compress}; + use super::{BlockCodec, RNG_SEED, block_compress}; type DataGeneratorFn = fn(usize) -> Vec; fn generate_uniform_data_from_range(size: usize, value_range: Range) -> Vec { - let mut rng = StdRng::seed_from_u64(super::SEED); + let mut rng = StdRng::seed_from_u64(RNG_SEED); (0..size) .map(|_| rng.random_range(value_range.clone())) .collect() @@ -185,7 +185,7 @@ mod rust_bench { } fn generate_clustered_data(size: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(super::SEED); + let mut rng = StdRng::seed_from_u64(RNG_SEED); let mut base = 0u32; (0..size) .map(|_| { @@ -202,7 +202,7 @@ mod rust_bench { } fn generate_sparse_data(size: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(super::SEED); + let mut rng = StdRng::seed_from_u64(RNG_SEED); (0..size) .map(|_| { if rng.random_bool(0.9) { @@ -215,7 +215,7 @@ mod rust_bench { } fn generate_constant_data(size: usize) -> Vec { - vec![super::SEED as u32; size] + vec![RNG_SEED as u32; size] } fn generate_geometric_data(size: usize) -> Vec { diff --git a/tests/basic_tests.rs b/tests/basic_tests.rs index bbaa848..570ecdf 100644 --- a/tests/basic_tests.rs +++ b/tests/basic_tests.rs @@ -10,7 +10,7 @@ use fastpfor::{BlockCodec, FastPForBlock128, FastPForBlock256, slice_to_blocks}; use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng}; -use crate::test_utils::{block_compress, block_roundtrip_all, roundtrip_all}; +use crate::test_utils::{RNG_SEED, block_compress, block_roundtrip_all, roundtrip_all}; mod common; @@ -51,9 +51,8 @@ fn test_increasing_sequence() { #[test] fn test_random_numbers() { - let data: Vec = (0..65536) - .map(|_| StdRng::seed_from_u64(123456).random()) - .collect(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + let data: Vec = (0..65536).map(|_| 
rng.random()).collect(); roundtrip_all(&data); } diff --git a/tests/common.rs b/tests/common.rs index 03e888d..d9bc197 100644 --- a/tests/common.rs +++ b/tests/common.rs @@ -3,8 +3,12 @@ #![cfg(any(feature = "rust", feature = "cpp"))] #![allow(dead_code, reason = "This file is shared by several test modules")] +#[path = "../src/test_utils.rs"] +mod test_utils; + use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng as _}; +use test_utils::RNG_SEED; /// Returns various input sizes to test codec behavior (multiples of 128). #[must_use] @@ -15,7 +19,7 @@ pub fn test_input_sizes() -> Vec { /// Generates test data vectors of size `n` with various patterns. #[must_use] pub fn get_test_cases(n: usize) -> Vec> { - let mut rng = StdRng::seed_from_u64(14); + let mut rng = StdRng::seed_from_u64(RNG_SEED); vec![ // Zeroes From 3aef062efd22d37ce915fec5f18da327ba1380d2 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 15:38:07 -0400 Subject: [PATCH 24/26] consolidate test helpers --- src/test_utils.rs | 48 ++++++++++++++++++++++++++++++++++ tests/basic_tests.rs | 2 -- tests/common.rs | 55 --------------------------------------- tests/cpp_compat_tests.rs | 3 +-- 4 files changed, 49 insertions(+), 59 deletions(-) delete mode 100644 tests/common.rs diff --git a/src/test_utils.rs b/src/test_utils.rs index 3626ed5..6fe9c9a 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -153,6 +153,54 @@ where roundtrip::>(data); } +// --------------------------------------------------------------------------- +// Compatibility test helpers (used by integration tests) +// --------------------------------------------------------------------------- + +/// Returns various input sizes to test codec behavior (multiples of 128). +pub fn test_input_sizes() -> Vec { + (1..=8).map(|exp| (1usize << exp) * 128).collect() +} + +/// Generates test data vectors of size `n` with various patterns. 
+pub fn get_test_cases(n: usize) -> Vec> { + use rand::rngs::StdRng; + use rand::{RngExt as _, SeedableRng as _}; + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + vec![ + // Zeroes + vec![0u32; n], + // Same non-zero + vec![14u32; n], + // Ascending values + (0..n).map(|i| i as u32).collect::>(), + // Descending values + (0..n).rev().map(|i| i as u32).collect::>(), + // Bit-flipping pattern + (0..n) + .map(|i| ((i as u32) * 32) ^ ((i as u32) >> 1)) + .collect::>(), + // Alternating large and small values + (0..n) + .map(|i| { + let ui = i as u32; + if ui % 2 == 0 { 1 << 30 } else { 3 } + }) + .collect::>(), + // Random u32 values + (0..n) + .map(|_| rng.random_range(0..(1 << 31))) + .collect::>(), + // Spike in the middle + (0..n) + .map(|i| if i == n / 2 { u32::MAX } else { 1 }) + .collect::>(), + // An empty vector + Vec::new(), + ] +} + // --------------------------------------------------------------------------- // Data generators + fixtures (Rust block codecs; benchmarks / smoke tests) // --------------------------------------------------------------------------- diff --git a/tests/basic_tests.rs b/tests/basic_tests.rs index 570ecdf..33edf03 100644 --- a/tests/basic_tests.rs +++ b/tests/basic_tests.rs @@ -12,8 +12,6 @@ use rand::{RngExt as _, SeedableRng}; use crate::test_utils::{RNG_SEED, block_compress, block_roundtrip_all, roundtrip_all}; -mod common; - // ── Tests ───────────────────────────────────────────────────────────────────── #[test] diff --git a/tests/common.rs b/tests/common.rs deleted file mode 100644 index d9bc197..0000000 --- a/tests/common.rs +++ /dev/null @@ -1,55 +0,0 @@ -//! Common test utilities for codec compatibility testing. 
- -#![cfg(any(feature = "rust", feature = "cpp"))] -#![allow(dead_code, reason = "This file is shared by several test modules")] - -#[path = "../src/test_utils.rs"] -mod test_utils; - -use rand::rngs::StdRng; -use rand::{RngExt as _, SeedableRng as _}; -use test_utils::RNG_SEED; - -/// Returns various input sizes to test codec behavior (multiples of 128). -#[must_use] -pub fn test_input_sizes() -> Vec { - (1..=8).map(|exp| (1usize << exp) * 128).collect() -} - -/// Generates test data vectors of size `n` with various patterns. -#[must_use] -pub fn get_test_cases(n: usize) -> Vec> { - let mut rng = StdRng::seed_from_u64(RNG_SEED); - - vec![ - // Zeroes - vec![0u32; n], - // Same non-zero - vec![14u32; n], - // Ascending values - (0..n).map(|i| i as u32).collect::>(), - // Descending values - (0..n).rev().map(|i| i as u32).collect::>(), - // Bit-flipping pattern - (0..n) - .map(|i| ((i as u32) * 32) ^ ((i as u32) >> 1)) - .collect::>(), - // Alternating large and small values - (0..n) - .map(|i| { - let ui = i as u32; - if ui % 2 == 0 { 1 << 30 } else { 3 } - }) - .collect::>(), - // Random u32 values - (0..n) - .map(|_| rng.random_range(0..(1 << 31))) - .collect::>(), - // Spike in the middle - (0..n) - .map(|i| if i == n / 2 { u32::MAX } else { 1 }) - .collect::>(), - // An empty vector - Vec::new(), - ] -} diff --git a/tests/cpp_compat_tests.rs b/tests/cpp_compat_tests.rs index 153b8bf..a6717c4 100644 --- a/tests/cpp_compat_tests.rs +++ b/tests/cpp_compat_tests.rs @@ -11,9 +11,8 @@ mod test_utils; use fastpfor::{FastPFor128, FastPFor256, FastPForBlock128}; use test_utils::{block_compress, block_decompress, compress, roundtrip, roundtrip_full}; -mod common; -use common::{get_test_cases, test_input_sizes}; use fastpfor::cpp::CppFastPFor128; +use test_utils::{get_test_cases, test_input_sizes}; use crate::test_utils::decompress; From db79d666be8265a0b8f4a666a1d8cda96c42739f Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 15:46:19 -0400 Subject: 
[PATCH 25/26] feedback --- fuzz/fuzz_targets/decode_arbitrary.rs | 2 +- fuzz/justfile | 2 +- justfile | 6 ++---- src/rust/integer_compression/fastpfor.rs | 2 +- src/test_utils.rs | 6 ------ tests/basic_tests.rs | 15 --------------- tests/cpp_compat_tests.rs | 9 +++++---- 7 files changed, 10 insertions(+), 32 deletions(-) diff --git a/fuzz/fuzz_targets/decode_arbitrary.rs b/fuzz/fuzz_targets/decode_arbitrary.rs index 6eda4c9..e82e3fb 100644 --- a/fuzz/fuzz_targets/decode_arbitrary.rs +++ b/fuzz/fuzz_targets/decode_arbitrary.rs @@ -4,7 +4,7 @@ //! //! Why this target is needed //! ------------------------- -//! The existing `compress_oracle` target only feeds *well-formed* data to the Rust +//! The existing `encode_oracle` target only feeds *well-formed* data to the Rust //! decoder (it first compresses valid input, then decompresses). //! That means corrupted or truncated compressed streams never reach the decoder, so //! out-of-bounds panics in `decode_page` are invisible to the fuzzer. 
diff --git a/fuzz/justfile b/fuzz/justfile index 22cac61..e37bf21 100755 --- a/fuzz/justfile +++ b/fuzz/justfile @@ -34,7 +34,7 @@ run-iters target iters='10000' *args: # Run encode_oracle (pure Rust roundtrip, no C++ required) rust-encode *args: (run 'encode_oracle' args) -# Run decode_oracle (parallel Rust + C++ roundtrips, cross-checks decodeed values) +# Run decode_oracle (parallel Rust + C++ roundtrips, cross-checks decoded values) rust-decode *args: (run 'decode_oracle' args) # Feed arbitrary bytes directly to the Rust decodeor (no panic check) diff --git a/justfile b/justfile index 5fc804e..21c9940 100755 --- a/justfile +++ b/justfile @@ -85,7 +85,7 @@ fmt: #!/usr/bin/env bash set -euo pipefail for dir in "./" "fuzz"; do - cd "$dir" + pushd "$dir" if (rustup toolchain list | grep nightly && rustup component list --toolchain nightly | grep rustfmt) &> /dev/null; then echo "Reformatting Rust code using nightly Rust fmt to sort imports in $dir" cargo +nightly fmt --all -- --config imports_granularity=Module,group_imports=StdExternalCrate @@ -93,9 +93,7 @@ fmt: echo "Reformatting Rust with the stable cargo fmt in $dir. Install nightly with \`rustup install nightly\` for better results" cargo fmt --all fi - if [ -f .git ]; then - cd .. - fi + popd done # Reformat all Cargo.toml files using cargo-sort diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index d270e11..cf5805a 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -13,7 +13,7 @@ use crate::{BlockCodec, FastPForError, FastPForResult}; /// Overhead cost (in bits) for storing each exception's position in the block const OVERHEAD_OF_EACH_EXCEPT: u32 = 8; -/// Default page size in number of integers (64 KiB / 4 bytes = 16 Ki integers). +/// Default page size in number of integers. const DEFAULT_PAGE_SIZE: u32 = 65536; /// Type alias for [`FastPFor`] with 128-element blocks. 
diff --git a/src/test_utils.rs b/src/test_utils.rs index 6fe9c9a..2041a84 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -2,12 +2,6 @@ //! Criterion benchmarks, integration tests, and `#[cfg(test)]` unit tests in the //! `fastpfor` crate. //! -//! - **Library unit tests:** `crate::test_utils` via `#[cfg(test)] mod bench_utils` in `lib.rs` -//! and `extern crate self as fastpfor` so this file can `use fastpfor::...`. -//! - **Integration tests:** `#[cfg(test)] #[path = "../src/test_utils.rs"] mod bench_utils`. -//! - **Criterion benchmarks:** `#[path = "../src/test_utils.rs"] mod bench_utils` (`cfg(test)` is not -//! enabled for bench targets, so the module is included unconditionally there). -//! //! Loaded as a module via `#[path]` or as a normal child module, so every item //! consumed from outside must be `pub`. Each consumer uses a different subset, //! so dead-code is allowed at module scope. diff --git a/tests/basic_tests.rs b/tests/basic_tests.rs index 33edf03..0d4ca98 100644 --- a/tests/basic_tests.rs +++ b/tests/basic_tests.rs @@ -20,21 +20,6 @@ fn saul_test() { roundtrip_all(&[2u32, 3, 4, 5]); } -/// Sub-block-sized inputs produce no output via `BlockCodec`. -#[test] -fn spurious_out_test() { - fn check(len: usize) { - let x = vec![0u32; 1024]; - let (blocks, _) = slice_to_blocks::(&x[..len]); - let out = block_compress::(cast_slice(blocks)).unwrap(); - assert!(out.is_empty() || blocks.is_empty()); - } - for len in 0..32usize { - check::(len); - check::(len); - } -} - /// `AnyLenCodec` round-trips empty input correctly. 
#[test] fn zero_in_zero_out_test() { diff --git a/tests/cpp_compat_tests.rs b/tests/cpp_compat_tests.rs index a6717c4..d85ee5b 100644 --- a/tests/cpp_compat_tests.rs +++ b/tests/cpp_compat_tests.rs @@ -8,11 +8,12 @@ #[path = "../src/test_utils.rs"] mod test_utils; -use fastpfor::{FastPFor128, FastPFor256, FastPForBlock128}; -use test_utils::{block_compress, block_decompress, compress, roundtrip, roundtrip_full}; - use fastpfor::cpp::CppFastPFor128; -use test_utils::{get_test_cases, test_input_sizes}; +use fastpfor::{FastPFor128, FastPFor256, FastPForBlock128}; +use test_utils::{ + block_compress, block_decompress, compress, get_test_cases, roundtrip, roundtrip_full, + test_input_sizes, +}; use crate::test_utils::decompress; From fbdf5006013a434dea3947eaeaa46404eb24118a Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Mon, 23 Mar 2026 16:00:16 -0400 Subject: [PATCH 26/26] lock down block size --- src/rust/integer_compression/fastpfor.rs | 36 ++++++++++-------------- tests/basic_tests.rs | 4 +-- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs index cf5805a..c8ef8af 100644 --- a/src/rust/integer_compression/fastpfor.rs +++ b/src/rust/integer_compression/fastpfor.rs @@ -10,6 +10,16 @@ use crate::rust::cursor::IncrementCursor; use crate::rust::integer_compression::{bitpacking, bitunpacking}; use crate::{BlockCodec, FastPForError, FastPForResult}; +mod sealed { + /// Sealed marker trait: only `[u32; 128]` and `[u32; 256]` are valid `FastPFor` block arrays. + /// + /// This is intentionally private so that users cannot implement it for other sizes, + /// preventing instantiation of `FastPFor` for unsupported `N` at compile time. 
+    pub trait BlockSize: bytemuck::Pod {} +    impl BlockSize for [u32; 128] {} +    impl BlockSize for [u32; 256] {} +} + /// Overhead cost (in bits) for storing each exception's position in the block const OVERHEAD_OF_EACH_EXCEPT: u32 = 8; @@ -63,36 +73,20 @@ pub struct FastPFor { impl Default for FastPFor where -    [u32; N]: bytemuck::Pod, +    [u32; N]: sealed::BlockSize, { fn default() -> Self { -        Self::create(DEFAULT_PAGE_SIZE) +        Self::new(DEFAULT_PAGE_SIZE) .expect("DEFAULT_PAGE_SIZE is a multiple of all valid block sizes") } } -impl FastPFor<128> { -    /// Creates a new `FastPForBlock128` codec with the given page size. +impl FastPFor { +    /// Creates a new `FastPFor` codec with the given page size. /// /// Returns an error if `page_size` is not a multiple of 128. /// Use [`Default`] for the default page size. pub fn new(page_size: u32) -> FastPForResult { -        Self::create(page_size) -    } -} - -impl FastPFor<256> { -    /// Creates a new `FastPForBlock256` codec with the given page size. -    /// -    /// Returns an error if `page_size` is not a multiple of 256. -    /// Use [`Default`] for the default page size.
- pub fn new(page_size: u32) -> FastPForResult { - Self::create(page_size) - } -} - -impl FastPFor { - fn create(page_size: u32) -> FastPForResult { if page_size % N as u32 != 0 { return Err(FastPForError::InvalidPageSize { page_size, @@ -499,7 +493,7 @@ impl FastPFor { impl BlockCodec for FastPFor where - [u32; N]: bytemuck::Pod, + [u32; N]: sealed::BlockSize, { type Block = [u32; N]; diff --git a/tests/basic_tests.rs b/tests/basic_tests.rs index 0d4ca98..d308515 100644 --- a/tests/basic_tests.rs +++ b/tests/basic_tests.rs @@ -5,12 +5,10 @@ #[path = "../src/test_utils.rs"] mod test_utils; -use bytemuck::cast_slice; -use fastpfor::{BlockCodec, FastPForBlock128, FastPForBlock256, slice_to_blocks}; use rand::rngs::StdRng; use rand::{RngExt as _, SeedableRng}; -use crate::test_utils::{RNG_SEED, block_compress, block_roundtrip_all, roundtrip_all}; +use crate::test_utils::{RNG_SEED, block_roundtrip_all, roundtrip_all}; // ── Tests ─────────────────────────────────────────────────────────────────────