From c41d357d5b1cbbd41b48731d259aa010287105c2 Mon Sep 17 00:00:00 2001
From: Yuri Astrakhan <YuriAstrakhan@gmail.com>
Date: Sun, 22 Mar 2026 05:00:13 -0400
Subject: [PATCH] chore(cpp): reorganize codec structure and improve error
 handling

First step of moving towards unified C++ and Rust model of codecs.
---
 Cargo.toml                                    |   6 +-
 README.md                                     |  20 +-
 benches/bench_utils.rs                        |  27 +-
 benches/fastpfor_benchmark.rs                 |  12 +-
 fuzz/fuzz_targets/common.rs                   |  75 ++--
 fuzz/fuzz_targets/cpp_roundtrip.rs            |  28 +-
 fuzz/fuzz_targets/rust_compress_oracle.rs     |  49 +--
 fuzz/fuzz_targets/rust_decompress_oracle.rs   |  45 +-
 fuzz/justfile                                 |  45 +-
 justfile                                      |  12 +-
 src/codec.rs                                  | 170 ++++++++
 src/cpp/README.md                             | 147 -------
 src/cpp/codecs.rs                             | 277 ++++++++++++
 src/cpp/mod.rs                                | 404 +-----------------
 src/cpp/tests.rs                              | 123 ++++++
 src/cpp/wrappers.rs                           |  90 ++++
 src/error.rs                                  |  48 +++
 src/helpers.rs                                |  89 ++++
 src/lib.rs                                    |  11 +
 src/rust/error.rs                             |  25 --
 src/rust/integer_compression/bitpacking.rs    |  56 +++
 src/rust/integer_compression/fastpfor.rs      |  15 +-
 src/rust/integer_compression/helpers.rs       |  61 ---
 src/rust/integer_compression/mod.rs           |   1 -
 src/rust/integer_compression/variable_byte.rs |   2 +-
 src/rust/mod.rs                               |   4 +-
 tests/benchmark_smoke.rs                      |   8 +-
 tests/cpp_compat_tests.rs                     |  48 ++-
 28 files changed, 1078 insertions(+), 820 deletions(-)
 create mode 100644 src/codec.rs
 delete mode 100644 src/cpp/README.md
 create mode 100644 src/cpp/codecs.rs
 create mode 100644 src/cpp/tests.rs
 create mode 100644 src/cpp/wrappers.rs
 create mode 100644 src/error.rs
 create mode 100644 src/helpers.rs
 delete mode 100644 src/rust/error.rs
 delete mode 100644 src/rust/integer_compression/helpers.rs

diff --git a/Cargo.toml b/Cargo.toml
index 46d0f08..fc171e5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,13 +38,13 @@ cpp_portable = ["cpp"]
 # Optimize FastPFOR for the current CPU.
 cpp_native = ["cpp"]
 cpp = ["dep:cmake", "dep:cxx", "dep:cxx-build"]
-rust = ["dep:thiserror", "dep:bytes", "dep:bytemuck"]
+rust = ["dep:bytes"]
 
 [dependencies]
-bytemuck = { version = "1.25.0", optional = true }
+bytemuck = { version = "1.25.0", features = ["min_const_generics"] }
 bytes = { version = "1.11", optional = true }
 cxx = { version = "1.0.194", optional = true }
-thiserror = { version = "2.0.18", optional = true }
+thiserror = "2.0.18"
 
 [build-dependencies]
 cmake = { version = "0.1.57", optional = true }
diff --git a/README.md b/README.md
index 67dba4d..b937bbd 100644
--- a/README.md
+++ b/README.md
@@ -90,21 +90,21 @@ Feature selection can be overridden with the `FASTPFOR_SIMD_MODE` environment va
 ### Using C++ Wrapper
 
 ```rust
-use fastpfor::cpp::{Codec32 as _, SimdFastPFor128Codec};
+use fastpfor::{AnyLenCodec as _, cpp};
 
 fn main() {
-  let mut codec = SimdFastPFor128Codec::new();
+  let mut codec = cpp::SimdFastPFor128Codec::new();
 
-  // Encode
-  let mut input = vec![1, 2, 3, 4, 5];
-  let mut output = vec![0; 10];  // must be large enough
-  let enc_slice = codec.encode32(&input, &mut output).unwrap();
+  let input = vec![1u32, 2, 3, 4, 5];
+  let mut compressed = Vec::new();
+  codec.encode(&input, &mut compressed).unwrap();
 
-  // Decode
-  let mut decoded = vec![0; 10]; // must be large enough
-  let dec_slice = codec.decode32(&enc_slice, &mut decoded).unwrap();
+  let mut decoded = Vec::new();
+  codec
+    .decode(&compressed, &mut decoded, None)
+    .unwrap();
 
-  assert_eq!(input, dec_slice);
+  assert_eq!(input, decoded);
 }
 ```
 
diff --git a/benches/bench_utils.rs b/benches/bench_utils.rs
index 8b4dbdc..6b8be2e 100644
--- a/benches/bench_utils.rs
+++ b/benches/bench_utils.rs
@@ -13,6 +13,10 @@ use core::ops::Range;
 pub use std::io::Cursor;
 use std::num::NonZeroU32;
 
+#[cfg(feature = "cpp")]
+use fastpfor::AnyLenCodec as _;
+#[cfg(feature = "cpp")]
+use fastpfor::cpp;
 pub use fastpfor::rust::{BLOCK_SIZE_128, BLOCK_SIZE_256, DEFAULT_PAGE_SIZE, FastPFOR, Integer};
 use rand::rngs::StdRng;
 use rand::{RngExt as _, SeedableRng};
@@ -167,22 +171,24 @@ fn prepare_compressed_data(data: &[u32], block_size: NonZeroU32) -> Vec<u32> {
 // ---------------------------------------------------------------------------
 
 #[cfg(feature = "cpp")]
-pub fn cpp_encode(codec: &fastpfor::cpp::FastPFor128Codec, data: &[u32]) -> Vec<u32> {
-    use fastpfor::cpp::Codec32 as _;
-    let mut out = vec![0u32; data.len() * 2 + 1024];
-    let new_len = codec.encode32(data, &mut out).unwrap().len();
-    out.truncate(new_len);
+pub fn cpp_encode(codec: &mut cpp::FastPFor128Codec, data: &[u32]) -> Vec<u32> {
+    let mut out = Vec::new();
+    codec.encode(data, &mut out).unwrap();
     out
 }
 
 #[cfg(feature = "cpp")]
 pub fn cpp_decode(
-    codec: &fastpfor::cpp::FastPFor128Codec,
+    codec: &mut cpp::FastPFor128Codec,
     compressed: &[u32],
     decompressed: &mut [u32],
 ) -> usize {
-    use fastpfor::cpp::Codec32 as _;
-    codec.decode32(compressed, decompressed).unwrap().len()
+    let mut out = Vec::new();
+    codec
+        .decode(compressed, &mut out, Some(decompressed.len() as u32))
+        .unwrap();
+    decompressed.copy_from_slice(&out);
+    out.len()
 }
 
 // ---------------------------------------------------------------------------
@@ -268,10 +274,9 @@ pub struct CppDecodeFixture {
 #[cfg(feature = "cpp")]
 impl CppDecodeFixture {
     fn new(name: &'static str, generator: DataGeneratorFn, size: usize) -> Self {
-        use fastpfor::cpp::FastPFor128Codec;
         let data = generator(size);
-        let codec = FastPFor128Codec::new();
-        let cpp_compressed = cpp_encode(&codec, &data);
+        let mut codec = cpp::FastPFor128Codec::new();
+        let cpp_compressed = cpp_encode(&mut codec, &data);
         let rust_compressed = prepare_compressed_data(&data, BLOCK_SIZE_128);
         Self {
             name,
diff --git a/benches/fastpfor_benchmark.rs b/benches/fastpfor_benchmark.rs
index ee3fcfa..7fd61b2 100644
--- a/benches/fastpfor_benchmark.rs
+++ b/benches/fastpfor_benchmark.rs
@@ -13,6 +13,8 @@ use bench_utils::{
 };
 #[cfg(feature = "cpp")]
 use bench_utils::{cpp_decode, cpp_decode_fixtures, cpp_encode};
+#[cfg(feature = "cpp")]
+use fastpfor::cpp;
 
 const SIZES: &[usize] = &[1024, 4096];
 
@@ -144,8 +146,6 @@ fn benchmark_compression_ratio(c: &mut Criterion) {
 /// the pure-Rust `FastPFOR` codec with `BLOCK_SIZE_128`.
 #[cfg(feature = "cpp")]
 fn benchmark_cpp_vs_rust(c: &mut Criterion) {
-    use fastpfor::cpp::FastPFor128Codec;
-
     let mut group = c.benchmark_group("cpp_vs_rust/encode");
     for (size, fix) in compress_fixtures(SIZES) {
         group.throughput(Throughput::Elements(size as u64));
@@ -153,8 +153,8 @@ fn benchmark_cpp_vs_rust(c: &mut Criterion) {
             BenchmarkId::new(format!("cpp/{}", fix.name), size),
             &fix.data,
             |b, data| {
-                let codec = FastPFor128Codec::new();
-                b.iter(|| black_box(cpp_encode(&codec, black_box(data))));
+                let mut codec = cpp::FastPFor128Codec::new();
+                b.iter(|| black_box(cpp_encode(&mut codec, black_box(data))));
             },
         );
         group.bench_with_input(
@@ -175,9 +175,9 @@ fn benchmark_cpp_vs_rust(c: &mut Criterion) {
             BenchmarkId::new(format!("cpp/{}", fix.name), size),
             &fix.cpp_compressed,
             |b, compressed| {
-                let codec = FastPFor128Codec::new();
+                let mut codec = cpp::FastPFor128Codec::new();
                 let mut out = vec![0u32; fix.original_len];
-                b.iter(|| black_box(cpp_decode(&codec, black_box(compressed), &mut out)));
+                b.iter(|| black_box(cpp_decode(&mut codec, black_box(compressed), &mut out)));
             },
         );
         group.bench_with_input(
diff --git a/fuzz/fuzz_targets/common.rs b/fuzz/fuzz_targets/common.rs
index 8e5f8bd..ea6b8d5 100644
--- a/fuzz/fuzz_targets/common.rs
+++ b/fuzz/fuzz_targets/common.rs
@@ -1,6 +1,6 @@
-use fastpfor::{cpp, rust};
+use fastpfor::{AnyLenCodec, cpp, rust};
 
-pub type BoxedCppCodec = Box<dyn cpp::Codec32>;
+pub type BoxedCppCodec = Box<dyn AnyLenCodec>;
 
 #[derive(arbitrary::Arbitrary)]
 pub struct FuzzInput<C> {
@@ -77,42 +77,43 @@ pub enum CppCodec {
 
 impl From<CppCodec> for BoxedCppCodec {
     fn from(codec: CppCodec) -> Self {
-        use cpp::*;
         match codec {
-            CppCodec::BP32 => Box::new(BP32Codec::default()),
-            CppCodec::Copy => Box::new(CopyCodec::default()),
-            CppCodec::FastBinaryPacking8 => Box::new(FastBinaryPacking8Codec::default()),
-            CppCodec::FastPFor128 => Box::new(FastPFor128Codec::default()),
-            CppCodec::FastPFor256 => Box::new(FastPFor256Codec::default()),
-            CppCodec::FastBinaryPacking16 => Box::new(FastBinaryPacking16Codec::default()),
-            CppCodec::FastBinaryPacking32 => Box::new(FastBinaryPacking32Codec::default()),
-            CppCodec::MaskedVByte => Box::new(MaskedVByteCodec::default()),
-            CppCodec::NewPFor => Box::new(NewPForCodec::default()),
-            CppCodec::OptPFor => Box::new(OptPForCodec::default()),
-            CppCodec::PFor2008 => Box::new(PFor2008Codec::default()),
-            CppCodec::PFor => Box::new(PForCodec::default()),
-            CppCodec::SimdBinaryPacking => Box::new(SimdBinaryPackingCodec::default()),
-            CppCodec::SimdFastPFor128 => Box::new(SimdFastPFor128Codec::default()),
-            CppCodec::SimdFastPFor256 => Box::new(SimdFastPFor256Codec::default()),
-            CppCodec::SimdGroupSimple => Box::new(SimdGroupSimpleCodec::default()),
-            CppCodec::SimdGroupSimpleRingBuf => Box::new(SimdGroupSimpleRingBufCodec::default()),
-            CppCodec::SimdNewPFor => Box::new(SimdNewPForCodec::default()),
-            CppCodec::SimdOptPFor => Box::new(SimdOptPForCodec::default()),
-            CppCodec::SimdPFor => Box::new(SimdPForCodec::default()),
-            CppCodec::SimdSimplePFor => Box::new(SimdSimplePForCodec::default()),
-            // CppCodec::Simple16 => Box::new(Simple16Codec::default()),
-            // CppCodec::Simple8b => Box::new(Simple8bCodec::default()),
-            // CppCodec::Simple8bRle => Box::new(Simple8bRleCodec::default()),
-            // CppCodec::Simple9 => Box::new(Simple9Codec::default()),
-            // CppCodec::Simple9Rle => Box::new(Simple9RleCodec::default()),
-            // CppCodec::SimplePFor => Box::new(SimplePForCodec::default()),
-            // CppCodec::Snappy => Box::new(SnappyCodec::default()),
-            CppCodec::StreamVByte => Box::new(StreamVByteCodec::default()),
-            CppCodec::VByte => Box::new(VByteCodec::default()),
-            CppCodec::VarInt => Box::new(VarIntCodec::default()),
-            // CppCodec::VarIntG8iu => Box::new(VarIntG8iuCodec::default()),
-            CppCodec::VarIntGb => Box::new(VarIntGbCodec::default()),
-            // CppCodec::VsEncoding => Box::new(VsEncodingCodec::default()),
+            CppCodec::BP32 => Box::new(cpp::BP32Codec::default()),
+            CppCodec::Copy => Box::new(cpp::CopyCodec::default()),
+            CppCodec::FastBinaryPacking8 => Box::new(cpp::FastBinaryPacking8Codec::default()),
+            CppCodec::FastPFor128 => Box::new(cpp::FastPFor128Codec::default()),
+            CppCodec::FastPFor256 => Box::new(cpp::FastPFor256Codec::default()),
+            CppCodec::FastBinaryPacking16 => Box::new(cpp::FastBinaryPacking16Codec::default()),
+            CppCodec::FastBinaryPacking32 => Box::new(cpp::FastBinaryPacking32Codec::default()),
+            CppCodec::MaskedVByte => Box::new(cpp::MaskedVByteCodec::default()),
+            CppCodec::NewPFor => Box::new(cpp::NewPForCodec::default()),
+            CppCodec::OptPFor => Box::new(cpp::OptPForCodec::default()),
+            CppCodec::PFor2008 => Box::new(cpp::PFor2008Codec::default()),
+            CppCodec::PFor => Box::new(cpp::PForCodec::default()),
+            CppCodec::SimdBinaryPacking => Box::new(cpp::SimdBinaryPackingCodec::default()),
+            CppCodec::SimdFastPFor128 => Box::new(cpp::SimdFastPFor128Codec::default()),
+            CppCodec::SimdFastPFor256 => Box::new(cpp::SimdFastPFor256Codec::default()),
+            CppCodec::SimdGroupSimple => Box::new(cpp::SimdGroupSimpleCodec::default()),
+            CppCodec::SimdGroupSimpleRingBuf => {
+                Box::new(cpp::SimdGroupSimpleRingBufCodec::default())
+            }
+            CppCodec::SimdNewPFor => Box::new(cpp::SimdNewPForCodec::default()),
+            CppCodec::SimdOptPFor => Box::new(cpp::SimdOptPForCodec::default()),
+            CppCodec::SimdPFor => Box::new(cpp::SimdPForCodec::default()),
+            CppCodec::SimdSimplePFor => Box::new(cpp::SimdSimplePForCodec::default()),
+            // CppCodec::Simple16 => Box::new(cpp::Simple16Codec::default()),
+            // CppCodec::Simple8b => Box::new(cpp::Simple8bCodec::default()),
+            // CppCodec::Simple8bRle => Box::new(cpp::Simple8bRleCodec::default()),
+            // CppCodec::Simple9 => Box::new(cpp::Simple9Codec::default()),
+            // CppCodec::Simple9Rle => Box::new(cpp::Simple9RleCodec::default()),
+            // CppCodec::SimplePFor => Box::new(cpp::SimplePForCodec::default()),
+            // CppCodec::Snappy => Box::new(cpp::SnappyCodec::default()),
+            CppCodec::StreamVByte => Box::new(cpp::StreamVByteCodec::default()),
+            CppCodec::VByte => Box::new(cpp::VByteCodec::default()),
+            CppCodec::VarInt => Box::new(cpp::VarIntCodec::default()),
+            // CppCodec::VarIntG8iu => Box::new(cpp::VarIntG8iuCodec::default()),
+            CppCodec::VarIntGb => Box::new(cpp::VarIntGbCodec::default()),
+            // CppCodec::VsEncoding => Box::new(cpp::VsEncodingCodec::default()),
         }
     }
 }
diff --git a/fuzz/fuzz_targets/cpp_roundtrip.rs b/fuzz/fuzz_targets/cpp_roundtrip.rs
index 8fd4052..a0559cf 100644
--- a/fuzz/fuzz_targets/cpp_roundtrip.rs
+++ b/fuzz/fuzz_targets/cpp_roundtrip.rs
@@ -5,28 +5,26 @@ mod common;
 use common::*;
 
 fuzz_target!(|data: FuzzInput<CppCodec>| {
-    let codec = BoxedCppCodec::from(data.codec);
+    let mut codec = BoxedCppCodec::from(data.codec);
     let input = data.data;
 
-    // Allocate output buffer with generous size
-    let mut output = vec![0u32; input.len() * 2 + 1024];
+    let mut compressed = Vec::new();
+    codec.encode(&input, &mut compressed).unwrap();
 
-    // Compress the data
-    let enc_slice = codec.encode32(&input, &mut output).unwrap();
-
-    // Now decompress
-    let mut decoded = vec![0u32; input.len() * 2 + 1024];
-    let dec_slice = codec.decode32(enc_slice, &mut decoded).unwrap();
+    let mut decoded = Vec::new();
+    codec
+        .decode(&compressed, &mut decoded, None)
+        .expect("decode");
 
     // Verify roundtrip
-    if dec_slice.len() + input.len() < 200 {
-        assert_eq!(input, dec_slice, "Decompressed output mismatches");
+    if decoded.len() + input.len() < 200 {
+        assert_eq!(input, decoded.as_slice(), "Decompressed output mismatches");
     } else {
-        assert_eq!(dec_slice.len(), input.len(), "Decompressed length mismatch");
-        for (i, (&original, &decoded)) in input.iter().zip(dec_slice.iter()).enumerate() {
+        assert_eq!(decoded.len(), input.len(), "Decompressed length mismatch");
+        for (i, (&original, &out)) in input.iter().zip(decoded.iter()).enumerate() {
             assert_eq!(
-                original, decoded,
-                "Mismatch at position {i}: expected {original}, got {decoded}"
+                original, out,
+                "Mismatch at position {i}: expected {original}, got {out}"
             );
         }
     }
diff --git a/fuzz/fuzz_targets/rust_compress_oracle.rs b/fuzz/fuzz_targets/rust_compress_oracle.rs
index c134726..ff10ab2 100644
--- a/fuzz/fuzz_targets/rust_compress_oracle.rs
+++ b/fuzz/fuzz_targets/rust_compress_oracle.rs
@@ -1,6 +1,6 @@
 #![no_main]
 
-use fastpfor::{CodecToSlice, cpp, rust};
+use fastpfor::{AnyLenCodec, CodecToSlice, cpp, rust};
 use libfuzzer_sys::fuzz_target;
 mod common;
 use common::*;
@@ -28,9 +28,8 @@ fuzz_target!(|data: FuzzInput<RustCodec>| {
     let last_block_size_multiple = input.len() / block_size * block_size;
     let input = &input[..last_block_size_multiple];
 
-    // Allocate output buffers with generous size
+    // Allocate output buffer for Rust (slice API)
     let mut rust_compressed = vec![0u32; input.len() * 2 + 1024];
-    let mut cpp_compressed = vec![0u32; input.len() * 2 + 1024];
 
     // Compress with Rust implementation using Codec wrapper
     let mut rust_codec = rust::Codec::from(data.codec);
@@ -38,33 +37,23 @@ fuzz_target!(|data: FuzzInput<RustCodec>| {
         .compress_to_slice(input, &mut rust_compressed)
         .expect("Rust compression failed");
 
-    // Compress with C++ implementation
-    let compressed_oracle_from_cpp = match data.codec {
-        RustCodec::FastPFOR256 => {
-            let mut cpp_codec = cpp::FastPFor256Codec::new();
-            cpp_codec
-                .compress_to_slice(input, &mut cpp_compressed)
-                .expect("C++ compression failed")
-        }
-        RustCodec::FastPFOR128 => {
-            let mut cpp_codec = cpp::FastPFor128Codec::new();
-            cpp_codec
-                .compress_to_slice(input, &mut cpp_compressed)
-                .expect("C++ compression failed")
-        }
-        RustCodec::VariableByte => {
-            let mut cpp_codec = cpp::MaskedVByteCodec::new();
-            cpp_codec
-                .compress_to_slice(input, &mut cpp_compressed)
-                .expect("C++ compression failed")
-        }
-        RustCodec::JustCopy => {
-            let mut cpp_codec = cpp::CopyCodec::new();
-            cpp_codec
-                .compress_to_slice(input, &mut cpp_compressed)
-                .expect("C++ compression failed")
-        }
-    };
+    // Compress with C++ implementation (`AnyLenCodec` / Vec API)
+    let mut cpp_compressed = Vec::new();
+    match data.codec {
+        RustCodec::FastPFOR256 => cpp::FastPFor256Codec::new()
+            .encode(input, &mut cpp_compressed)
+            .expect("C++ compression failed"),
+        RustCodec::FastPFOR128 => cpp::FastPFor128Codec::new()
+            .encode(input, &mut cpp_compressed)
+            .expect("C++ compression failed"),
+        RustCodec::VariableByte => cpp::MaskedVByteCodec::new()
+            .encode(input, &mut cpp_compressed)
+            .expect("C++ compression failed"),
+        RustCodec::JustCopy => cpp::CopyCodec::new()
+            .encode(input, &mut cpp_compressed)
+            .expect("C++ compression failed"),
+    }
+    let compressed_oracle_from_cpp = cpp_compressed.as_slice();
 
     // Compare compressed outputs
     assert_eq!(
diff --git a/fuzz/fuzz_targets/rust_decompress_oracle.rs b/fuzz/fuzz_targets/rust_decompress_oracle.rs
index 9ced890..2e4a0f7 100644
--- a/fuzz/fuzz_targets/rust_decompress_oracle.rs
+++ b/fuzz/fuzz_targets/rust_decompress_oracle.rs
@@ -1,6 +1,6 @@
 #![no_main]
 
-use fastpfor::{CodecToSlice, cpp, rust};
+use fastpfor::{AnyLenCodec, CodecToSlice, cpp, rust};
 use libfuzzer_sys::fuzz_target;
 mod common;
 use common::*;
@@ -29,33 +29,22 @@ fuzz_target!(|data: FuzzInput<RustCodec>| {
     let input = &input[..last_block_size_multiple];
 
     // First, compress with C++ implementation to get valid compressed data
-    let mut cpp_compressed = vec![0u32; input.len() * 2 + 1024];
-    let compressed_oracle_from_cpp = match data.codec {
-        RustCodec::FastPFOR256 => {
-            let mut cpp_codec = cpp::FastPFor256Codec::new();
-            cpp_codec
-                .compress_to_slice(input, &mut cpp_compressed)
-                .expect("C++ compression failed")
-        }
-        RustCodec::FastPFOR128 => {
-            let mut cpp_codec = cpp::FastPFor128Codec::new();
-            cpp_codec
-                .compress_to_slice(input, &mut cpp_compressed)
-                .expect("C++ compression failed")
-        }
-        RustCodec::VariableByte => {
-            let mut cpp_codec = cpp::MaskedVByteCodec::new();
-            cpp_codec
-                .compress_to_slice(input, &mut cpp_compressed)
-                .expect("C++ compression failed")
-        }
-        RustCodec::JustCopy => {
-            let mut cpp_codec = cpp::CopyCodec::new();
-            cpp_codec
-                .compress_to_slice(input, &mut cpp_compressed)
-                .expect("C++ compression failed")
-        }
-    };
+    let mut cpp_compressed = Vec::new();
+    match data.codec {
+        RustCodec::FastPFOR256 => cpp::FastPFor256Codec::new()
+            .encode(input, &mut cpp_compressed)
+            .expect("C++ compression failed"),
+        RustCodec::FastPFOR128 => cpp::FastPFor128Codec::new()
+            .encode(input, &mut cpp_compressed)
+            .expect("C++ compression failed"),
+        RustCodec::VariableByte => cpp::MaskedVByteCodec::new()
+            .encode(input, &mut cpp_compressed)
+            .expect("C++ compression failed"),
+        RustCodec::JustCopy => cpp::CopyCodec::new()
+            .encode(input, &mut cpp_compressed)
+            .expect("C++ compression failed"),
+    }
+    let compressed_oracle_from_cpp = cpp_compressed.as_slice();
 
     // Now decompress with rust
     let mut rust_decompressed = vec![0u32; input.len()];
diff --git a/fuzz/justfile b/fuzz/justfile
index 1138624..35533bf 100755
--- a/fuzz/justfile
+++ b/fuzz/justfile
@@ -3,24 +3,33 @@
 #   just fuzz::run rust_compress_oracle
 # cargo-fuzz requires nightly Rust and must be run from inside the fuzz/ directory.
 
-fuzz_dir := justfile_directory()
+# How to call the current just executable. Note that just_executable() may have `\` in Windows paths, so we need to quote it.
+just := quote(just_executable())
+fuzz_target := "x86_64-unknown-linux-gnu"
 
 # List available fuzz targets
 list:
-    @cd {{fuzz_dir}} && cargo +nightly fuzz list
+    cargo +nightly fuzz list
 
-# Run a fuzz target indefinitely  (Ctrl-C to stop)
-# Targets: rust_compress_oracle, rust_decompress_oracle, cpp_roundtrip
+# Build all fuzz targets without running them
+build *args:
+    cargo +nightly fuzz build --target {{fuzz_target}} {{args}}
+
+# Print coverage for a fuzz target (requires llvm-tools)
+coverage target *args:
+    cargo +nightly fuzz coverage --target {{fuzz_target}} {{target}} {{args}} ${JUST_FUZZ_EXTRA_ARGS:+-- $JUST_FUZZ_EXTRA_ARGS}
+
+# Run a fuzz target, indefinitely by default (Ctrl-C to stop)
 run target *args:
-    cd {{fuzz_dir}} && cargo +nightly fuzz run --target x86_64-unknown-linux-gnu {{target}} {{args}}
+    cargo +nightly fuzz run --target {{fuzz_target}} {{target}} {{args}} ${JUST_FUZZ_EXTRA_ARGS:+-- $JUST_FUZZ_EXTRA_ARGS}
 
 # Run a fuzz target for a fixed number of seconds
 run-time target seconds='60' *args:
-    cd {{fuzz_dir}} && cargo +nightly fuzz run {{target}} {{args}} -- -max_total_time={{seconds}}
+    JUST_FUZZ_EXTRA_ARGS='-max_total_time={{seconds}}' {{just}} run {{target}} {{args}}
 
 # Run a fuzz target for a fixed number of iterations
 run-iters target iters='10000' *args:
-    cd {{fuzz_dir}} && cargo +nightly fuzz run {{target}} {{args}} -- -runs={{iters}}
+    JUST_FUZZ_EXTRA_ARGS='-runs={{iters}}' {{just}} run {{target}} {{args}}
 
 # Run rust_compress_oracle (Rust only, no C++ required)
 rust-compress *args: (run 'rust_compress_oracle' args)
@@ -36,12 +45,20 @@ cpp-roundtrip *args: (run 'cpp_roundtrip' args)
 
 # Reproduce a specific crash artifact
 repro target artifact:
-    cd {{fuzz_dir}} && cargo +nightly fuzz run {{target}} {{artifact}}
+    cargo +nightly fuzz run {{target}} {{artifact}}
 
-# Build all fuzz targets without running them
-build:
-    cd {{fuzz_dir}} && cargo +nightly fuzz build --target x86_64-unknown-linux-gnu
+# Run a single pass of every fuzz target (CI smoke test; stops after 1 iteration each).
+ci-test:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    for target in $({{just}} list); do
+        {{just}} run-iters $target 1
+    done
 
-# Print coverage for a fuzz target (requires llvm-tools)
-coverage target:
-    cd {{fuzz_dir}} && cargo +nightly fuzz coverage --target x86_64-unknown-linux-gnu {{target}}
+# Run a single pass of every fuzz target (CI smoke test; stops after 1 iteration each).
+coverage-all:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    for target in $({{just}} list); do
+        {{just}} coverage $target
+    done
diff --git a/justfile b/justfile
index 17f60d7..5fc804e 100755
--- a/justfile
+++ b/justfile
@@ -33,6 +33,7 @@ check:
     cargo check --workspace --all-targets --features _all_compatible
     cargo check --workspace --all-targets --no-default-features --features cpp
     cargo check --workspace --all-targets --no-default-features --features rust
+    cargo check --workspace --all-targets --manifest-path fuzz/Cargo.toml
 
 # Generate code coverage report to upload to codecov.io
 ci-coverage: env-info && \
@@ -41,7 +42,8 @@ ci-coverage: env-info && \
     mkdir -p target/llvm-cov
 
 # Run all tests as expected by CI
-ci-test: env-info test-fmt check build clippy test test-doc && assert-git-is-clean
+ci-test: env-info test-fmt check build clippy test test-doc fuzz::ci-test
+    {{ if ci_mode == '1' { just + ' assert-git-is-clean' } else { '' } }}
 
 # Run minimal subset of tests to ensure compatibility with MSRV
 ci-test-msrv: env-info test
@@ -50,10 +52,12 @@ ci-test-msrv: env-info test
 clean:
     cargo clean
     rm -f Cargo.lock
+    cd fuzz && cargo clean && rm -f Cargo.lock
 
 # Run cargo clippy to lint the code
 clippy *args:
     cargo clippy --workspace --all-targets --features _all_compatible {{args}}
+    cargo clippy --workspace --all-targets --manifest-path fuzz/Cargo.toml {{args}}
 
 # Generate code coverage report. Will install `cargo llvm-cov` if missing.
 coverage *args='--open':  (cargo-install 'cargo-llvm-cov')
@@ -61,8 +65,8 @@ coverage *args='--open':  (cargo-install 'cargo-llvm-cov')
     cargo llvm-cov --workspace --all-targets --features _all_compatible --include-build-script {{args}}
 
 # Build and open code documentation
-docs *args='--open':
-    DOCS_RS=1 cargo doc --no-deps {{args}} --workspace --features _all_compatible
+docs *args='--features _all_compatible --open':
+    DOCS_RS=1 cargo doc --no-deps {{args}} --workspace
 
 # Print environment info
 env-info:
@@ -134,7 +138,7 @@ test-all-simd-modes:
     {{just}} test-simd native
 
 # Test documentation generation
-test-doc:  (docs '')
+test-doc:  (docs '')  (docs '--features _all_compatible')
 
 # Test code formatting
 test-fmt: && (fmt-toml '--check' '--check-format')
diff --git a/src/codec.rs b/src/codec.rs
new file mode 100644
index 0000000..8bff34c
--- /dev/null
+++ b/src/codec.rs
@@ -0,0 +1,170 @@
+use bytemuck::{Pod, cast_slice};
+
+use crate::FastPForError;
+
+/// Internal default for max decompressed length. Used by trait defaults and C++ FFI.
+#[inline]
+pub(crate) fn default_max_decoded_len(compressed_words: usize) -> usize {
+    compressed_words.saturating_mul(1024)
+}
+
+/// Compresses and decompresses fixed-size blocks of `u32` values.
+///
+/// The associated type [`Block`](BlockCodec::Block) is the concrete fixed-size
+/// array, e.g. `[u32; 256]`. Using an associated *type* (not an associated
+/// constant) lets `CompositeCodec<Blocks, Tail>` be a clean two-parameter
+/// struct on stable Rust, with no extra `const N` leaking into user-facing
+/// signatures.
+///
+/// # Compile-time safety
+///
+/// Passing a `&[[u32; 128]]` slice to a codec whose `Block = [u32; 256]`
+/// is a **compile error** — the slice element types simply don't match.
+///
+/// # Implementing this trait
+///
+/// ```rust,ignore
+/// impl BlockCodec for MyCodec {
+///     type Block = [u32; 256];
+///     fn encode_blocks(&self, blocks: &[[u32; 256]], out: &mut Vec<u32>)
+///         -> Result<(), FastPForError> { ... }
+///     fn decode_blocks(&self, input: &[u32], expected_len: Option<u32>,
+///         out: &mut Vec<u32>) -> Result<usize, FastPForError> { ... }
+/// }
+/// ```
+pub trait BlockCodec {
+    /// The fixed-size block type.  Must be plain-old-data (`Pod`).
+    /// In practice this will be `[u32; 128]` or `[u32; 256]`.
+    type Block: Pod;
+
+    /// Number of `u32` elements in one block.
+    ///
+    /// Equal to `size_of::<Self::Block>() / 4`. Use this when computing
+    /// element counts from block counts, e.g. `n_blocks * codec.elements_per_block()`.
+    #[inline]
+    #[must_use]
+    fn size() -> usize
+    where
+        Self: Sized,
+    {
+        size_of::<Self::Block>() / size_of::<u32>()
+    }
+
+    /// Compress a slice of complete, fixed-size blocks.
+    ///
+    /// No remainder is possible — the caller must split the input first using
+    /// [`slice_to_blocks`] and handle any remainder separately.
+    fn encode_blocks(
+        &mut self,
+        blocks: &[Self::Block],
+        out: &mut Vec<u32>,
+    ) -> Result<(), FastPForError>;
+
+    /// Decompress blocks from `input`, using the length stored in the header.
+    ///
+    /// When `expected_len` is `Some(n)`:
+    /// - Validates that the header value equals `n` (must be a multiple of
+    ///   [`size`](BlockCodec::size)).
+    ///
+    /// When `expected_len` is `None`:
+    /// - Validates the header value against
+    ///   [`max_decompressed_len`](BlockCodec::max_decompressed_len)(`input.len()`)
+    ///   to avoid allocation from malicious or corrupted data.
+    fn decode_blocks(
+        &mut self,
+        input: &[u32],
+        expected_len: Option<u32>,
+        out: &mut Vec<u32>,
+    ) -> Result<usize, FastPForError>;
+
+    /// Maximum decompressed element count for a given compressed input length.
+    /// Reject `expected_len` values exceeding this to avoid allocation from bad data.
+    #[inline]
+    #[must_use]
+    fn max_decompressed_len(compressed_words: usize) -> usize
+    where
+        Self: Sized,
+    {
+        default_max_decoded_len(compressed_words)
+    }
+}
+
+/// Codec that supports compressing 64-bit integers into a 32-bit word stream.
+///
+/// Only three C++ codecs implement this trait: `CppFastPFor128`,
+/// `CppFastPFor256`, and `CppVarInt`. For simple use, call
+/// `encode64` / `decode64` directly on the struct — no trait import required.
+///
+/// Import `BlockCodec64` only when writing generic code over multiple codecs
+/// that support 64-bit compression.
+#[cfg(feature = "cpp")]
+pub trait BlockCodec64 {
+    /// Compress 64-bit integers into a 32-bit word stream.
+    fn encode64(&mut self, input: &[u64], out: &mut Vec<u32>) -> Result<(), FastPForError>;
+    /// Decompress 64-bit integers from a 32-bit word stream.
+    fn decode64(&mut self, input: &[u32], out: &mut Vec<u64>) -> Result<(), FastPForError>;
+}
+
+/// Compresses and decompresses an arbitrary-length `&[u32]` slice.
+///
+/// Handles any input length including sub-block remainders.  All pure
+/// variable-length codecs (e.g. `VariableByte`, `JustCopy`) implement this
+/// trait directly.  Block-oriented codecs are wrapped in `CompositeCodec`
+/// to produce an `AnyLenCodec`.
+pub trait AnyLenCodec {
+    /// Compress an arbitrary-length slice of `u32` values.
+    fn encode(&mut self, input: &[u32], out: &mut Vec<u32>) -> Result<(), FastPForError>;
+
+    /// Maximum decompressed element count for a given compressed input length.
+    /// Reject `expected_len` values exceeding this to avoid allocation from bad data.
+    #[inline]
+    #[must_use]
+    fn max_decompressed_len(compressed_words: usize) -> usize
+    where
+        Self: Sized,
+    {
+        default_max_decoded_len(compressed_words)
+    }
+
+    /// Decompress a previously compressed slice of `u32` values.
+    ///
+    /// When `expected_len` is `Some(n)`:
+    /// - Rejects if `n` exceeds [`max_decompressed_len`](AnyLenCodec::max_decompressed_len)(`input.len()`)
+    /// - Pre-allocates output capacity for `n` elements
+    /// - Returns [`DecodedCountMismatch`](crate::FastPForError::DecodedCountMismatch) if the
+    ///   actual decoded count differs from `n`
+    ///
+    /// The hint is not trusted: values from untrusted metadata are capped before use.
+    fn decode(
+        &mut self,
+        input: &[u32],
+        out: &mut Vec<u32>,
+        expected_len: Option<u32>,
+    ) -> Result<(), FastPForError>;
+}
+
+/// Split a flat `&[u32]` into `(&[Blocks::Block], &[u32])` without copying.
+///
+/// Uses `size_of::<Blocks::Block>() / 4` to determine the block
+/// size, then [`cast_slice`] for the zero-copy reinterpretation.
+///
+/// The first return value is the largest aligned prefix; the second is the
+/// sub-block remainder (`0..block_size - 1` values) that the caller must
+/// handle separately (e.g. with a `VariableByte` tail).
+///
+/// # Example
+///
+/// ```rust,ignore
+/// let data: Vec<u32> = (0..600).collect(); // 2 × 256 + 88 remainder
+/// let (blocks, remainder) = slice_to_blocks::<FastPForBlock256>(&data);
+/// assert_eq!(blocks.len(), 2);    // 2 blocks of [u32; 256]
+/// assert_eq!(remainder.len(), 88);
+/// ```
+#[must_use]
+pub fn slice_to_blocks<Blocks: BlockCodec + Sized>(input: &[u32]) -> (&[Blocks::Block], &[u32]) {
+    let block_u32s = Blocks::size();
+    let aligned_down = (input.len() / block_u32s) * block_u32s;
+    let (aligned, remainder) = input.split_at(aligned_down);
+    let blocks: &[Blocks::Block] = cast_slice(aligned); // must not panic
+    (blocks, remainder)
+}
diff --git a/src/cpp/README.md b/src/cpp/README.md
deleted file mode 100644
index 585d05a..0000000
--- a/src/cpp/README.md
+++ /dev/null
@@ -1,147 +0,0 @@
-# `FastPFOR` Rust Wrapper
-
-Rust wrapper for the [FastPFOR C++ library](https://github.com/fast-pack/FastPFor), providing fast integer compression codecs optimized for sorted sequences.
-
-## Quick Start
-
-```rust
-use fastpfor::cpp::{FastPFor128Codec, Codec32};
-
-let codec = FastPFor128Codec::new();
-let input = vec![1, 2, 3, 4, 5, 100, 200, 300];
-let mut compressed = vec![0u32; input.len() + 1024];
-
-let encoded = codec.encode32(&input, &mut compressed).unwrap();
-let mut decompressed = vec![0u32; input.len()];
-let decoded = codec.decode32(encoded, &mut decompressed).unwrap();
-
-assert_eq!(decoded, input.as_slice());
-```
-
-### 64-bit Integers
-
-Some codecs support 64-bit integers via the [`Codec64`] trait:
-
-```rust
-use fastpfor::cpp::{FastPFor128Codec, Codec64};
-
-let codec = FastPFor128Codec::new();
-let input = vec![100u64, 200, 300, 400];
-let mut compressed = vec![0u32; input.len() * 2 + 1024];
-let encoded = codec.encode64(&input, &mut compressed).unwrap();
-
-let mut decompressed = vec![0u64; input.len()];
-let decoded = codec.decode64(encoded, &mut decompressed).unwrap();
-assert_eq!(decoded, input.as_slice());
-```
-
-See [`Codec32`] and [`Codec64`] trait documentation for buffer sizing guidelines.
-
-## Codec Selection
-
-> **Note:** See individual codec documentation below for detailed descriptions and use-case recommendations.
-
-### General Purpose (Recommended)
-
-- [`FastPFor128Codec`], [`FastPFor256Codec`] - Best all-around choice. Fast decode, good compression for sorted/clustered data. Support 64-bit.
-- [`SimdFastPFor128Codec`], [`SimdFastPFor256Codec`] - SIMD-optimized variants for maximum throughput
-
-### Patched Frame-of-Reference Variants
-
-Frame-of-reference encoding with exception handling. Excellent for monotonic sequences (timestamps, IDs).
-
-- [`PForCodec`] - Standard implementation
-- [`SimplePForCodec`] - Simplified variant with lower complexity
-- [`NewPForCodec`] - Enhanced exception handling
-- [`OptPForCodec`] - Optimized for common patterns
-- [`PFor2008Codec`] - Reference implementation from research paper
-- **SIMD variants:** [`SimdPForCodec`], [`SimdNewPForCodec`], [`SimdOptPForCodec`], [`SimdSimplePForCodec`]
-
-### Binary Packing
-
-Bit-packing based on maximum bit width. Good for uniform data distributions.
-
-- [`BP32Codec`] - Standard 32-bit block binary packing
-- [`FastBinaryPacking8Codec`], [`FastBinaryPacking16Codec`], [`FastBinaryPacking32Codec`] - Different block sizes
-- [`SimdBinaryPackingCodec`] - SIMD-optimized variant
-
-### Variable Byte Encoding
-
-Best for unsorted data and small integers. Simple and widely compatible.
-
-- [`VByteCodec`] - Standard variable byte encoding (1-5 bytes per integer)
-- [`VarIntCodec`] - Standard varint format. Supports 64-bit.
-- [`VarIntGbCodec`] - Group varint with shared control information
-- **SIMD variants:** [`MaskedVByteCodec`], [`StreamVByteCodec`] - Excellent decode speed on modern CPUs
-
-### Simple Encodings
-
-Efficient for small positive integers (typically < 2^16).
-**Does not support arbitrary u32 inputs.**
-
-- [`Simple16Codec`] - 16 packing modes in 32-bit words
-- [`Simple9Codec`] - 9 packing modes for flexibility
-- [`Simple8bCodec`] - 8 packing modes in 64-bit words
-- [`Simple9RleCodec`], [`Simple8bRleCodec`] - With run-length encoding for repeated values
-- [`SimdGroupSimpleCodec`], [`SimdGroupSimpleRingBufCodec`] - SIMD-optimized
-
-### Utility
-
-- [`CopyCodec`] - No compression (baseline for benchmarking)
-
-## Thread Safety
-
-Codec instances **have internal state** that is **cleared after each operation**.
-They are **not thread-safe** during concurrent encode/decode operations.
-
-Use one of these strategies:
-
-- Create separate codec instances per thread
-- Synchronize access with mutexes
-- Use thread-local storage for codec instances
-
-## Architecture
-
-This module uses [CXX](https://cxx.rs/) to safely bridge Rust and C++:
-
-- Each codec wraps a C++ `IntegerCODEC` instance via [`UniquePtr`]
-- The [`Codec32`] and [`Codec64`] traits provide the Rust API
-- Memory is automatically managed by CXX and Rust's ownership system
-
-See the [FastPFOR C++ library documentation](https://github.com/fast-pack/FastPFor) for underlying implementation details.
-
-[`Codec32`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/trait.Codec32.html
-[`Codec64`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/trait.Codec64.html
-[`Exception`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/type.Exception.html
-[`UniquePtr`]: https://docs.rs/cxx/latest/cxx/struct.UniquePtr.html
-[`BP32Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.BP32Codec.html
-[`CopyCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.CopyCodec.html
-[`FastBinaryPacking8Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.FastBinaryPacking8Codec.html
-[`FastBinaryPacking16Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.FastBinaryPacking16Codec.html
-[`FastBinaryPacking32Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.FastBinaryPacking32Codec.html
-[`FastPFor128Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.FastPFor128Codec.html
-[`FastPFor256Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.FastPFor256Codec.html
-[`MaskedVByteCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.MaskedVByteCodec.html
-[`NewPForCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.NewPForCodec.html
-[`OptPForCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.OptPForCodec.html
-[`PFor2008Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.PFor2008Codec.html
-[`PForCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.PForCodec.html
-[`SimdBinaryPackingCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimdBinaryPackingCodec.html
-[`SimdFastPFor128Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimdFastPFor128Codec.html
-[`SimdFastPFor256Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimdFastPFor256Codec.html
-[`SimdGroupSimpleCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimdGroupSimpleCodec.html
-[`SimdGroupSimpleRingBufCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimdGroupSimpleRingBufCodec.html
-[`SimdNewPForCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimdNewPForCodec.html
-[`SimdOptPForCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimdOptPForCodec.html
-[`SimdPForCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimdPForCodec.html
-[`SimdSimplePForCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimdSimplePForCodec.html
-[`Simple16Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.Simple16Codec.html
-[`Simple8bCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.Simple8bCodec.html
-[`Simple8bRleCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.Simple8bRleCodec.html
-[`Simple9Codec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.Simple9Codec.html
-[`Simple9RleCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.Simple9RleCodec.html
-[`SimplePForCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.SimplePForCodec.html
-[`StreamVByteCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.StreamVByteCodec.html
-[`VByteCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.VByteCodec.html
-[`VarIntCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.VarIntCodec.html
-[`VarIntGbCodec`]: https://docs.rs/fastpfor/latest/fastpfor/cpp/struct.VarIntGbCodec.html
diff --git a/src/cpp/codecs.rs b/src/cpp/codecs.rs
new file mode 100644
index 0000000..e8d4743
--- /dev/null
+++ b/src/cpp/codecs.rs
@@ -0,0 +1,277 @@
+use cxx::UniquePtr;
+
+use crate::FastPForError;
+use crate::codec::{AnyLenCodec, BlockCodec64};
+use crate::cpp::ffi;
+use crate::cpp::wrappers::{
+    decode32_anylen_ffi, decode64_to_vec_ffi, encode32_to_vec_ffi, encode64_to_vec_ffi,
+};
+
+// ── Codec struct definitions ─────────────────────────────────────────────────
+//
+// Single macro: all C++ codecs implement AnyLenCodec. Codecs marked with `@ 64`
+// also implement BlockCodec64 for 64-bit integer support.
+
+/// Macro for C++ codec wrappers: struct + Default + `AnyLenCodec`.
+macro_rules! implement_cpp_codecs {
+    ($(
+        $(#[$($attrs:tt)*])*
+        $name:ident => $ffi:ident ,
+    )*) => {
+        $(
+            $(#[$($attrs)*])*
+            pub struct $name(UniquePtr<ffi::IntegerCODEC>);
+
+            impl $name {
+                /// Creates a new instance of this codec.
+                #[must_use]
+                pub fn new() -> Self {
+                    Self(ffi::$ffi())
+                }
+            }
+
+            impl Default for $name {
+                fn default() -> Self {
+                    Self::new()
+                }
+            }
+
+            impl AnyLenCodec for $name {
+                fn encode(&mut self, input: &[u32], out: &mut Vec<u32>) -> Result<(), FastPForError> {
+                    encode32_to_vec_ffi(&self.0, input, out)
+                }
+
+                fn decode(
+                    &mut self,
+                    input: &[u32],
+                    out: &mut Vec<u32>,
+                    expected_len: Option<u32>,
+                ) -> Result<(), FastPForError> {
+                    decode32_anylen_ffi(&self.0, input, out, expected_len)
+                }
+            }
+        )*
+    };
+}
+
+// ── All C++ codecs (composite / any-length) ─
+implement_cpp_codecs! {
+    /// Binary Packing codec optimized for 32-bit blocks.
+    BP32Codec => BP32_codec,
+
+    /// Copy codec that performs no compression.
+    CopyCodec => copy_codec,
+
+    /// Fast binary packing with 8-bit processing blocks.
+    FastBinaryPacking8Codec => fastbinarypacking8_codec,
+
+    /// Fast binary packing with 16-bit processing blocks.
+    FastBinaryPacking16Codec => fastbinarypacking16_codec,
+
+    /// Fast binary packing with 32-bit processing blocks.
+    FastBinaryPacking32Codec => fastbinarypacking32_codec,
+
+    /// Fast [`PForCodec`] with 128-value processing blocks.
+    FastPFor128Codec => fastpfor128_codec,
+
+    /// Fast [`PForCodec`] with 256-value processing blocks.
+    FastPFor256Codec => fastpfor256_codec,
+
+    /// Masked [`VByteCodec`] with SIMD optimizations.
+    MaskedVByteCodec => maskedvbyte_codec,
+
+    /// New [`PForCodec`].
+    NewPForCodec => newpfor_codec,
+
+    /// Optimized [`PForCodec`].
+    OptPForCodec => optpfor_codec,
+
+    /// [`PForCodec`] based on the 2008 research paper.
+    PFor2008Codec => pfor2008_codec,
+
+    /// Standard Patched Frame of Reference codec.
+    PForCodec => pfor_codec,
+
+    /// SIMD-accelerated binary packing codec.
+    SimdBinaryPackingCodec => simdbinarypacking_codec,
+
+    /// SIMD-optimized [`FastPFor128Codec`] with 128-value blocks.
+    SimdFastPFor128Codec => simdfastpfor128_codec,
+
+    /// SIMD-optimized [`FastPFor256Codec`] with 256-value blocks.
+    SimdFastPFor256Codec => simdfastpfor256_codec,
+
+    /// SIMD group simple codec.
+    SimdGroupSimpleCodec => simdgroupsimple_codec,
+
+    /// SIMD group simple codec with ring buffer optimization.
+    SimdGroupSimpleRingBufCodec => simdgroupsimple_ringbuf_codec,
+
+    /// SIMD-accelerated [`NewPForCodec`].
+    SimdNewPForCodec => simdnewpfor_codec,
+
+    /// SIMD-accelerated [`OptPForCodec`].
+    SimdOptPForCodec => simdoptpfor_codec,
+
+    /// SIMD-accelerated [`PForCodec`].
+    SimdPForCodec => simdpfor_codec,
+
+    /// SIMD-accelerated [`SimplePForCodec`].
+    SimdSimplePForCodec => simdsimplepfor_codec,
+
+    /// Simple-16 encoding scheme.
+    Simple16Codec => simple16_codec,
+
+    /// Simple-8b encoding scheme.
+    Simple8bCodec => simple8b_codec,
+
+    /// Simple-8b encoding with run-length encoding.
+    Simple8bRleCodec => simple8b_rle_codec,
+
+    /// Simple-9 encoding scheme.
+    Simple9Codec => simple9_codec,
+
+    /// Simple-9 encoding with run-length encoding.
+    Simple9RleCodec => simple9_rle_codec,
+
+    /// Simple Patched Frame of Reference ([`PForCodec`]) codec.
+    SimplePForCodec => simplepfor_codec,
+
+    // SnappyCodec => snappy_codec,  // Conditional with #ifdef
+
+    /// [`StreamVByteCodec`](https://github.com/lemire/streamvbyte) encoding for fast variable-byte compression.
+    StreamVByteCodec => streamvbyte_codec,
+
+    /// Standard variable-byte encoding.
+    VByteCodec => vbyte_codec,
+
+    /// Variable-length integer encoding.
+    VarIntCodec => varint_codec,
+
+    // VarIntG8iuCodec => varintg8iu_codec,  // Conditional with #ifdef
+
+    /// Group Variable-length integer encoding with optimizations.
+    VarIntGbCodec => varintgb_codec,
+
+    // VsEncodingCodec => vsencoding_codec,  // This is leaking memory
+}
+
+/// Adds `BlockCodec64` impl for codecs that support 64-bit integers.
+macro_rules! implement_cpp_codecs_64 {
+    ($($name:ident => $ffi:ident ,)*) => {
+        $(
+            impl BlockCodec64 for $name {
+                fn encode64(&mut self, input: &[u64], out: &mut Vec<u32>) -> Result<(), FastPForError> {
+                    encode64_to_vec_ffi(&self.0, input, out)
+                }
+                fn decode64(&mut self, input: &[u32], out: &mut Vec<u64>) -> Result<(), FastPForError> {
+                    decode64_to_vec_ffi(&self.0, input, out)
+                }
+            }
+        )*
+    };
+}
+
+implement_cpp_codecs_64! {
+    FastPFor128Codec => fastpfor128_codec,
+    FastPFor256Codec => fastpfor256_codec,
+    VarIntCodec => varint_codec,
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use crate::codec::{AnyLenCodec, BlockCodec64};
+    use crate::cpp::codecs::{FastPFor128Codec, FastPFor256Codec, VByteCodec, VarIntCodec};
+
+    pub fn roundtrip_32(codec: &mut (impl AnyLenCodec + ?Sized), input: &[u32]) {
+        let mut compressed = Vec::new();
+        codec.encode(input, &mut compressed).unwrap();
+        let mut decoded = Vec::new();
+        codec.decode(&compressed, &mut decoded, None).unwrap();
+        assert_eq!(decoded, input);
+    }
+
+    /// C++ `fastpfor256_codec` returns `CompositeCodec<FastPFor<8>, VariableByte>` — already
+    /// any-length. Use it directly; do not wrap in Rust `CompositeCodec`.
+    #[test]
+    fn test_cpp_fastpfor256_composite_anylen() {
+        let mut codec = FastPFor256Codec::new();
+        roundtrip_32(&mut codec, &[1, 2, 3, 4, 5]);
+        let data: Vec<u32> = (0..600).collect();
+        roundtrip_32(&mut codec, &data);
+    }
+
+    #[test]
+    fn test_fastpfor128_anylen() {
+        let data: Vec<u32> = (0..128).collect();
+        roundtrip_32(&mut FastPFor128Codec::new(), &data);
+    }
+
+    #[test]
+    fn test_fastpfor256_anylen() {
+        let data: Vec<u32> = (0..256).collect();
+        roundtrip_32(&mut FastPFor256Codec::new(), &data);
+    }
+
+    #[test]
+    fn test_fastpfor256_u64() {
+        let input: Vec<u64> = (0..256).collect();
+        let mut codec = FastPFor256Codec::new();
+        let mut compressed = Vec::new();
+        codec.encode64(&input, &mut compressed).unwrap();
+        let mut decoded = Vec::new();
+        codec.decode64(&compressed, &mut decoded).unwrap();
+        assert_eq!(decoded, input);
+    }
+
+    #[test]
+    fn test_varint_u64() {
+        let input = vec![1u64, 2, 3, 4, 5];
+        let mut codec = VarIntCodec::new();
+        let mut compressed = Vec::new();
+        codec.encode64(&input, &mut compressed).unwrap();
+        let mut decoded = Vec::new();
+        codec.decode64(&compressed, &mut decoded).unwrap();
+        assert_eq!(decoded, input);
+    }
+
+    #[test]
+    fn test_decode32_empty_input() {
+        let mut codec = VByteCodec::new();
+        let mut out = Vec::new();
+        codec.decode(&[], &mut out, None).unwrap();
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn test_decode32_cpp_empty_format() {
+        let mut codec = FastPFor128Codec::new();
+        let mut out = Vec::new();
+        codec.decode(&[0u32], &mut out, Some(0)).unwrap();
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn test_decode64_empty_input() {
+        let mut codec = FastPFor256Codec::new();
+        let mut out: Vec<u64> = Vec::new();
+        codec.decode64(&[], &mut out).unwrap();
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn test_decode64_empty_format() {
+        let mut codec = VarIntCodec::new();
+        let mut out: Vec<u64> = Vec::new();
+        codec.decode64(&[], &mut out).unwrap();
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn test_decode_empty_input() {
+        let mut codec = FastPFor128Codec::new();
+        let mut out = Vec::new();
+        codec.decode(&[], &mut out, None).unwrap();
+        assert!(out.is_empty());
+    }
+}
diff --git a/src/cpp/mod.rs b/src/cpp/mod.rs
index e672ecc..8aae284 100644
--- a/src/cpp/mod.rs
+++ b/src/cpp/mod.rs
@@ -1,10 +1,16 @@
-#![doc = include_str!("README.md")]
-
-// Re-export CXX Exception type to simplify usage
-pub use cxx::Exception;
-use cxx::UniquePtr;
+//! C++ codec wrappers — see the [crate-level documentation](crate) for usage and codec selection.
+//!
+//! All C++ codecs are composite (any-length) and implement [`AnyLenCodec`] only.
+//! Codecs marked with `@ 64` also implement [`BlockCodec64`] for 64-bit integers.
+//!
+//! **Thread safety:** instances have internal state and are not thread-safe. Use one per thread.
+
+mod codecs;
+#[cfg(test)]
+mod tests;
+mod wrappers;
 
-use crate::CodecToSlice;
+pub use codecs::*;
 
 /// FFI bridge to the C++ FastPFOR library.
 ///
@@ -94,389 +100,3 @@ mod ffi {
         ) -> Result<usize>;
     }
 }
-
-/// Internal trait for providing access to the underlying C++ codec.
-trait CodecWrapper {
-    fn codec(&self) -> &UniquePtr<ffi::IntegerCODEC>;
-}
-
-/// Trait for codecs that support 32-bit integer compression.
-///
-/// # Example
-///
-/// ```no_run
-/// # use fastpfor::cpp::{BP32Codec, Codec32};
-/// let codec = BP32Codec::new();
-/// let data = vec![10, 20, 30, 40];
-/// let mut compressed = vec![0u32; data.len() + 1024];
-///
-/// let encoded = codec.encode32(&data, &mut compressed).unwrap();
-/// println!("Compressed {} integers into {} words", data.len(), encoded.len());
-///
-/// let mut decompressed = vec![0u32; data.len()];
-/// let result = codec.decode32(encoded, &mut decompressed).unwrap();
-/// assert_eq!(result, &data[..]);
-/// ```
-#[expect(private_bounds)]
-pub trait Codec32: CodecWrapper {
-    /// Encodes 32-bit integers into compressed form.
-    ///
-    /// Returns a mutable slice containing only the compressed data (a sub-slice of `output`).
-    fn encode32<'out>(
-        &self,
-        input: &[u32],
-        output: &'out mut [u32],
-    ) -> Result<&'out mut [u32], Exception> {
-        let n = ffi::codec_encode32(self.codec(), input, output)?;
-        Ok(&mut output[..n])
-    }
-
-    /// Decodes compressed 32-bit integers.
-    ///
-    /// Returns a mutable slice containing only the decompressed data (a sub-slice of `output`).
-    fn decode32<'out>(
-        &self,
-        input: &[u32],
-        output: &'out mut [u32],
-    ) -> Result<&'out mut [u32], Exception> {
-        let n = ffi::codec_decode32(self.codec(), input, output)?;
-        Ok(&mut output[..n])
-    }
-}
-
-impl<C: Codec32> CodecToSlice<u32> for C {
-    type Error = Exception;
-
-    fn compress_to_slice<'out>(
-        &mut self,
-        input: &[u32],
-        output: &'out mut [u32],
-    ) -> Result<&'out [u32], Self::Error> {
-        let result = self.encode32(input, output)?;
-        Ok(result)
-    }
-
-    fn decompress_to_slice<'out>(
-        &mut self,
-        input: &[u32],
-        output: &'out mut [u32],
-    ) -> Result<&'out [u32], Self::Error> {
-        let result = self.decode32(input, output)?;
-        Ok(result)
-    }
-}
-
-// Note: 64-bit integers are compressed into 32-bit word arrays.
-impl<C: Codec64> CodecToSlice<u64, u32> for C {
-    type Error = Exception;
-
-    fn compress_to_slice<'out>(
-        &mut self,
-        input: &[u64],
-        output: &'out mut [u32],
-    ) -> Result<&'out [u32], Self::Error> {
-        let result = self.encode64(input, output)?;
-        Ok(result)
-    }
-
-    fn decompress_to_slice<'out>(
-        &mut self,
-        input: &[u32],
-        output: &'out mut [u64],
-    ) -> Result<&'out [u64], Self::Error> {
-        let result = self.decode64(input, output)?;
-        Ok(result)
-    }
-}
-
-/// Trait for codecs that support 64-bit integer compression.
-///
-/// Only certain codecs support 64-bit integers. These are marked with the `@ 64`
-/// annotation in the `implement_codecs!` macro invocation.
-///
-/// # Compressed Format
-///
-/// 64-bit integers are compressed into 32-bit word arrays for compatibility with
-/// the underlying C++ library.
-///
-/// # Example
-///
-/// ```no_run
-/// # use fastpfor::cpp::{FastPFor128Codec, Codec64};
-/// let codec = FastPFor128Codec::new();
-/// let data = vec![100u64, 200, 300, 400];
-/// let mut compressed = vec![0u32; data.len() * 2 + 1024];
-///
-/// let encoded = codec.encode64(&data, &mut compressed).unwrap();
-/// println!("Compressed {} 64-bit integers into {} words", data.len(), encoded.len());
-///
-/// let mut decompressed = vec![0u64; data.len()];
-/// let result = codec.decode64(encoded, &mut decompressed).unwrap();
-/// assert_eq!(result, &data[..]);
-/// ```
-#[expect(private_bounds)]
-pub trait Codec64: CodecWrapper {
-    /// Encodes 64-bit integers into compressed 32-bit word form.
-    ///
-    /// Returns a mutable slice containing only the compressed data (a sub-slice of `output`).
-    fn encode64<'out>(
-        &self,
-        input: &[u64],
-        output: &'out mut [u32],
-    ) -> Result<&'out mut [u32], Exception> {
-        let n = ffi::codec_encode64(self.codec(), input, output)?;
-        Ok(&mut output[..n])
-    }
-
-    /// Decodes 64-bit integers from compressed 32-bit word form.
-    ///
-    /// Returns a mutable slice containing only the decompressed data (a sub-slice of `output`).
-    fn decode64<'out>(
-        &self,
-        input: &[u32],
-        output: &'out mut [u64],
-    ) -> Result<&'out mut [u64], Exception> {
-        let n = ffi::codec_decode64(self.codec(), input, output)?;
-        Ok(&mut output[..n])
-    }
-}
-
-/// Macro to generate codec wrapper types and their implementations.
-///
-/// The `@ 64` marker indicates that a codec supports 64-bit integers.
-macro_rules! implement_codecs {
-    ($(
-        $(#[$($attrs:tt)*])*
-        $name:ident $(@ $is_64:literal)? => $ffi:ident ,
-    )*) => {
-        $(
-            $(#[$($attrs)*])*
-            pub struct $name(UniquePtr<ffi::IntegerCODEC>);
-
-            impl $name {
-                /// Creates a new instance of this codec.
-                pub fn new() -> Self {
-                    Self(ffi::$ffi())
-                }
-            }
-
-            impl Default for $name {
-                fn default() -> Self {
-                    Self::new()
-                }
-            }
-
-            impl CodecWrapper for $name {
-                fn codec(&self) -> &UniquePtr<ffi::IntegerCODEC> {
-                    &self.0
-                }
-            }
-
-            impl Codec32 for $name {}
-            $(
-                // hack to only expand this block if $is_64 is set
-                const _ : () = { let _ = $is_64; };
-                impl Codec64 for $name {}
-            )*
-        )*
-
-        #[cfg(test)]
-        mod codec_tests {
-            use super::*;
-
-            $(
-                #[test]
-                #[expect(non_snake_case)]
-                fn $name() {
-                    $(
-                        // hack to only expand this block if $is_64 is set
-                        const _ : () = { let _ = $is_64; };
-                        roundtrip_64($name::new());
-                    )*
-                    roundtrip_32($name::new());
-                }
-            )*
-
-            fn roundtrip_32(codec: impl Codec32) {
-                let input = vec![1, 2, 3, 4, 5];
-                let mut output = vec![0; 10];
-                let encoded = codec.encode32(&input, &mut output).unwrap();
-                let mut decoded = vec![0; 10];
-                let decoded = codec.decode32(encoded, &mut decoded).unwrap();
-                assert_eq!(decoded, input);
-            }
-
-            fn roundtrip_64(codec: impl Codec64) {
-                let input = vec![1, 2, 3, 4, 5];
-                let mut output = vec![0; 10];
-                let encoded = codec.encode64(&input, &mut output).unwrap();
-
-                let mut decoded = vec![0; 10];
-                let decoded = codec.decode64(encoded, &mut decoded).unwrap();
-                assert_eq!(decoded, input);
-            }
-        }
-    };
-}
-
-// Codec implementations generated by the macro above.
-//
-// Codecs marked with `@ 64` support both 32-bit and 64-bit integers.
-implement_codecs! {
-    /// Binary Packing codec optimized for 32-bit blocks.
-    BP32Codec => BP32_codec,
-
-    /// Copy codec that performs no compression.
-    CopyCodec => copy_codec,
-
-    /// Fast binary packing with 8-bit processing blocks.
-    FastBinaryPacking8Codec => fastbinarypacking8_codec,
-
-    /// Fast binary packing with 16-bit processing blocks.
-    FastBinaryPacking16Codec => fastbinarypacking16_codec,
-
-    /// Fast binary packing with 32-bit processing blocks.
-    FastBinaryPacking32Codec => fastbinarypacking32_codec,
-
-    /// Fast [`PForCodec`] with 128-value processing blocks.
-    FastPFor128Codec @ 64 => fastpfor128_codec,
-
-    /// Fast [`PForCodec`] with 256-value processing blocks.
-    FastPFor256Codec @ 64 => fastpfor256_codec,
-
-    /// Masked [`VByteCodec`] with SIMD optimizations.
-    MaskedVByteCodec => maskedvbyte_codec,
-
-    /// New [`PForCodec`].
-    NewPForCodec => newpfor_codec,
-
-    /// Optimized [`PForCodec`].
-    OptPForCodec => optpfor_codec,
-
-    /// [`PForCodec`] based on the 2008 research paper.
-    PFor2008Codec => pfor2008_codec,
-
-    /// Standard Patched Frame of Reference codec.
-    PForCodec => pfor_codec,
-
-    /// SIMD-accelerated binary packing codec.
-    SimdBinaryPackingCodec => simdbinarypacking_codec,
-
-    /// SIMD-optimized [`FastPFor128Codec`] with 128-value blocks.
-    SimdFastPFor128Codec => simdfastpfor128_codec,
-
-    /// SIMD-optimized [`FastPFor256Codec`] with 256-value blocks.
-    SimdFastPFor256Codec => simdfastpfor256_codec,
-
-    /// SIMD group simple codec.
-    SimdGroupSimpleCodec => simdgroupsimple_codec,
-
-    /// SIMD group simple codec with ring buffer optimization.
-    SimdGroupSimpleRingBufCodec => simdgroupsimple_ringbuf_codec,
-
-    /// SIMD-accelerated [`NewPForCodec`].
-    SimdNewPForCodec => simdnewpfor_codec,
-
-    /// SIMD-accelerated [`OptPForCodec`].
-    SimdOptPForCodec => simdoptpfor_codec,
-
-    /// SIMD-accelerated [`PForCodec`].
-    SimdPForCodec => simdpfor_codec,
-
-    /// SIMD-accelerated [`SimplePForCodec`].
-    SimdSimplePForCodec => simdsimplepfor_codec,
-
-    /// Simple-16 encoding scheme.
-    Simple16Codec => simple16_codec,
-
-    /// Simple-8b encoding scheme.
-    Simple8bCodec => simple8b_codec,
-
-    /// Simple-8b encoding with run-length encoding.
-    Simple8bRleCodec => simple8b_rle_codec,
-
-    /// Simple-9 encoding scheme.
-    Simple9Codec => simple9_codec,
-
-    /// Simple-9 encoding with run-length encoding.
-    Simple9RleCodec => simple9_rle_codec,
-
-    /// Simple Patched Frame of Reference ([`PForCodec`]) codec.
-    SimplePForCodec => simplepfor_codec,
-
-    // SnappyCodec => snappy_codec,  // Conditional with #ifdef
-
-    /// [`StreamVByte`](https://github.com/lemire/streamvbyte) encoding for fast variable-byte compression.
-    StreamVByteCodec => streamvbyte_codec,
-
-    /// Standard variable-byte encoding.
-    VByteCodec => vbyte_codec,
-
-    /// Variable-length integer encoding.
-    VarIntCodec @ 64 => varint_codec,
-
-    // VarIntG8iuCodec => varintg8iu_codec,  // Conditional with #ifdef
-
-    /// Group Variable-length integer encoding with optimizations.
-    VarIntGbCodec => varintgb_codec,
-
-    // VsEncodingCodec => vsencoding_codec,  // This is leaking memory
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    // These duplicate the macro-generated tests, but we want to test that
-    // the macro expansion itself works correctly
-
-    #[test]
-    fn test_32() {
-        let mut codec = FastPFor128Codec::new();
-        let input = vec![1, 2, 3, 4, 5];
-        let mut output = vec![0; 10];
-        let mut output2 = vec![0; 10];
-        let mut output3 = vec![0; 10];
-        let encoded = codec.encode32(&input, &mut output).unwrap();
-        let encoded2 = codec.encode32(&input, &mut output2).unwrap();
-        let encoded3 = codec.compress_to_slice(&input, &mut output3).unwrap();
-        assert_eq!(encoded, encoded2);
-        assert_eq!(encoded, encoded3);
-
-        let mut decoded = vec![0; 10];
-        let mut decoded2 = vec![0; 10];
-        let mut decoded3 = vec![0; 10];
-        let decoded = codec.decode32(encoded, &mut decoded).unwrap();
-        let decoded2 = codec.decode32(encoded, &mut decoded2).unwrap();
-        let decoded3 = codec.decompress_to_slice(encoded, &mut decoded3).unwrap();
-        assert_eq!(decoded, decoded2);
-        assert_eq!(decoded, decoded3);
-
-        assert_eq!(decoded, input);
-    }
-
-    #[test]
-    fn test_64() {
-        let mut codec = FastPFor128Codec::new();
-        let input = vec![1, 2, 3, 4, 5];
-        let mut output = vec![0; 10];
-        let mut output2 = vec![0; 10];
-        let mut output3 = vec![0; 10];
-        let encoded = codec.encode64(&input, &mut output).unwrap();
-        let encoded2 = codec.encode64(&input, &mut output2).unwrap();
-        let encoded3 = codec.compress_to_slice(&input, &mut output3).unwrap();
-        assert_eq!(encoded, encoded2);
-        assert_eq!(encoded, encoded3);
-
-        let mut decoded = vec![0; 10];
-        let mut decoded2 = vec![0; 10];
-        let mut decoded3 = vec![0; 10];
-        let decoded = codec.decode64(encoded, &mut decoded).unwrap();
-        let decoded2 = codec.decode64(encoded, &mut decoded2).unwrap();
-        let decoded3 = codec.decompress_to_slice(encoded, &mut decoded3).unwrap();
-        assert_eq!(decoded, decoded2);
-        assert_eq!(decoded, decoded3);
-
-        assert_eq!(decoded, input);
-    }
-}
diff --git a/src/cpp/tests.rs b/src/cpp/tests.rs
new file mode 100644
index 0000000..3147391
--- /dev/null
+++ b/src/cpp/tests.rs
@@ -0,0 +1,123 @@
+use crate::cpp::codecs::tests::roundtrip_32;
+
+// Test all codecs compile and do a basic 32-bit roundtrip
+macro_rules! test_anylen {
+        ($($name:ident),*) => {
+            $(
+                #[test]
+                #[allow(non_snake_case)]
+                fn $name() {
+                    roundtrip_32(&mut $crate::cpp::$name::new(), &[1u32, 2, 3, 4, 5]);
+                }
+            )*
+        };
+    }
+
+test_anylen!(
+    BP32Codec,
+    CopyCodec,
+    FastBinaryPacking8Codec,
+    FastBinaryPacking16Codec,
+    FastBinaryPacking32Codec,
+    FastPFor128Codec,
+    FastPFor256Codec,
+    MaskedVByteCodec,
+    NewPForCodec,
+    OptPForCodec,
+    PFor2008Codec,
+    PForCodec,
+    SimdBinaryPackingCodec,
+    SimdFastPFor128Codec,
+    SimdFastPFor256Codec,
+    SimdGroupSimpleCodec,
+    SimdGroupSimpleRingBufCodec,
+    SimdNewPForCodec,
+    SimdOptPForCodec,
+    SimdPForCodec,
+    SimdSimplePForCodec,
+    SimplePForCodec,
+    StreamVByteCodec,
+    VByteCodec,
+    VarIntCodec,
+    VarIntGbCodec
+);
+
+// Simple-9/16/8b codecs require values that fit in small bit widths and a
+// block-aligned count; test them separately with 128 small values.
+macro_rules! test_anylen_128 {
+        ($($name:ident),*) => {
+            $(
+                #[test]
+                #[allow(non_snake_case)]
+                fn $name() {
+                    let input: Vec<u32> = (1..=128).collect();
+                    roundtrip_32(&mut $crate::cpp::$name::new(), &input);
+                }
+            )*
+        };
+    }
+
+// Note: Simple9Rle crashes with heap corruption on various inputs; skip everywhere.
+test_anylen_128!(Simple16Codec, Simple8bCodec, Simple9Codec);
+
+// Simple8bRle reinterpret-casts uint32_t* → uint64_t* inside the C++ header,
+// which is UB on strict-alignment architectures (ARM64 requires 8-byte alignment
+// for 64-bit loads/stores and will SIGSEGV on unaligned access). The codec is
+// otherwise correct on x86/x86_64 where unaligned access is handled in hardware.
+// Tracked upstream; skip on aarch64 until fixed in the submodule.
+// #[cfg(not(target_arch = "aarch64"))]
+test_anylen_128!(Simple8bRleCodec);
+
+// Verify Default impl routes through new() for all generated codec types.
+macro_rules! test_default {
+        ($($name:ident),*) => {
+            $(
+                #[test]
+                #[allow(non_snake_case)]
+                fn $name() {
+                    let _codec = $crate::cpp::$name::default();
+                }
+            )*
+        };
+    }
+
+// Use a distinct prefix to avoid name collisions with test_anylen tests.
+mod default_impls {
+    test_default!(
+        BP32Codec,
+        CopyCodec,
+        FastBinaryPacking8Codec,
+        FastBinaryPacking16Codec,
+        FastBinaryPacking32Codec,
+        FastPFor128Codec,
+        FastPFor256Codec,
+        MaskedVByteCodec,
+        NewPForCodec,
+        OptPForCodec,
+        PFor2008Codec,
+        PForCodec,
+        SimdBinaryPackingCodec,
+        SimdFastPFor128Codec,
+        SimdFastPFor256Codec,
+        SimdGroupSimpleCodec,
+        SimdGroupSimpleRingBufCodec,
+        SimdNewPForCodec,
+        SimdOptPForCodec,
+        SimdPForCodec,
+        SimdSimplePForCodec,
+        Simple16Codec,
+        Simple8bCodec,
+        Simple8bRleCodec,
+        Simple9Codec,
+        SimplePForCodec,
+        StreamVByteCodec,
+        VByteCodec,
+        VarIntCodec,
+        VarIntGbCodec
+    );
+}
+
+mod default_impls2 {
+    // #[cfg(not(target_arch = "aarch64"))]
+    test_default!(Simple9RleCodec);
+}
diff --git a/src/cpp/wrappers.rs b/src/cpp/wrappers.rs
new file mode 100644
index 0000000..7d600a3
--- /dev/null
+++ b/src/cpp/wrappers.rs
@@ -0,0 +1,90 @@
+use cxx::UniquePtr;
+
+use crate::FastPForError;
+use crate::codec::default_max_decoded_len;
+use crate::cpp::ffi;
+use crate::helpers::AsUsize;
+
+/// Pass-through to C++ `encodeArray` / `decodeArray`. No extra header is added.
+///
+/// Block-based C++ codecs (`FastPFor`, `PFor`, etc.) store the original data length
+/// in their own wire format. Byte-oriented codecs (`VariableByte`) rely on the
+/// caller passing the encoded stream length to decode, which we have via `input.len()`.
+pub fn encode32_to_vec_ffi(
+    codec: &UniquePtr<ffi::IntegerCODEC>,
+    input: &[u32],
+    out: &mut Vec<u32>,
+) -> Result<(), FastPForError> {
+    let capacity = input.len() * 2 + 1024;
+    let start = out.len();
+    out.resize(start + capacity, 0);
+    let n = ffi::codec_encode32(codec, input, &mut out[start..])?;
+    out.truncate(start + n);
+    Ok(())
+}
+
+fn decode32_to_vec_ffi(
+    codec: &UniquePtr<ffi::IntegerCODEC>,
+    input: &[u32],
+    out: &mut Vec<u32>,
+    capacity: usize,
+) -> Result<(), FastPForError> {
+    if !input.is_empty() {
+        let start = out.len();
+        out.resize(start + capacity, 0);
+        let n = ffi::codec_decode32(codec, input, &mut out[start..])?;
+        out.truncate(start + n);
+    }
+    Ok(())
+}
+
+pub fn decode32_anylen_ffi(
+    codec: &UniquePtr<ffi::IntegerCODEC>,
+    input: &[u32],
+    out: &mut Vec<u32>,
+    expected_len: Option<u32>,
+) -> Result<(), FastPForError> {
+    let max = default_max_decoded_len(input.len());
+    let capacity = if let Some(n) = expected_len {
+        n.is_valid_expected(max)?
+    } else {
+        // C++ decodeArray needs output buffer capacity. Block codecs read count from stream;
+        // variable-byte decodes until input is consumed. Simple9/16 pack up to 28 values/word.
+        max
+    };
+    let start = out.len();
+    decode32_to_vec_ffi(codec, input, out, capacity)?;
+    if let Some(n) = expected_len {
+        (out.len() - start).is_decoded_mismatch(n)?;
+    }
+    Ok(())
+}
+
+pub fn encode64_to_vec_ffi(
+    codec: &UniquePtr<ffi::IntegerCODEC>,
+    input: &[u64],
+    out: &mut Vec<u32>,
+) -> Result<(), FastPForError> {
+    let capacity = input.len() * 3 + 1024;
+    let start = out.len();
+    out.resize(start + capacity, 0);
+    let n = ffi::codec_encode64(codec, input, &mut out[start..])?;
+    out.truncate(start + n);
+    Ok(())
+}
+
+pub fn decode64_to_vec_ffi(
+    codec: &UniquePtr<ffi::IntegerCODEC>,
+    input: &[u32],
+    out: &mut Vec<u64>,
+) -> Result<(), FastPForError> {
+    if !input.is_empty() {
+        // C++ decodeArray needs output buffer. Variable-byte can pack multiple values per word.
+        let capacity = input.len().saturating_mul(4);
+        let start = out.len();
+        out.resize(start + capacity, 0);
+        let n = ffi::codec_decode64(codec, input, &mut out[start..])?;
+        out.truncate(start + n);
+    }
+    Ok(())
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..522b2a1
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,48 @@
+use thiserror::Error;
+
+/// Alias for the result type of `FastPFor` operations.
+pub type FastPForResult<T> = Result<T, FastPForError>;
+
+/// Errors that can occur when using the `FastPFor` codecs.
+#[non_exhaustive]
+#[derive(Error, Debug)]
+pub enum FastPForError {
+    /// Unsupported operation
+    #[error("Unsupported operation")]
+    Unimplemented,
+
+    /// Not enough data in the input buffer
+    #[error("Not enough data in the input buffer")]
+    NotEnoughData,
+
+    /// Output buffer too small
+    #[error("Output buffer too small")]
+    OutputBufferTooSmall,
+
+    /// Invalid input length
+    #[error("Invalid input length {0}")]
+    InvalidInputLength(usize),
+
+    /// Error propagated from the C++ `FastPFOR` library
+    #[cfg(feature = "cpp")]
+    #[error("C++ exception: {0}")]
+    CppError(#[from] cxx::Exception),
+
+    /// Expected element count exceeds maximum allowed (possible corrupt or untrusted input)
+    #[error("Expected element count {expected} exceeds maximum {max}")]
+    ExpectedCountExceedsMax {
+        /// The expected count provided by the caller
+        expected: usize,
+        /// The maximum allowed based on input size
+        max: usize,
+    },
+
+    /// Decoded element count did not match the expected count
+    #[error("Decoded {actual} elements, expected {expected}")]
+    DecodedCountMismatch {
+        /// Number of elements actually decoded
+        actual: usize,
+        /// Expected count provided by the caller
+        expected: usize,
+    },
+}
diff --git a/src/helpers.rs b/src/helpers.rs
new file mode 100644
index 0000000..8db9446
--- /dev/null
+++ b/src/helpers.rs
@@ -0,0 +1,89 @@
+use crate::FastPForError;
+
+/// Finds the greatest multiple of `factor` that is less than or equal to `value`.
+#[cfg_attr(feature = "cpp", allow(dead_code))]
+pub fn greatest_multiple(value: u32, factor: u32) -> u32 {
+    value - value % factor
+}
+
+/// Returns the number of bits needed to represent `i`.
+/// Returns 0 for input 0.
+#[cfg_attr(feature = "cpp", allow(dead_code))]
+pub fn bits(i: u32) -> usize {
+    32 - i.leading_zeros().as_usize()
+}
+
+pub trait AsUsize: Eq + Copy {
+    fn as_usize(self) -> usize;
+
+    #[inline]
+    #[cfg(feature = "cpp")]
+    fn is_decoded_mismatch(self, expected: impl AsUsize) -> Result<(), FastPForError> {
+        let actual = self.as_usize();
+        let expected = expected.as_usize();
+        if self.as_usize() == expected {
+            Ok(())
+        } else {
+            Err(FastPForError::DecodedCountMismatch { actual, expected })
+        }
+    }
+
+    /// Returns an error if `expected` exceeds `max`.
+    #[inline]
+    #[cfg(feature = "cpp")]
+    fn is_valid_expected(self, max: impl AsUsize) -> Result<usize, FastPForError> {
+        let expected = self.as_usize();
+        let max = max.as_usize();
+        if expected > max {
+            Err(FastPForError::ExpectedCountExceedsMax { expected, max })
+        } else {
+            Ok(expected)
+        }
+    }
+}
+
+impl AsUsize for usize {
+    #[inline]
+    fn as_usize(self) -> usize {
+        self
+    }
+}
+
+impl AsUsize for u32 {
+    #[inline]
+    fn as_usize(self) -> usize {
+        const _: () = {
+            // Some day Rust may support usize smaller than u32?
+            assert!(
+                size_of::<u32>() <= size_of::<usize>(),
+                "usize must be able to hold all u32 values"
+            );
+        };
+
+        #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
+        {
+            self as usize
+        }
+    }
+}
+
+#[cfg_attr(feature = "cpp", allow(dead_code))]
+pub trait GetWithErr<T> {
+    fn get_val(&self, pos: impl AsUsize) -> Result<T, FastPForError>;
+}
+
+impl<T: Copy> GetWithErr<T> for &[T] {
+    #[inline]
+    fn get_val(&self, pos: impl AsUsize) -> Result<T, FastPForError> {
+        self.get(pos.as_usize())
+            .copied()
+            .ok_or(FastPForError::NotEnoughData)
+    }
+}
+
+impl<T: Copy> GetWithErr<T> for Vec<T> {
+    #[inline]
+    fn get_val(&self, pos: impl AsUsize) -> Result<T, FastPForError> {
+        self.as_slice().get_val(pos)
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index c4b1904..a0cd734 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,6 +5,10 @@
 #[cfg(not(any(feature = "cpp", feature = "rust",)))]
 compile_error!("At least one of the features 'cpp' or 'rust' must be enabled");
 
+// Error types are always available regardless of which codec features are enabled.
+mod error;
+pub use error::{FastPForError, FastPForResult};
+
 // FIXME: need decide on the external API. Some ideas:
 //  - offer two sets of similar APIs - rust and cpp ffi
 //  - it will be possible to enable/disable each with a feature flag
@@ -19,6 +23,13 @@ pub mod cpp;
 /// Rust re-implementation of `FastPFor` (work in progress)
 pub mod rust;
 
+mod codec;
+#[cfg(feature = "cpp")]
+pub use codec::BlockCodec64;
+pub use codec::{AnyLenCodec, BlockCodec, slice_to_blocks};
+
+pub(crate) mod helpers;
+
 /// Low-level compression interface using caller-provided buffers.
 ///
 /// Codecs write into pre-allocated slices and return a sub-slice showing exactly
diff --git a/src/rust/error.rs b/src/rust/error.rs
deleted file mode 100644
index 23964ed..0000000
--- a/src/rust/error.rs
+++ /dev/null
@@ -1,25 +0,0 @@
-use thiserror::Error;
-
-/// Alias for the result type of `FastPFor` operations.
-pub type FastPForResult<T> = Result<T, FastPForError>;
-
-/// Errors that can occur when using the `FastPFor` codecs.
-#[non_exhaustive]
-#[derive(Error, Debug)]
-pub enum FastPForError {
-    /// Unsupported operation
-    #[error("Unsupported operation")]
-    Unimplemented,
-
-    /// Not enough data in the input buffer
-    #[error("Not enough data in the input buffer")]
-    NotEnoughData,
-
-    /// Output buffer too small
-    #[error("Output buffer too small")]
-    OutputBufferTooSmall,
-
-    /// Invalid input length
-    #[error("Invalid input length {0}")]
-    InvalidInputLength(usize),
-}
diff --git a/src/rust/integer_compression/bitpacking.rs b/src/rust/integer_compression/bitpacking.rs
index 919b020..d911113 100644
--- a/src/rust/integer_compression/bitpacking.rs
+++ b/src/rust/integer_compression/bitpacking.rs
@@ -1299,3 +1299,59 @@ fn fast_pack31(input: &[u32], inpos: usize, output: &mut [u32], outpos: usize) {
 fn fast_pack32(input: &[u32], inpos: usize, output: &mut [u32], outpos: usize) {
     output[outpos..outpos + 32].copy_from_slice(&input[inpos..inpos + 32]);
 }
+
+#[cfg(test)]
+mod tests {
+    use rand::RngExt as _;
+
+    use super::fast_pack;
+    use crate::rust::integer_compression::bitunpacking::fast_unpack;
+
+    #[test]
+    fn pack_unpack_roundtrip() {
+        let n = 32;
+        let times = 1000;
+        let mut r = rand::rng();
+        let mut data = vec![0u32; n];
+        let mut compressed = vec![0u32; n];
+        let mut uncompressed = vec![0u32; n];
+
+        for bit in 0..31u8 {
+            for _ in 0..times {
+                for value in &mut data {
+                    *value = r.random_range(0..(1 << bit));
+                }
+                fast_pack(&data, 0, &mut compressed, 0, bit);
+                fast_unpack(&compressed, 0, &mut uncompressed, 0, bit);
+                assert_eq!(uncompressed, data, "Mismatch for bit {bit}");
+            }
+        }
+    }
+
+    #[test]
+    fn pack_unpack_with_masking() {
+        const N: usize = 32;
+        const TIMES: usize = 1000;
+        let mut rng = rand::rng();
+        let mut data = vec![0u32; N];
+        let mut compressed = vec![0u32; N];
+        let mut uncompressed = vec![0u32; N];
+
+        for bit in 0..31u8 {
+            for _ in 0..TIMES {
+                for value in &mut data {
+                    *value = rng.random();
+                }
+                fast_pack(&data, 0, &mut compressed, 0, bit);
+                fast_unpack(&compressed, 0, &mut uncompressed, 0, bit);
+                for value in &mut data {
+                    *value &= (1 << bit) - 1;
+                }
+                assert_eq!(
+                    data, uncompressed,
+                    "Data does not match uncompressed output"
+                );
+            }
+        }
+    }
+}
diff --git a/src/rust/integer_compression/fastpfor.rs b/src/rust/integer_compression/fastpfor.rs
index 1737c3e..fbf55e8 100644
--- a/src/rust/integer_compression/fastpfor.rs
+++ b/src/rust/integer_compression/fastpfor.rs
@@ -4,10 +4,11 @@ use std::num::NonZeroU32;
 
 use bytes::{Buf as _, BufMut as _, BytesMut};
 
+use crate::helpers::{GetWithErr, bits, greatest_multiple};
 use crate::rust::cursor::IncrementCursor;
-use crate::rust::integer_compression::helpers::GetWithErr;
-use crate::rust::integer_compression::{bitpacking, bitunpacking, helpers};
-use crate::rust::{FastPForError, FastPForResult, Integer, Skippable};
+use crate::rust::integer_compression::{bitpacking, bitunpacking};
+use crate::rust::{Integer, Skippable};
+use crate::{FastPForError, FastPForResult};
 
 /// Block size constant for 256 integers per block
 pub const BLOCK_SIZE_256: NonZeroU32 = NonZeroU32::new(256).unwrap();
@@ -61,7 +62,7 @@ impl Skippable for FastPFOR {
         output: &mut [u32],
         output_offset: &mut Cursor<u32>,
     ) -> FastPForResult<()> {
-        let inlength = helpers::greatest_multiple(input_length, self.block_size);
+        let inlength = greatest_multiple(input_length, self.block_size);
         let final_inpos = input_offset.position() as u32 + inlength;
         while input_offset.position() as u32 != final_inpos {
             let this_size =
@@ -85,7 +86,7 @@ impl Skippable for FastPFOR {
             // Return early if there is no data to uncompress and block size is 128
             return Ok(());
         }
-        let mynvalue = helpers::greatest_multiple(inlength, self.block_size);
+        let mynvalue = greatest_multiple(inlength, self.block_size);
         let final_out = output_offset.position() as u32 + mynvalue;
         while output_offset.position() as u32 != final_out {
             let this_size =
@@ -105,7 +106,7 @@ impl Integer<u32> for FastPFOR {
         output: &mut [u32],
         output_offset: &mut Cursor<u32>,
     ) -> FastPForResult<()> {
-        let inlength = helpers::greatest_multiple(input_length, self.block_size);
+        let inlength = greatest_multiple(input_length, self.block_size);
         if inlength == 0 {
             // Return early if there is no data to compress
             return Ok(());
@@ -290,7 +291,7 @@ impl FastPFOR {
         self.freqs.fill(0);
         let k_end = std::cmp::min(pos + self.block_size, input.len() as u32);
         for k in pos..k_end {
-            self.freqs[helpers::bits(input[k as usize])] += 1;
+            self.freqs[bits(input[k as usize])] += 1;
         }
 
         self.optimal_bits = 32;
diff --git a/src/rust/integer_compression/helpers.rs b/src/rust/integer_compression/helpers.rs
deleted file mode 100644
index 13a4b32..0000000
--- a/src/rust/integer_compression/helpers.rs
+++ /dev/null
@@ -1,61 +0,0 @@
-use crate::rust::{FastPForError, FastPForResult};
-
-/// Finds the greatest multiple of `factor` that is less than or equal to `value`.
-pub fn greatest_multiple(value: u32, factor: u32) -> u32 {
-    value - value % factor
-}
-
-/// Returns the number of bits needed to represent `i`.
-/// Returns 0 for input 0.
-pub fn bits(i: u32) -> usize {
-    32 - i.leading_zeros() as usize
-}
-
-pub trait AsUsize: Eq + Copy {
-    fn as_usize(self) -> usize;
-}
-
-impl AsUsize for usize {
-    #[inline]
-    fn as_usize(self) -> usize {
-        self
-    }
-}
-
-impl AsUsize for u32 {
-    #[inline]
-    fn as_usize(self) -> usize {
-        const _: () = {
-            // Some day Rust may support usize smaller than u32?
-            assert!(
-                size_of::<u32>() <= size_of::<usize>(),
-                "usize must be able to hold all u32 values"
-            );
-        };
-
-        #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
-        {
-            self as usize
-        }
-    }
-}
-
-pub trait GetWithErr<T> {
-    fn get_val(&self, pos: impl AsUsize) -> FastPForResult<T>;
-}
-
-impl<T: Copy> GetWithErr<T> for &[T] {
-    #[inline]
-    fn get_val(&self, pos: impl AsUsize) -> FastPForResult<T> {
-        self.get(pos.as_usize())
-            .copied()
-            .ok_or(FastPForError::NotEnoughData)
-    }
-}
-
-impl<T: Copy> GetWithErr<T> for Vec<T> {
-    #[inline]
-    fn get_val(&self, pos: impl AsUsize) -> FastPForResult<T> {
-        self.as_slice().get_val(pos)
-    }
-}
diff --git a/src/rust/integer_compression/mod.rs b/src/rust/integer_compression/mod.rs
index 7a89eba..26bc651 100644
--- a/src/rust/integer_compression/mod.rs
+++ b/src/rust/integer_compression/mod.rs
@@ -4,7 +4,6 @@ pub mod codec;
 pub mod composition;
 pub mod differential;
 pub mod fastpfor;
-pub mod helpers;
 pub mod integer_codec;
 pub mod just_copy;
 pub mod skippable_codec;
diff --git a/src/rust/integer_compression/variable_byte.rs b/src/rust/integer_compression/variable_byte.rs
index 352eb29..26e7277 100644
--- a/src/rust/integer_compression/variable_byte.rs
+++ b/src/rust/integer_compression/variable_byte.rs
@@ -2,8 +2,8 @@ use std::io::Cursor;
 
 use bytemuck::{cast_slice, cast_slice_mut};
 
+use crate::helpers::AsUsize;
 use crate::rust::cursor::IncrementCursor;
-use crate::rust::integer_compression::helpers::AsUsize;
 use crate::rust::{FastPForError, FastPForResult, Integer, Skippable};
 
 /// Variable-byte encoding codec for integer compression.
diff --git a/src/rust/mod.rs b/src/rust/mod.rs
index 5f5bb43..24493ae 100644
--- a/src/rust/mod.rs
+++ b/src/rust/mod.rs
@@ -1,9 +1,7 @@
 mod cursor;
-mod error;
 mod integer_compression;
 
 pub use cursor::IncrementCursor;
-pub use error::{FastPForError, FastPForResult};
 pub use integer_compression::bitpacking::fast_pack;
 pub use integer_compression::bitunpacking::fast_unpack;
 pub use integer_compression::codec::Codec;
@@ -16,3 +14,5 @@ pub use integer_compression::integer_codec::Integer;
 pub use integer_compression::just_copy::JustCopy;
 pub use integer_compression::skippable_codec::Skippable;
 pub use integer_compression::variable_byte::VariableByte;
+
+pub use crate::{FastPForError, FastPForResult};
diff --git a/tests/benchmark_smoke.rs b/tests/benchmark_smoke.rs
index f931ed8..68932d7 100644
--- a/tests/benchmark_smoke.rs
+++ b/tests/benchmark_smoke.rs
@@ -14,6 +14,8 @@ use bench_utils::{
 };
 #[cfg(feature = "cpp")]
 use bench_utils::{cpp_decode, cpp_decode_fixtures};
+#[cfg(feature = "cpp")]
+use fastpfor::cpp;
 
 const SMOKE_SIZE: usize = 256;
 
@@ -108,13 +110,11 @@ fn smoke_compression_ratio() {
 #[cfg(feature = "cpp")]
 #[test]
 fn smoke_cpp_vs_rust() {
-    use fastpfor::cpp::FastPFor128Codec;
-
     for (_, fix) in cpp_decode_fixtures(&[SMOKE_SIZE]) {
         // C++ decode
-        let codec = FastPFor128Codec::new();
+        let mut codec = cpp::FastPFor128Codec::new();
         let mut cpp_out = vec![0u32; fix.original_len];
-        let n = cpp_decode(&codec, &fix.cpp_compressed, &mut cpp_out);
+        let n = cpp_decode(&mut codec, &fix.cpp_compressed, &mut cpp_out);
         assert_eq!(
             n, fix.original_len,
             "{}: C++ decoded wrong element count",
diff --git a/tests/cpp_compat_tests.rs b/tests/cpp_compat_tests.rs
index ea8ab5a..a01b894 100644
--- a/tests/cpp_compat_tests.rs
+++ b/tests/cpp_compat_tests.rs
@@ -4,32 +4,32 @@
 
 use std::io::Cursor;
 
-use fastpfor::cpp::Codec32 as _;
 use fastpfor::rust::Integer as _;
-use fastpfor::{cpp, rust};
+use fastpfor::{AnyLenCodec as _, cpp, rust};
 
 mod common;
 use common::{get_test_cases, test_input_sizes};
 
 #[test]
 fn test_rust_decompresses_cpp_encoded_data() {
-    let codec_cpp = cpp::FastPFor128Codec::new();
+    let mut codec_cpp = cpp::FastPFor128Codec::new();
     let mut codec_rs = rust::FastPFOR::new(rust::DEFAULT_PAGE_SIZE, rust::BLOCK_SIZE_128);
 
     for n in test_input_sizes() {
         for input in get_test_cases(n + rust::BLOCK_SIZE_128.get() as usize) {
-            // Buffer for the C++ encoded
-            let mut compressed_buffer = vec![0; input.len()];
+            let mut compressed_buffer = Vec::new();
+            codec_cpp.encode(&input, &mut compressed_buffer).unwrap();
+            let compressed_len = compressed_buffer.len();
 
-            // C++ encoding
-            let encoded_cpp = codec_cpp.encode32(&input, &mut compressed_buffer).unwrap();
-            let compressed_len = encoded_cpp.len();
-
-            // C++ decoding
-            let mut decoded_by_cpp = vec![0; input.len()];
-            let decoded_cpp = codec_cpp
-                .decode32(encoded_cpp, &mut decoded_by_cpp)
+            let mut decoded_by_cpp = Vec::new();
+            codec_cpp
+                .decode(
+                    &compressed_buffer,
+                    &mut decoded_by_cpp,
+                    Some(input.len() as u32),
+                )
                 .unwrap();
+            let decoded_cpp = decoded_by_cpp.as_slice();
 
             // Rust decoding
             let mut input_offset = Cursor::new(0u32);
@@ -49,24 +49,25 @@ fn test_rust_decompresses_cpp_encoded_data() {
                 decoded_by_rust.len(),
                 "Mismatched output lengths"
             );
-            assert_eq!(decoded_cpp, decoded_by_rust);
+            assert_eq!(decoded_cpp, decoded_by_rust.as_slice());
         }
     }
 }
 
 #[test]
 fn test_rust_and_cpp_fastpfor32_compression_matches() {
-    let codec_cpp = cpp::FastPFor128Codec::new();
+    let mut codec_cpp = cpp::FastPFor128Codec::new();
     let mut codec_rs = rust::FastPFOR::new(rust::DEFAULT_PAGE_SIZE, rust::BLOCK_SIZE_128);
 
     for n in test_input_sizes() {
         for input in get_test_cases(n + rust::BLOCK_SIZE_128.get() as usize) {
-            // Buffer for the C++ encoded
-            let mut compressed_buffer = vec![0; input.len()];
+            // Rust `FastPFOR::compress` is a no-op for length 0; C++ still writes a stream header.
+            if input.is_empty() {
+                continue;
+            }
 
-            // C++ encoding
-            let encoded_cpp = codec_cpp.encode32(&input, &mut compressed_buffer).unwrap();
-            let compressed_len = encoded_cpp.len();
+            let mut compressed_buffer = Vec::new();
+            codec_cpp.encode(&input, &mut compressed_buffer).unwrap();
 
             // Rust encoding
             let mut input_offset_rs = Cursor::new(0u32);
@@ -82,10 +83,13 @@ fn test_rust_and_cpp_fastpfor32_compression_matches() {
                 )
                 .unwrap();
 
+            let compressed_len_rs = output_offset_rs.position() as usize;
             assert_eq!(
-                &encoded_cpp[..compressed_len],
-                &encoded_rs[..compressed_len]
+                compressed_buffer.len(),
+                compressed_len_rs,
+                "C++ vs Rust compressed length mismatch"
             );
+            assert_eq!(&compressed_buffer, &encoded_rs[..compressed_len_rs]);
         }
     }
 }