From 678f6574574aaa8a3ea2ee45d6c9c059e4d9bf81 Mon Sep 17 00:00:00 2001
From: Farhan Syah
Date: Wed, 26 Nov 2025 20:58:47 +0800
Subject: [PATCH 1/4] fix: update .version to 0.4.0

---
 .version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.version b/.version
index 0d91a54..1d0ba9e 100644
--- a/.version
+++ b/.version
@@ -1 +1 @@
-0.3.0
+0.4.0

From 8035108187b6903e472c5d488fe850ca7b55ad36 Mon Sep 17 00:00:00 2001
From: Farhan Syah
Date: Wed, 26 Nov 2025 21:07:40 +0800
Subject: [PATCH 2/4] feat: add batch decoding methods and bump version to 0.5.0

Add batch decoding methods (decode_batch and decode_batch_lossy) to both
Rust core tokenizer and Python bindings for parallel processing of
multiple token lists.

---
 .version               |  2 +-
 Cargo.toml             |  2 +-
 pyproject.toml         |  2 +-
 src/core/tokenizer.rs  | 27 +++++++++++++++++++++++++++
 src/python/bindings.rs | 29 +++++++++++++++++++++++++++++
 uv.lock                |  2 +-
 6 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/.version b/.version
index 1d0ba9e..8f0916f 100644
--- a/.version
+++ b/.version
@@ -1 +1 @@
-0.4.0
+0.5.0
diff --git a/Cargo.toml b/Cargo.toml
index 9ef7d9e..6a0cd1d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "splintr"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2021"
 description = "Fast Rust BPE tokenizer with Python bindings"
 license = "MIT"
diff --git a/pyproject.toml b/pyproject.toml
index f524a6b..97be6bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "splintr-rs"
-version = "0.4.0"
+version = "0.5.0"
 description = "Fast Rust BPE tokenizer with Python bindings"
 readme = "README.md"
 license = {text = "MIT"}
diff --git a/src/core/tokenizer.rs b/src/core/tokenizer.rs
index ed8d356..d5443a6 100644
--- a/src/core/tokenizer.rs
+++ b/src/core/tokenizer.rs
@@ -898,6 +898,33 @@ impl Tokenizer {
             .collect()
     }
 
+    /// Batch decode multiple token lists in parallel.
+    ///
+    /// Uses Rayon to parallelize decoding across token lists.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// let token_lists = vec![vec![1, 2, 3], vec![4, 5, 6]];
+    /// let texts = tokenizer.decode_batch(&token_lists)?;
+    /// ```
+    pub fn decode_batch(&self, token_lists: &[Vec<u32>]) -> Result<Vec<String>, TokenizerError> {
+        token_lists
+            .par_iter()
+            .map(|tokens| self.decode(tokens))
+            .collect()
+    }
+
+    /// Batch decode multiple token lists in parallel, replacing invalid UTF-8.
+    ///
+    /// Like [`decode_batch`], but uses lossy UTF-8 conversion.
+    pub fn decode_batch_lossy(&self, token_lists: &[Vec<u32>]) -> Vec<String> {
+        token_lists
+            .par_iter()
+            .map(|tokens| self.decode_lossy(tokens))
+            .collect()
+    }
+
     /// Get the vocabulary size (number of tokens).
     pub fn vocab_size(&self) -> usize {
         self.encoder.len() + self.special_tokens.len()
diff --git a/src/python/bindings.rs b/src/python/bindings.rs
index e9c4c7f..648c80f 100644
--- a/src/python/bindings.rs
+++ b/src/python/bindings.rs
@@ -709,6 +709,35 @@ impl PyTokenizer {
         self.inner.encode_batch_with_special(&texts)
     }
 
+    /// Batch decode multiple token lists in parallel.
+    ///
+    /// Uses Rayon to parallelize decoding across token lists.
+    ///
+    /// Args:
+    ///     token_lists: List of token ID lists
+    ///
+    /// Returns:
+    ///     List of decoded strings
+    ///
+    /// Raises:
+    ///     ValueError: If any decoded bytes are not valid UTF-8
+    fn decode_batch(&self, token_lists: Vec<Vec<u32>>) -> PyResult<Vec<String>> {
+        self.inner
+            .decode_batch(&token_lists)
+            .map_err(|e| PyValueError::new_err(e.to_string()))
+    }
+
+    /// Batch decode multiple token lists, replacing invalid UTF-8.
+    ///
+    /// Args:
+    ///     token_lists: List of token ID lists
+    ///
+    /// Returns:
+    ///     List of decoded strings with replacement characters for invalid UTF-8
+    fn decode_batch_lossy(&self, token_lists: Vec<Vec<u32>>) -> Vec<String> {
+        self.inner.decode_batch_lossy(&token_lists)
+    }
+
     /// Get the vocabulary size (including special tokens).
     #[getter]
     fn vocab_size(&self) -> usize {
diff --git a/uv.lock b/uv.lock
index aa14eb2..6c3757c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3,5 +3,5 @@ requires-python = ">=3.8"
 
 [[package]]
 name = "splintr-rs"
-version = "0.4.0"
+version = "0.5.0"
 source = { editable = "." }

From 1c8a7cdc7f032189ee70899177ed7685b410938e Mon Sep 17 00:00:00 2001
From: Farhan Syah
Date: Wed, 26 Nov 2025 21:13:27 +0800
Subject: [PATCH 3/4] feat: add comprehensive benchmark scripts for tokenizers

Add benchmark suites for cl100k_base, Llama 3, and o200k_base
tokenizers. Each script compares splintr performance against reference
implementations (tiktoken, HuggingFace) across single/batch encoding
and decoding operations.

- benchmark_cl100k.py: GPT-4/GPT-3.5-turbo tokenizer benchmarks
- benchmark_llama3.py: Llama 3 family tokenizer benchmarks
- benchmark_o200k.py: GPT-4o tokenizer benchmarks

Benchmarks measure throughput (MB/s, tokens/s) and latency across
various text types (short, medium, long, code, multilingual) with
visualization support via matplotlib charts.

---
 benchmarks/vocabs/benchmark_cl100k.py | 523 +++++++++++++++++++++++
 benchmarks/vocabs/benchmark_llama3.py | 590 ++++++++++++++++++++++++++
 benchmarks/vocabs/benchmark_o200k.py  | 523 +++++++++++++++++++++++
 3 files changed, 1636 insertions(+)
 create mode 100644 benchmarks/vocabs/benchmark_cl100k.py
 create mode 100644 benchmarks/vocabs/benchmark_llama3.py
 create mode 100644 benchmarks/vocabs/benchmark_o200k.py

diff --git a/benchmarks/vocabs/benchmark_cl100k.py b/benchmarks/vocabs/benchmark_cl100k.py
new file mode 100644
index 0000000..918f3fc
--- /dev/null
+++ b/benchmarks/vocabs/benchmark_cl100k.py
@@ -0,0 +1,523 @@
+#!/usr/bin/env python3
+"""
+Benchmark comparison for cl100k_base tokenizers: splintr vs tiktoken
+
+cl100k_base is used by GPT-4 and GPT-3.5-turbo.
+
+Usage:
+    pip install tiktoken matplotlib numpy
+    python benchmarks/vocabs/benchmark_cl100k.py
+"""
+
+import gc
+import json
+import statistics
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable
+
+# Sample texts for benchmarking
+SAMPLE_TEXTS = {
+    "short": "Hello, world! This is a test.",
+    "medium": """The quick brown fox jumps over the lazy dog.
+    Machine learning models require tokenization to process text efficiently.
+    Tokenizers convert text into numerical representations that models can understand."""
+    * 10,
+    "long": """Artificial intelligence and machine learning have revolutionized
+    the way we process and understand natural language. Large language models (LLMs)
+    like GPT-4, Claude, and others rely heavily on efficient tokenization to handle
+    vast amounts of text data. The performance of tokenizers directly impacts the
+    overall throughput of these systems, making optimization crucial for production
+    deployments. 
BPE (Byte Pair Encoding) has become the de facto standard for + modern tokenizers due to its balance of vocabulary efficiency and handling of + out-of-vocabulary words.""" + * 50, + "code": """ +def fibonacci(n: int) -> int: + \"\"\"Calculate the nth Fibonacci number.\"\"\" + if n <= 1: + return n + return fibonacci(n - 1) + fibonacci(n - 2) + +class TokenizerBenchmark: + def __init__(self, name: str): + self.name = name + self.results = [] + + def run(self, text: str, iterations: int = 100): + for _ in range(iterations): + tokens = self.encode(text) + self.results.append(len(tokens)) +""" + * 20, + "multilingual": """ + English: The quick brown fox jumps over the lazy dog. + 中文: 快速的棕色狐狸跳过懒狗。 + 日本語: 素早い茶色の狐が怠惰な犬を飛び越える。 + 한국어: 빠른 갈색 여우가 게으른 개를 뛰어넘습니다. + العربية: الثعلب البني السريع يقفز فوق الكلب الكسول. + Русский: Быстрая коричневая лиса прыгает через ленивую собаку. + """ + * 20, +} + + +@dataclass +class BenchmarkResult: + name: str + text_type: str + benchmark_type: str + bytes_per_second: float + tokens_per_second: float + num_tokens: int + num_bytes: int + latency_ms: float + + +def benchmark_encode( + name: str, + encode_fn: Callable[[str], list], + text: str, + text_type: str, + warmup: int = 3, + iterations: int = 10, +) -> BenchmarkResult: + """Benchmark single text encoding.""" + num_bytes = len(text.encode("utf-8")) + + for _ in range(warmup): + encode_fn(text) + + gc.collect() + + times = [] + num_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + tokens = encode_fn(text) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + num_tokens = len(tokens) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = num_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="single_encode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=num_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_batch_encode( + name: str, + encode_batch_fn: Callable[[list[str]], list], + texts: list[str], + text_type: str, + warmup: int = 2, + iterations: int = 5, +) -> BenchmarkResult: + """Benchmark batch encoding.""" + num_bytes = sum(len(t.encode("utf-8")) for t in texts) + + for _ in range(warmup): + encode_batch_fn(texts) + + gc.collect() + + times = [] + total_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + results = encode_batch_fn(texts) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + total_tokens = sum(len(r) for r in results) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = total_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="batch_encode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=total_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_decode( + name: str, + decode_fn: Callable[[list[int]], str], + tokens: list[int], + text_type: str, + original_bytes: int, + warmup: int = 3, + iterations: int = 10, +) -> BenchmarkResult: + """Benchmark single text decoding.""" + num_tokens = len(tokens) + + for _ in range(warmup): + decode_fn(tokens) + + gc.collect() + + times = [] + num_bytes = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + text = decode_fn(tokens) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + num_bytes = 
len(text.encode("utf-8")) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = num_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="single_decode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=num_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_batch_decode( + name: str, + decode_batch_fn: Callable[[list[list[int]]], list[str]], + token_lists: list[list[int]], + text_type: str, + warmup: int = 2, + iterations: int = 5, +) -> BenchmarkResult: + """Benchmark batch decoding.""" + total_tokens = sum(len(t) for t in token_lists) + + for _ in range(warmup): + decode_batch_fn(token_lists) + + gc.collect() + + times = [] + total_bytes = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + results = decode_batch_fn(token_lists) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + total_bytes = sum(len(r.encode("utf-8")) for r in results) + + avg_time = statistics.mean(times) + bytes_per_second = total_bytes / avg_time + tokens_per_second = total_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="batch_decode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=total_tokens, + num_bytes=total_bytes, + latency_ms=avg_time * 1000, + ) + + +def load_tokenizers() -> dict: + """Load all available cl100k_base tokenizers.""" + tokenizers = {} + + # splintr + try: + import splintr + + enc = splintr.Tokenizer.from_pretrained("cl100k_base") + + tokenizers["splintr"] = { + "encode": enc.encode, + "encode_batch": enc.encode_batch, + "decode": enc.decode, + "decode_batch": enc.decode_batch, + "color": "#2ecc71", # Green + } + print("Loaded: splintr (cl100k_base)") + except ImportError: + print("splintr not available - run: maturin develop --release") + + # tiktoken + try: + import tiktoken + + tik_enc = tiktoken.get_encoding("cl100k_base") + + def tik_encode_batch(texts): + return tik_enc.encode_ordinary_batch(texts) + + tokenizers["tiktoken"] = { + "encode": tik_enc.encode, + "encode_batch": tik_encode_batch, + "decode": tik_enc.decode, + "decode_batch": tik_enc.decode_batch, + "color": "#3498db", # Blue + } + print("Loaded: tiktoken (cl100k_base)") + except ImportError: + print("tiktoken not available - run: pip install tiktoken") + + return tokenizers + + +def run_benchmarks(tokenizers: dict, text_types: list[str] | None = None) -> list[BenchmarkResult]: + """Run all benchmarks.""" + if text_types is None: + text_types = list(SAMPLE_TEXTS.keys()) + + results = [] + + # Single text encoding benchmarks + print("\n" + "=" * 70) + print("SINGLE TEXT ENCODING BENCHMARKS") + print("=" * 70) + + for text_type in text_types: + text = SAMPLE_TEXTS[text_type] + num_bytes = len(text.encode("utf-8")) + print(f"\n--- {text_type.upper()} ({num_bytes:,} bytes) ---") + + for name, tok in tokenizers.items(): + result = benchmark_encode(name, tok["encode"], text, text_type) + results.append(result) + print( + f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s " + f"{result.tokens_per_second / 1e3:8.2f} Ktok/s " + f"{result.latency_ms:8.3f} ms" + ) + + # Batch encoding benchmarks + print("\n" + "=" * 70) + print("BATCH ENCODING BENCHMARKS (100 texts)") + print("=" * 70) + + for text_type in ["medium", "long"]: + texts = [SAMPLE_TEXTS[text_type]] * 100 + total_bytes = sum(len(t.encode("utf-8")) for t in texts) + print(f"\n--- 
{text_type.upper()} x100 ({total_bytes:,} bytes total) ---")
+
+        for name, tok in tokenizers.items():
+            result = benchmark_batch_encode(
+                name, tok["encode_batch"], texts, f"{text_type}_batch"
+            )
+            results.append(result)
+            print(
+                f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s "
+                f"{result.tokens_per_second / 1e3:8.2f} Ktok/s "
+                f"{result.latency_ms:8.3f} ms"
+            )
+
+    # Single text decoding benchmarks
+    print("\n" + "=" * 70)
+    print("SINGLE TEXT DECODING BENCHMARKS")
+    print("=" * 70)
+
+    for text_type in text_types:
+        text = SAMPLE_TEXTS[text_type]
+        num_bytes = len(text.encode("utf-8"))
+
+        for i, (name, tok) in enumerate(tokenizers.items()):
+            tokens = tok["encode"](text)
+            if i == 0:
+                print(f"\n--- {text_type.upper()} ({len(tokens):,} tokens) ---")
+            result = benchmark_decode(name, tok["decode"], tokens, text_type, num_bytes)
+            results.append(result)
+            print(
+                f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s "
+                f"{result.tokens_per_second / 1e3:8.2f} Ktok/s "
+                f"{result.latency_ms:8.3f} ms"
+            )
+
+    # Batch decoding benchmarks
+    print("\n" + "=" * 70)
+    print("BATCH DECODING BENCHMARKS (100 texts)")
+    print("=" * 70)
+
+    for text_type in ["medium", "long"]:
+        texts = [SAMPLE_TEXTS[text_type]] * 100
+        print(f"\n--- {text_type.upper()} x100 ---")
+
+        for name, tok in tokenizers.items():
+            token_lists = tok["encode_batch"](texts)
+            result = benchmark_batch_decode(
+                name, tok["decode_batch"], token_lists, f"{text_type}_batch"
+            )
+            results.append(result)
+            print(
+                f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s "
+                f"{result.tokens_per_second / 1e3:8.2f} Ktok/s "
+                f"{result.latency_ms:8.3f} ms"
+            )
+
+    return results
+
+
+def generate_chart(
+    results: list[BenchmarkResult],
+    tokenizers: dict,
+    benchmark_type: str,
+    title: str,
+    output_path: str,
+):
+    """Generate a bar chart for a specific benchmark type."""
+    try:
+        import matplotlib.pyplot as plt
+        import numpy as np
+    except ImportError:
+        print("matplotlib/numpy not available - run: pip install matplotlib numpy")
+        return
+
+    filtered = [r for r in results if r.benchmark_type == benchmark_type]
+    if not filtered:
+        return
+
+    names = list(tokenizers.keys())
+    text_types = list(dict.fromkeys(r.text_type for r in filtered))
+
+    fig, ax = plt.subplots(figsize=(12, 6))
+
+    x = np.arange(len(text_types))
+    width = 0.8 / len(names)
+
+    for i, name in enumerate(names):
+        throughputs = []
+        for text_type in text_types:
+            for r in filtered:
+                if r.name == name and r.text_type == text_type:
+                    throughputs.append(r.bytes_per_second / 1e6)
+                    break
+            else:
+                throughputs.append(0)
+
+        ax.bar(
+            x + i * width - width * len(names) / 2 + width / 2,
+            throughputs,
+            width,
+            label=name,
+            color=tokenizers[name]["color"],
+        )
+
+    ax.set_xlabel("Text Type", fontsize=12)
+    ax.set_ylabel("Throughput (MB/s)", fontsize=12)
+    ax.set_title(title, fontsize=14, fontweight="bold")
+    ax.set_xticks(x)
+    ax.set_xticklabels([t.replace("_batch", " (batch)").capitalize() for t in text_types])
+    ax.legend()
+    ax.grid(axis="y", alpha=0.3)
+
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=150, bbox_inches="tight")
+    print(f"Chart saved to: {output_path}")
+    plt.close()
+
+
+def save_results_json(results: list[BenchmarkResult], output_path: str):
+    """Save benchmark results as JSON."""
+    data = [
+        {
+            "name": r.name,
+            "text_type": r.text_type,
+            "benchmark_type": r.benchmark_type,
+            "bytes_per_second": r.bytes_per_second,
+            "tokens_per_second": r.tokens_per_second,
+            "num_tokens": r.num_tokens,
+            "num_bytes": 
r.num_bytes, + "latency_ms": r.latency_ms, + } + for r in results + ] + with open(output_path, "w") as f: + json.dump(data, f, indent=2) + print(f"Results saved to: {output_path}") + + +def main(): + print("=" * 70) + print("CL100K_BASE TOKENIZER BENCHMARK COMPARISON") + print("splintr vs tiktoken (GPT-4, GPT-3.5-turbo)") + print("=" * 70) + + # Create output directory + output_dir = Path(__file__).parent.parent / "results" / "cl100k" + output_dir.mkdir(parents=True, exist_ok=True) + + # Load tokenizers + tokenizers = load_tokenizers() + + if len(tokenizers) < 2: + print("\nWarning: Less than 2 tokenizers available for comparison") + print("Install missing packages:") + print(" pip install tiktoken matplotlib numpy") + if "splintr" not in tokenizers: + print(" maturin develop --release # for splintr") + + if not tokenizers: + print("No tokenizers available!") + return + + # Run benchmarks + results = run_benchmarks(tokenizers) + + # Generate outputs + print("\n" + "=" * 70) + print("GENERATING OUTPUTS") + print("=" * 70) + + generate_chart( + results, + tokenizers, + "single_encode", + "cl100k_base Single Text Encoding Throughput", + str(output_dir / "single_encode.png"), + ) + generate_chart( + results, + tokenizers, + "batch_encode", + "cl100k_base Batch Encoding Throughput (100 texts)", + str(output_dir / "batch_encode.png"), + ) + generate_chart( + results, + tokenizers, + "single_decode", + "cl100k_base Single Text Decoding Throughput", + str(output_dir / "single_decode.png"), + ) + generate_chart( + results, + tokenizers, + "batch_decode", + "cl100k_base Batch Decoding Throughput (100 texts)", + str(output_dir / "batch_decode.png"), + ) + save_results_json(results, str(output_dir / "results.json")) + + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/vocabs/benchmark_llama3.py b/benchmarks/vocabs/benchmark_llama3.py new file mode 100644 index 0000000..2a1452b --- /dev/null +++ b/benchmarks/vocabs/benchmark_llama3.py @@ -0,0 +1,590 @@ +#!/usr/bin/env python3 +""" +Benchmark comparison for Llama 3 tokenizers: splintr vs tiktoken vs HuggingFace + +Generates performance charts comparing encoding/decoding throughput across different +Llama 3 tokenizer implementations. + +Usage: + pip install tiktoken tokenizers transformers matplotlib numpy + huggingface-cli login # For gated Llama 3 models (optional) + python benchmarks/vocabs/benchmark_llama3.py +""" + +import base64 +import gc +import json +import statistics +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Callable + +# Llama 3 regex pattern (same as o200k_base) +LLAMA3_PATTERN = r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" + +# Sample texts for benchmarking +SAMPLE_TEXTS = { + "short": "Hello, world! This is a test.", + "medium": """The quick brown fox jumps over the lazy dog. + Machine learning models require tokenization to process text efficiently. + Tokenizers convert text into numerical representations that models can understand.""" + * 10, + "long": """Artificial intelligence and machine learning have revolutionized + the way we process and understand natural language. Large language models (LLMs) + like GPT-4, Claude, and others rely heavily on efficient tokenization to handle + vast amounts of text data. 
The performance of tokenizers directly impacts the + overall throughput of these systems, making optimization crucial for production + deployments. BPE (Byte Pair Encoding) has become the de facto standard for + modern tokenizers due to its balance of vocabulary efficiency and handling of + out-of-vocabulary words.""" + * 50, + "code": """ +def fibonacci(n: int) -> int: + \"\"\"Calculate the nth Fibonacci number.\"\"\" + if n <= 1: + return n + return fibonacci(n - 1) + fibonacci(n - 2) + +class TokenizerBenchmark: + def __init__(self, name: str): + self.name = name + self.results = [] + + def run(self, text: str, iterations: int = 100): + for _ in range(iterations): + tokens = self.encode(text) + self.results.append(len(tokens)) +""" + * 20, + "multilingual": """ + English: The quick brown fox jumps over the lazy dog. + 中文: 快速的棕色狐狸跳过懒狗。 + 日本語: 素早い茶色の狐が怠惰な犬を飛び越える。 + 한국어: 빠른 갈색 여우가 게으른 개를 뛰어넘습니다. + العربية: الثعلب البني السريع يقفز فوق الكلب الكسول. + Русский: Быстрая коричневая лиса прыгает через ленивую собаку. + """ + * 20, +} + + +@dataclass +class BenchmarkResult: + name: str + text_type: str + benchmark_type: str # "single_encode", "batch_encode", "single_decode", "batch_decode" + bytes_per_second: float + tokens_per_second: float + num_tokens: int + num_bytes: int + latency_ms: float + + +def benchmark_encode( + name: str, + encode_fn: Callable[[str], list], + text: str, + text_type: str, + warmup: int = 3, + iterations: int = 10, +) -> BenchmarkResult: + """Benchmark single text encoding.""" + num_bytes = len(text.encode("utf-8")) + + # Warmup + for _ in range(warmup): + encode_fn(text) + + gc.collect() + + times = [] + num_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + tokens = encode_fn(text) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + num_tokens = len(tokens) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = num_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="single_encode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=num_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_batch_encode( + name: str, + encode_batch_fn: Callable[[list[str]], list], + texts: list[str], + text_type: str, + warmup: int = 2, + iterations: int = 5, +) -> BenchmarkResult: + """Benchmark batch encoding.""" + num_bytes = sum(len(t.encode("utf-8")) for t in texts) + + # Warmup + for _ in range(warmup): + encode_batch_fn(texts) + + gc.collect() + + times = [] + total_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + results = encode_batch_fn(texts) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + total_tokens = sum(len(r) for r in results) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = total_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="batch_encode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=total_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_decode( + name: str, + decode_fn: Callable[[list[int]], str], + tokens: list[int], + text_type: str, + original_bytes: int, + warmup: int = 3, + iterations: int = 10, +) -> BenchmarkResult: + """Benchmark single text decoding.""" + num_tokens = len(tokens) + + # Warmup + for _ in range(warmup): + 
decode_fn(tokens) + + gc.collect() + + times = [] + num_bytes = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + text = decode_fn(tokens) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + num_bytes = len(text.encode("utf-8")) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = num_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="single_decode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=num_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_batch_decode( + name: str, + decode_batch_fn: Callable[[list[list[int]]], list[str]], + token_lists: list[list[int]], + text_type: str, + warmup: int = 2, + iterations: int = 5, +) -> BenchmarkResult: + """Benchmark batch decoding.""" + total_tokens = sum(len(t) for t in token_lists) + + # Warmup + for _ in range(warmup): + decode_batch_fn(token_lists) + + gc.collect() + + times = [] + total_bytes = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + results = decode_batch_fn(token_lists) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + total_bytes = sum(len(r.encode("utf-8")) for r in results) + + avg_time = statistics.mean(times) + bytes_per_second = total_bytes / avg_time + tokens_per_second = total_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="batch_decode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=total_tokens, + num_bytes=total_bytes, + latency_ms=avg_time * 1000, + ) + + +def load_llama3_vocab() -> dict[bytes, int]: + """Load Llama 3 vocabulary from tiktoken file.""" + vocab_path = Path(__file__).parent.parent.parent / "python/splintr/vocabs/llama3.tiktoken" + mergeable_ranks = {} + with open(vocab_path, "rb") as f: + for line in f: + line = line.strip() + if not line: + continue + parts = line.rsplit(b" ", 1) + token_b64 = parts[0] + rank = int(parts[1]) + mergeable_ranks[base64.b64decode(token_b64)] = rank + return mergeable_ranks + + +def load_tokenizers() -> dict: + """Load all available Llama 3 tokenizers.""" + tokenizers = {} + + # splintr + try: + import splintr + + enc = splintr.Tokenizer.from_pretrained("llama3") + + tokenizers["splintr"] = { + "encode": enc.encode, + "encode_batch": enc.encode_batch, + "decode": enc.decode, + "decode_batch": enc.decode_batch, + "color": "#2ecc71", # Green + } + print("Loaded: splintr (llama3)") + except ImportError: + print("splintr not available - run: maturin develop --release") + + # tiktoken with custom Llama 3 vocab + try: + import tiktoken + + mergeable_ranks = load_llama3_vocab() + tik_enc = tiktoken.Encoding( + name="llama3", + pat_str=LLAMA3_PATTERN, + mergeable_ranks=mergeable_ranks, + special_tokens={}, + ) + + def tik_encode_batch(texts): + return [tik_enc.encode(t) for t in texts] + + tokenizers["tiktoken"] = { + "encode": tik_enc.encode, + "encode_batch": tik_encode_batch, + "decode": tik_enc.decode, + "decode_batch": tik_enc.decode_batch, + "color": "#3498db", # Blue + } + print("Loaded: tiktoken (llama3 custom)") + except ImportError: + print("tiktoken not available - run: pip install tiktoken") + except Exception as e: + print(f"tiktoken loading failed: {e}") + + # HuggingFace tokenizers + try: + from transformers import AutoTokenizer + + hf_enc = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", use_fast=True) + + def 
hf_encode(text): + return hf_enc.encode(text, add_special_tokens=False) + + def hf_encode_batch(texts): + return [hf_enc.encode(t, add_special_tokens=False) for t in texts] + + def hf_decode(tokens): + return hf_enc.decode(tokens) + + tokenizers["huggingface"] = { + "encode": hf_encode, + "encode_batch": hf_encode_batch, + "decode": hf_decode, + "decode_batch": hf_enc.batch_decode, # HuggingFace uses batch_decode + "color": "#e74c3c", # Red + } + print("Loaded: huggingface (meta-llama/Llama-3.2-1B)") + except ImportError: + print("HuggingFace transformers not available - run: pip install transformers") + except Exception as e: + print(f"HuggingFace loading failed (may need huggingface-cli login): {e}") + + return tokenizers + + +def run_benchmarks(tokenizers: dict, text_types: list[str] | None = None) -> list[BenchmarkResult]: + """Run all benchmarks.""" + if text_types is None: + text_types = list(SAMPLE_TEXTS.keys()) + + results = [] + + # Single text encoding benchmarks + print("\n" + "=" * 70) + print("SINGLE TEXT ENCODING BENCHMARKS") + print("=" * 70) + + for text_type in text_types: + text = SAMPLE_TEXTS[text_type] + num_bytes = len(text.encode("utf-8")) + print(f"\n--- {text_type.upper()} ({num_bytes:,} bytes) ---") + + for name, tok in tokenizers.items(): + result = benchmark_encode(name, tok["encode"], text, text_type) + results.append(result) + print( + f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s " + f"{result.tokens_per_second / 1e3:8.2f} Ktok/s " + f"{result.latency_ms:8.3f} ms" + ) + + # Batch encoding benchmarks + print("\n" + "=" * 70) + print("BATCH ENCODING BENCHMARKS (100 texts)") + print("=" * 70) + + for text_type in ["medium", "long"]: + texts = [SAMPLE_TEXTS[text_type]] * 100 + total_bytes = sum(len(t.encode("utf-8")) for t in texts) + print(f"\n--- {text_type.upper()} x100 ({total_bytes:,} bytes total) ---") + + for name, tok in tokenizers.items(): + result = benchmark_batch_encode( + name, tok["encode_batch"], texts, f"{text_type}_batch" + ) + results.append(result) + print( + f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s " + f"{result.tokens_per_second / 1e3:8.2f} Ktok/s " + f"{result.latency_ms:8.3f} ms" + ) + + # Single text decoding benchmarks + print("\n" + "=" * 70) + print("SINGLE TEXT DECODING BENCHMARKS") + print("=" * 70) + + # Use the first available tokenizer to generate tokens for decoding + reference_tokenizer = next(iter(tokenizers.values())) + for text_type in text_types: + text = SAMPLE_TEXTS[text_type] + tokens = reference_tokenizer["encode"](text) + num_bytes = len(text.encode("utf-8")) + print(f"\n--- {text_type.upper()} ({len(tokens):,} tokens) ---") + + for name, tok in tokenizers.items(): + # Re-encode with this tokenizer to get correct tokens + tokens = tok["encode"](text) + result = benchmark_decode(name, tok["decode"], tokens, text_type, num_bytes) + results.append(result) + print( + f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s " + f"{result.tokens_per_second / 1e3:8.2f} Ktok/s " + f"{result.latency_ms:8.3f} ms" + ) + + # Batch decoding benchmarks + print("\n" + "=" * 70) + print("BATCH DECODING BENCHMARKS (100 texts)") + print("=" * 70) + + for text_type in ["medium", "long"]: + texts = [SAMPLE_TEXTS[text_type]] * 100 + print(f"\n--- {text_type.upper()} x100 ---") + + for name, tok in tokenizers.items(): + token_lists = tok["encode_batch"](texts) + result = benchmark_batch_decode( + name, tok["decode_batch"], token_lists, f"{text_type}_batch" + ) + results.append(result) + print( + f"{name:15} 
{result.bytes_per_second / 1e6:8.2f} MB/s " + f"{result.tokens_per_second / 1e3:8.2f} Ktok/s " + f"{result.latency_ms:8.3f} ms" + ) + + return results + + +def generate_chart( + results: list[BenchmarkResult], + tokenizers: dict, + benchmark_type: str, + title: str, + output_path: str, +): + """Generate a bar chart for a specific benchmark type.""" + try: + import matplotlib.pyplot as plt + import numpy as np + except ImportError: + print("matplotlib/numpy not available - run: pip install matplotlib numpy") + return + + # Filter results for this benchmark type + filtered = [r for r in results if r.benchmark_type == benchmark_type] + if not filtered: + return + + names = list(tokenizers.keys()) + text_types = list(dict.fromkeys(r.text_type for r in filtered)) + + fig, ax = plt.subplots(figsize=(12, 6)) + + x = np.arange(len(text_types)) + width = 0.8 / len(names) + + for i, name in enumerate(names): + throughputs = [] + for text_type in text_types: + for r in filtered: + if r.name == name and r.text_type == text_type: + throughputs.append(r.bytes_per_second / 1e6) + break + else: + throughputs.append(0) + + ax.bar( + x + i * width - width * len(names) / 2 + width / 2, + throughputs, + width, + label=name, + color=tokenizers[name]["color"], + ) + + ax.set_xlabel("Text Type", fontsize=12) + ax.set_ylabel("Throughput (MB/s)", fontsize=12) + ax.set_title(title, fontsize=14, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels([t.replace("_batch", " (batch)").capitalize() for t in text_types]) + ax.legend() + ax.grid(axis="y", alpha=0.3) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Chart saved to: {output_path}") + plt.close() + + +def save_results_json(results: list[BenchmarkResult], output_path: str): + """Save benchmark results as JSON.""" + data = [ + { + "name": r.name, + "text_type": r.text_type, + "benchmark_type": r.benchmark_type, + "bytes_per_second": r.bytes_per_second, + "tokens_per_second": r.tokens_per_second, + "num_tokens": r.num_tokens, + "num_bytes": r.num_bytes, + "latency_ms": r.latency_ms, + } + for r in results + ] + with open(output_path, "w") as f: + json.dump(data, f, indent=2) + print(f"Results saved to: {output_path}") + + +def main(): + print("=" * 70) + print("LLAMA 3 TOKENIZER BENCHMARK COMPARISON") + print("splintr vs tiktoken vs HuggingFace") + print("=" * 70) + + # Create output directory + output_dir = Path(__file__).parent.parent / "results" / "llama3" + output_dir.mkdir(parents=True, exist_ok=True) + + # Load tokenizers + tokenizers = load_tokenizers() + + if len(tokenizers) < 2: + print("\nWarning: Less than 2 tokenizers available for comparison") + print("Install missing packages:") + print(" pip install tiktoken transformers matplotlib numpy") + if "splintr" not in tokenizers: + print(" maturin develop --release # for splintr") + + if not tokenizers: + print("No tokenizers available!") + return + + # Run benchmarks + results = run_benchmarks(tokenizers) + + # Generate outputs + print("\n" + "=" * 70) + print("GENERATING OUTPUTS") + print("=" * 70) + + generate_chart( + results, + tokenizers, + "single_encode", + "Llama 3 Single Text Encoding Throughput", + str(output_dir / "single_encode.png"), + ) + generate_chart( + results, + tokenizers, + "batch_encode", + "Llama 3 Batch Encoding Throughput (100 texts)", + str(output_dir / "batch_encode.png"), + ) + generate_chart( + results, + tokenizers, + "single_decode", + "Llama 3 Single Text Decoding Throughput", + str(output_dir / "single_decode.png"), + ) + 
generate_chart( + results, + tokenizers, + "batch_decode", + "Llama 3 Batch Decoding Throughput (100 texts)", + str(output_dir / "batch_decode.png"), + ) + save_results_json(results, str(output_dir / "results.json")) + + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/vocabs/benchmark_o200k.py b/benchmarks/vocabs/benchmark_o200k.py new file mode 100644 index 0000000..8901e0e --- /dev/null +++ b/benchmarks/vocabs/benchmark_o200k.py @@ -0,0 +1,523 @@ +#!/usr/bin/env python3 +""" +Benchmark comparison for o200k_base tokenizers: splintr vs tiktoken + +o200k_base is used by GPT-4o. + +Usage: + pip install tiktoken matplotlib numpy + python benchmarks/vocabs/benchmark_o200k.py +""" + +import gc +import json +import statistics +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Callable + +# Sample texts for benchmarking +SAMPLE_TEXTS = { + "short": "Hello, world! This is a test.", + "medium": """The quick brown fox jumps over the lazy dog. + Machine learning models require tokenization to process text efficiently. + Tokenizers convert text into numerical representations that models can understand.""" + * 10, + "long": """Artificial intelligence and machine learning have revolutionized + the way we process and understand natural language. Large language models (LLMs) + like GPT-4, Claude, and others rely heavily on efficient tokenization to handle + vast amounts of text data. The performance of tokenizers directly impacts the + overall throughput of these systems, making optimization crucial for production + deployments. BPE (Byte Pair Encoding) has become the de facto standard for + modern tokenizers due to its balance of vocabulary efficiency and handling of + out-of-vocabulary words.""" + * 50, + "code": """ +def fibonacci(n: int) -> int: + \"\"\"Calculate the nth Fibonacci number.\"\"\" + if n <= 1: + return n + return fibonacci(n - 1) + fibonacci(n - 2) + +class TokenizerBenchmark: + def __init__(self, name: str): + self.name = name + self.results = [] + + def run(self, text: str, iterations: int = 100): + for _ in range(iterations): + tokens = self.encode(text) + self.results.append(len(tokens)) +""" + * 20, + "multilingual": """ + English: The quick brown fox jumps over the lazy dog. + 中文: 快速的棕色狐狸跳过懒狗。 + 日本語: 素早い茶色の狐が怠惰な犬を飛び越える。 + 한국어: 빠른 갈색 여우가 게으른 개를 뛰어넘습니다. + العربية: الثعلب البني السريع يقفز فوق الكلب الكسول. + Русский: Быстрая коричневая лиса прыгает через ленивую собаку. 
+ """ + * 20, +} + + +@dataclass +class BenchmarkResult: + name: str + text_type: str + benchmark_type: str + bytes_per_second: float + tokens_per_second: float + num_tokens: int + num_bytes: int + latency_ms: float + + +def benchmark_encode( + name: str, + encode_fn: Callable[[str], list], + text: str, + text_type: str, + warmup: int = 3, + iterations: int = 10, +) -> BenchmarkResult: + """Benchmark single text encoding.""" + num_bytes = len(text.encode("utf-8")) + + for _ in range(warmup): + encode_fn(text) + + gc.collect() + + times = [] + num_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + tokens = encode_fn(text) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + num_tokens = len(tokens) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = num_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="single_encode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=num_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_batch_encode( + name: str, + encode_batch_fn: Callable[[list[str]], list], + texts: list[str], + text_type: str, + warmup: int = 2, + iterations: int = 5, +) -> BenchmarkResult: + """Benchmark batch encoding.""" + num_bytes = sum(len(t.encode("utf-8")) for t in texts) + + for _ in range(warmup): + encode_batch_fn(texts) + + gc.collect() + + times = [] + total_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + results = encode_batch_fn(texts) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + total_tokens = sum(len(r) for r in results) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = total_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="batch_encode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=total_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_decode( + name: str, + decode_fn: Callable[[list[int]], str], + tokens: list[int], + text_type: str, + original_bytes: int, + warmup: int = 3, + iterations: int = 10, +) -> BenchmarkResult: + """Benchmark single text decoding.""" + num_tokens = len(tokens) + + for _ in range(warmup): + decode_fn(tokens) + + gc.collect() + + times = [] + num_bytes = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + text = decode_fn(tokens) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + num_bytes = len(text.encode("utf-8")) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = num_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + benchmark_type="single_decode", + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=num_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_batch_decode( + name: str, + decode_batch_fn: Callable[[list[list[int]]], list[str]], + token_lists: list[list[int]], + text_type: str, + warmup: int = 2, + iterations: int = 5, +) -> BenchmarkResult: + """Benchmark batch decoding.""" + total_tokens = sum(len(t) for t in token_lists) + + for _ in range(warmup): + decode_batch_fn(token_lists) + + gc.collect() + + times = [] + total_bytes = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + results = 
decode_batch_fn(token_lists)
+        end = time.perf_counter_ns()
+        times.append((end - start) / 1e9)
+        total_bytes = sum(len(r.encode("utf-8")) for r in results)
+
+    avg_time = statistics.mean(times)
+    bytes_per_second = total_bytes / avg_time
+    tokens_per_second = total_tokens / avg_time
+
+    return BenchmarkResult(
+        name=name,
+        text_type=text_type,
+        benchmark_type="batch_decode",
+        bytes_per_second=bytes_per_second,
+        tokens_per_second=tokens_per_second,
+        num_tokens=total_tokens,
+        num_bytes=total_bytes,
+        latency_ms=avg_time * 1000,
+    )
+
+
+def load_tokenizers() -> dict:
+    """Load all available o200k_base tokenizers."""
+    tokenizers = {}
+
+    # splintr
+    try:
+        import splintr
+
+        enc = splintr.Tokenizer.from_pretrained("o200k_base")
+
+        tokenizers["splintr"] = {
+            "encode": enc.encode,
+            "encode_batch": enc.encode_batch,
+            "decode": enc.decode,
+            "decode_batch": enc.decode_batch,
+            "color": "#2ecc71",  # Green
+        }
+        print("Loaded: splintr (o200k_base)")
+    except ImportError:
+        print("splintr not available - run: maturin develop --release")
+
+    # tiktoken
+    try:
+        import tiktoken
+
+        tik_enc = tiktoken.get_encoding("o200k_base")
+
+        def tik_encode_batch(texts):
+            return tik_enc.encode_ordinary_batch(texts)
+
+        tokenizers["tiktoken"] = {
+            "encode": tik_enc.encode,
+            "encode_batch": tik_encode_batch,
+            "decode": tik_enc.decode,
+            "decode_batch": tik_enc.decode_batch,
+            "color": "#3498db",  # Blue
+        }
+        print("Loaded: tiktoken (o200k_base)")
+    except ImportError:
+        print("tiktoken not available - run: pip install tiktoken")
+
+    return tokenizers
+
+
+def run_benchmarks(tokenizers: dict, text_types: list[str] | None = None) -> list[BenchmarkResult]:
+    """Run all benchmarks."""
+    if text_types is None:
+        text_types = list(SAMPLE_TEXTS.keys())
+
+    results = []
+
+    # Single text encoding benchmarks
+    print("\n" + "=" * 70)
+    print("SINGLE TEXT ENCODING BENCHMARKS")
+    print("=" * 70)
+
+    for text_type in text_types:
+        text = SAMPLE_TEXTS[text_type]
+        num_bytes = len(text.encode("utf-8"))
+        print(f"\n--- {text_type.upper()} ({num_bytes:,} bytes) ---")
+
+        for name, tok in tokenizers.items():
+            result = benchmark_encode(name, tok["encode"], text, text_type)
+            results.append(result)
+            print(
+                f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s "
+                f"{result.tokens_per_second / 1e3:8.2f} Ktok/s "
+                f"{result.latency_ms:8.3f} ms"
+            )
+
+    # Batch encoding benchmarks
+    print("\n" + "=" * 70)
+    print("BATCH ENCODING BENCHMARKS (100 texts)")
+    print("=" * 70)
+
+    for text_type in ["medium", "long"]:
+        texts = [SAMPLE_TEXTS[text_type]] * 100
+        total_bytes = sum(len(t.encode("utf-8")) for t in texts)
+        print(f"\n--- {text_type.upper()} x100 ({total_bytes:,} bytes total) ---")
+
+        for name, tok in tokenizers.items():
+            result = benchmark_batch_encode(
+                name, tok["encode_batch"], texts, f"{text_type}_batch"
+            )
+            results.append(result)
+            print(
+                f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s "
+                f"{result.tokens_per_second / 1e3:8.2f} Ktok/s "
+                f"{result.latency_ms:8.3f} ms"
+            )
+
+    # Single text decoding benchmarks
+    print("\n" + "=" * 70)
+    print("SINGLE TEXT DECODING BENCHMARKS")
+    print("=" * 70)
+
+    for text_type in text_types:
+        text = SAMPLE_TEXTS[text_type]
+        num_bytes = len(text.encode("utf-8"))
+
+        for i, (name, tok) in enumerate(tokenizers.items()):
+            tokens = tok["encode"](text)
+            if i == 0:
+                print(f"\n--- {text_type.upper()} ({len(tokens):,} tokens) ---")
+            result = benchmark_decode(name, tok["decode"], tokens, text_type, num_bytes)
+            results.append(result)
+            
print( + f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s " + f"{result.tokens_per_second / 1e3:8.2f} Ktok/s " + f"{result.latency_ms:8.3f} ms" + ) + + # Batch decoding benchmarks + print("\n" + "=" * 70) + print("BATCH DECODING BENCHMARKS (100 texts)") + print("=" * 70) + + for text_type in ["medium", "long"]: + texts = [SAMPLE_TEXTS[text_type]] * 100 + print(f"\n--- {text_type.upper()} x100 ---") + + for name, tok in tokenizers.items(): + token_lists = tok["encode_batch"](texts) + result = benchmark_batch_decode( + name, tok["decode_batch"], token_lists, f"{text_type}_batch" + ) + results.append(result) + print( + f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s " + f"{result.tokens_per_second / 1e3:8.2f} Ktok/s " + f"{result.latency_ms:8.3f} ms" + ) + + return results + + +def generate_chart( + results: list[BenchmarkResult], + tokenizers: dict, + benchmark_type: str, + title: str, + output_path: str, +): + """Generate a bar chart for a specific benchmark type.""" + try: + import matplotlib.pyplot as plt + import numpy as np + except ImportError: + print("matplotlib/numpy not available - run: pip install matplotlib numpy") + return + + filtered = [r for r in results if r.benchmark_type == benchmark_type] + if not filtered: + return + + names = list(tokenizers.keys()) + text_types = list(dict.fromkeys(r.text_type for r in filtered)) + + fig, ax = plt.subplots(figsize=(12, 6)) + + x = np.arange(len(text_types)) + width = 0.8 / len(names) + + for i, name in enumerate(names): + throughputs = [] + for text_type in text_types: + for r in filtered: + if r.name == name and r.text_type == text_type: + throughputs.append(r.bytes_per_second / 1e6) + break + else: + throughputs.append(0) + + ax.bar( + x + i * width - width * len(names) / 2 + width / 2, + throughputs, + width, + label=name, + color=tokenizers[name]["color"], + ) + + ax.set_xlabel("Text Type", fontsize=12) + ax.set_ylabel("Throughput (MB/s)", fontsize=12) + ax.set_title(title, fontsize=14, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels([t.replace("_batch", " (batch)").capitalize() for t in text_types]) + ax.legend() + ax.grid(axis="y", alpha=0.3) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Chart saved to: {output_path}") + plt.close() + + +def save_results_json(results: list[BenchmarkResult], output_path: str): + """Save benchmark results as JSON.""" + data = [ + { + "name": r.name, + "text_type": r.text_type, + "benchmark_type": r.benchmark_type, + "bytes_per_second": r.bytes_per_second, + "tokens_per_second": r.tokens_per_second, + "num_tokens": r.num_tokens, + "num_bytes": r.num_bytes, + "latency_ms": r.latency_ms, + } + for r in results + ] + with open(output_path, "w") as f: + json.dump(data, f, indent=2) + print(f"Results saved to: {output_path}") + + +def main(): + print("=" * 70) + print("O200K_BASE TOKENIZER BENCHMARK COMPARISON") + print("splintr vs tiktoken (GPT-4o)") + print("=" * 70) + + # Create output directory + output_dir = Path(__file__).parent.parent / "results" / "o200k" + output_dir.mkdir(parents=True, exist_ok=True) + + # Load tokenizers + tokenizers = load_tokenizers() + + if len(tokenizers) < 2: + print("\nWarning: Less than 2 tokenizers available for comparison") + print("Install missing packages:") + print(" pip install tiktoken matplotlib numpy") + if "splintr" not in tokenizers: + print(" maturin develop --release # for splintr") + + if not tokenizers: + print("No tokenizers available!") + return + + # Run benchmarks + results = 
run_benchmarks(tokenizers)
+
+    # Generate outputs
+    print("\n" + "=" * 70)
+    print("GENERATING OUTPUTS")
+    print("=" * 70)
+
+    generate_chart(
+        results,
+        tokenizers,
+        "single_encode",
+        "o200k_base Single Text Encoding Throughput",
+        str(output_dir / "single_encode.png"),
+    )
+    generate_chart(
+        results,
+        tokenizers,
+        "batch_encode",
+        "o200k_base Batch Encoding Throughput (100 texts)",
+        str(output_dir / "batch_encode.png"),
+    )
+    generate_chart(
+        results,
+        tokenizers,
+        "single_decode",
+        "o200k_base Single Text Decoding Throughput",
+        str(output_dir / "single_decode.png"),
+    )
+    generate_chart(
+        results,
+        tokenizers,
+        "batch_decode",
+        "o200k_base Batch Decoding Throughput (100 texts)",
+        str(output_dir / "batch_decode.png"),
+    )
+    save_results_json(results, str(output_dir / "results.json"))
+
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()

From f18227ab18c16231a9300f02fcb6bc4b0382008c Mon Sep 17 00:00:00 2001
From: Farhan Syah
Date: Wed, 26 Nov 2025 21:13:27 +0800
Subject: [PATCH 4/4] chore: update Python package __version__ to 0.5.0

Keep python/splintr/__init__.py in sync with the 0.5.0 version set in
.version, Cargo.toml, and pyproject.toml.

---
 python/splintr/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/splintr/__init__.py b/python/splintr/__init__.py
index 652e163..e7488c1 100644
--- a/python/splintr/__init__.py
+++ b/python/splintr/__init__.py
@@ -92,4 +92,4 @@
     "O200K_AGENT_TOKENS",
     "LLAMA3_AGENT_TOKENS",
 ]
-__version__ = "0.4.0"
+__version__ = "0.5.0"
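
Usage sketch (not part of the patches above): a minimal round trip through
the new batch decode bindings from PATCH 2/4. It assumes the splintr package
built from this series; the vocabulary name and sample strings are
illustrative. Tokenizer.from_pretrained, encode_batch, decode_batch, and
decode_batch_lossy are the APIs introduced or used in the patches above.

    import splintr

    tok = splintr.Tokenizer.from_pretrained("cl100k_base")

    # Encode a batch of texts, then decode all of them in parallel
    # (the Rust side uses Rayon across token lists).
    token_lists = tok.encode_batch(["Hello, world!", "Batch decoding test."])

    # decode_batch raises ValueError if any token list yields invalid UTF-8.
    texts = tok.decode_batch(token_lists)
    assert texts == ["Hello, world!", "Batch decoding test."]

    # decode_batch_lossy substitutes replacement characters instead of raising.
    texts_lossy = tok.decode_batch_lossy(token_lists)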