diff --git a/.version b/.version index 0ea3a94..0d91a54 100644 --- a/.version +++ b/.version @@ -1 +1 @@ -0.2.0 +0.3.0 diff --git a/Cargo.toml b/Cargo.toml index 851ad9a..66bd894 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "splintr" -version = "0.2.0" +version = "0.3.0" edition = "2021" description = "Fast Rust BPE tokenizer with Python bindings" license = "MIT" diff --git a/README.md b/README.md index d963504..6c56362 100644 --- a/README.md +++ b/README.md @@ -104,9 +104,10 @@ tokenizer = Tokenizer( **Encoding:** -- `encode(text: str) -> list[int]`: Encode text to token IDs, treating special tokens as regular text +- `encode(text: str) -> list[int]`: Encode text to token IDs (sequential, optimal for most use cases) - `encode_with_special(text: str) -> list[int]`: Encode text, recognizing special tokens in the input -- `encode_batch(texts: list[str]) -> list[list[int]]`: Encode multiple texts in parallel +- `encode_batch(texts: list[str]) -> list[list[int]]`: Encode multiple texts in parallel (uses Rayon) +- `encode_rayon(text: str) -> list[int]`: Encode using Rayon parallelization (only beneficial for texts >1MB) **Decoding:** @@ -159,7 +160,22 @@ BPE tokens don't always align with UTF-8 character boundaries. For example, a mu ### Rust API -The Rust API provides similar functionality with strongly-typed interfaces. See the [API documentation](https://docs.rs/splintr) for detailed information. +The Rust API provides similar functionality with strongly-typed interfaces: + +**Encoding:** + +- `encode(&self, text: &str) -> Vec`: Sequential encoding (optimal for texts <1MB) +- `encode_with_special(&self, text: &str) -> Vec`: Encode with special token recognition +- `encode_batch(&self, texts: &[String]) -> Vec>`: Parallel encoding across texts +- `encode_rayon(&self, text: &str) -> Vec`: Parallel encoding within text (for texts >1MB) + +**Decoding:** + +- `decode(&self, tokens: &[u32]) -> Result`: Decode to UTF-8 string +- `decode_bytes(&self, tokens: &[u32]) -> Vec`: Decode to raw bytes +- `decode_lossy(&self, tokens: &[u32]) -> String`: Decode with replacement for invalid UTF-8 + +See the [API documentation](https://docs.rs/splintr) for detailed information. ## Streaming Decoder @@ -200,36 +216,51 @@ This approach ensures that: ## Performance -Benchmarks performed on Linux (6.16.8-arch3-1) with 24 CPU cores, comparing splintr to tiktoken (the reference Python implementation). +Benchmarks performed on Linux (6.16.8-arch3-1) with 24 CPU cores, comparing splintr against tiktoken (reference Python implementation), Hugging Face tokenizers, and TokenDagger. ### Single Text Encoding -Performance on various text types: +Splintr achieves **3-4x faster** single-text encoding compared to tiktoken across various text sizes: + +![Single Text Encoding Comparison](images/benchmark_single.png) + +**Latency by text type:** -| Content Type | Size | splintr (ms) | tiktoken (ms) | Speedup | -| ---------------- | ------------- | ------------ | ------------- | -------- | -| Long English | 450,000 chars | 7.94 | 19.91 | **2.5x** | -| Python Code | 59,200 chars | 1.67 | 5.90 | **3.5x** | -| JSON | 29,000 chars | 1.20 | 2.76 | **2.3x** | -| Numbers | 55,000 chars | 2.27 | 6.09 | **2.7x** | -| Whitespace-heavy | 50,000 chars | 1.36 | 4.91 | **3.6x** | -| Chinese | 11,500 chars | 1.09 | 1.45 | **1.3x** | +![Latency Comparison](images/benchmark_single_latency.png) + +Splintr consistently maintains lower latency across different content types (Python code, JSON, English prose, Chinese text), making it ideal for interactive applications and real-time processing. ### Batch Encoding -Batch operations show significant speedup through parallelism: +For batch operations, splintr achieves **10-12x speedup** over tiktoken by parallelizing across texts: + +![Batch Encoding Throughput](images/benchmark_batch.png) -| Configuration | splintr parallel (ms) | tiktoken (ms) | Speedup vs tiktoken | -| ------------------ | --------------------- | ------------- | ------------------- | -| 10 × 1,000 chars | 0.25 | 0.48 | **1.9x** | -| 100 × 1,000 chars | 1.11 | 4.66 | **4.2x** | -| 1,000 × 100 chars | 1.42 | 6.95 | **4.9x** | -| 100 × 10,000 chars | 8.24 | 45.72 | **5.5x** | +| Configuration | Splintr | Tiktoken | Speedup | +| ---------------- | ------------ | ---------- | -------- | +| 1,000 × 100 chars | 111 MB/s | 9 MB/s | **12.3x** | +| 100 × 1,000 chars | 89 MB/s | 8 MB/s | **11.1x** | +| 10 × 10,000 chars | 72 MB/s | 7 MB/s | **10.3x** | -**Parallel speedup within splintr:** +![Batch Speedup vs Tiktoken](images/benchmark_batch_speedup.png) -- 100 × 1,000 chars: 8.6x faster (parallel vs sequential) -- 1,000 × 100 chars: 16.8x faster (parallel vs sequential) +The batch encoding speedup scales effectively across different batch configurations, with higher speedups on larger batches where parallelization overhead is amortized. + +### Design Decision: Sequential by Default + +Splintr uses **sequential encoding for single texts** and **parallel encoding across batches**. This design choice is based on empirical benchmarking: + +![Sequential vs Rayon Internal Parallelization](images/benchmark_splintr.png) + +**Key findings:** + +- **Sequential is faster** for texts up to ~1MB (typical LLM use case) +- Rayon's parallelization overhead only pays off at ~1MB+ text sizes +- Most real-world inputs (prompts, documents, code) are well under 1MB +- `encode()` uses sequential processing for optimal single-text performance +- `encode_batch()` parallelizes across multiple texts for maximum throughput + +This architecture ensures splintr is optimized for the most common tokenization patterns in LLM applications while still providing excellent batch performance for data processing pipelines. ### Running Benchmarks @@ -261,13 +292,15 @@ The benchmark suite tests: You can customize the benchmark by modifying `benchmark.py` or adding your own test data in the `data/` directory. -## Supported Models +## Supported Vocabularies -| Model | Use Case | Vocabulary Size | Special Tokens | Import Constant | +| Vocabulary | Used By | Vocabulary Size | Special Tokens | Import Constant | | ------------- | -------------------- | --------------- | -------------- | --------------------- | | `cl100k_base` | GPT-4, GPT-3.5-turbo | ~100,000 | 5 + 54 agent | `CL100K_BASE_PATTERN` | | `o200k_base` | GPT-4o | ~200,000 | 2 + 54 agent | `O200K_BASE_PATTERN` | +More vocabularies will be added in future releases. + **OpenAI standard tokens:** - **cl100k_base**: `<|endoftext|>`, `<|fim_prefix|>`, `<|fim_middle|>`, `<|fim_suffix|>`, `<|endofprompt|>` @@ -356,6 +389,19 @@ The pre-commit hook automatically runs formatting, clippy, and tests before each This project is licensed under the MIT License - see the LICENSE file for details. +## Citation + +If you use Splintr in your research, please cite: + +```bibtex +@software{splintr, + author = {Farhan Syah}, + title = {Splintr: High-Performance BPE Tokenizer}, + year = {2025}, + url = {https://github.com/farhan-syah/splintr} +} +``` + ## Acknowledgments Splintr builds upon concepts from: diff --git a/benchmarks/benchmark_batch.py b/benchmarks/benchmark_batch.py new file mode 100644 index 0000000..3d7ebaa --- /dev/null +++ b/benchmarks/benchmark_batch.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +""" +Benchmark: Batch Encoding Comparison +Compares tokenizer throughput for batch encoding across different batch sizes. + +Usage: + python benchmarks/benchmark_batch.py +""" + +import gc +import statistics +import time +from dataclasses import dataclass +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +# Sample text for batch benchmarks +SAMPLE_TEXT = """The quick brown fox jumps over the lazy dog. +Machine learning models require tokenization to process text efficiently. +Tokenizers convert text into numerical representations that models can understand.""" + +TOKENIZER_COLORS = { + "splintr": "#2ecc71", # Green + "tiktoken": "#3498db", # Blue + "huggingface": "#e74c3c", # Red + "tokendagger": "#9b59b6", # Purple +} + + +@dataclass +class BenchmarkResult: + name: str + batch_size: int + bytes_per_second: float + tokens_per_second: float + total_tokens: int + total_bytes: int + latency_ms: float + + +def benchmark_batch( + name: str, + encode_batch_fn, + texts: list[str], + batch_size: int, + warmup: int = 3, + iterations: int = 10, +) -> BenchmarkResult: + """Benchmark batch encoding.""" + total_bytes = sum(len(t.encode("utf-8")) for t in texts) + + # Warmup + for _ in range(warmup): + encode_batch_fn(texts) + + gc.collect() + + times = [] + total_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + results = encode_batch_fn(texts) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + total_tokens = sum(len(r) for r in results) + + avg_time = statistics.mean(times) + bytes_per_second = total_bytes / avg_time + tokens_per_second = total_tokens / avg_time + + return BenchmarkResult( + name=name, + batch_size=batch_size, + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + total_tokens=total_tokens, + total_bytes=total_bytes, + latency_ms=avg_time * 1000, + ) + + +def load_tokenizers(): + """Load all available tokenizers with batch functions. + + All tokenizers use their native batch encoding methods: + - splintr: encode_batch (Rayon parallel) + - tiktoken: encode_ordinary_batch (native batch) + - huggingface: encode_batch (native batch) + - tokendagger: encode_batch (native batch) + """ + tokenizers = {} + + # splintr - native batch via Rayon + try: + import splintr + enc = splintr.Tokenizer.from_pretrained("cl100k_base") + tokenizers["splintr"] = enc.encode_batch + print("Loaded: splintr (native encode_batch)") + except ImportError: + print("splintr not available") + + # tiktoken - native batch + try: + import tiktoken + enc = tiktoken.get_encoding("cl100k_base") + tokenizers["tiktoken"] = enc.encode_ordinary_batch + print("Loaded: tiktoken (native encode_ordinary_batch)") + except ImportError: + print("tiktoken not available") + + # HuggingFace tokenizers - native batch + try: + from tokenizers import Tokenizer as HFTokenizer + hf_enc = HFTokenizer.from_pretrained("gpt2") + + def hf_encode_batch(texts): + return [e.ids for e in hf_enc.encode_batch(texts)] + + tokenizers["huggingface"] = hf_encode_batch + print("Loaded: huggingface (native encode_batch)") + except ImportError: + print("huggingface not available") + + # TokenDagger - native batch + try: + import tokendagger + import tiktoken + tik_enc = tiktoken.get_encoding("cl100k_base") + enc = tokendagger.Tokenizer( + name="cl100k_base", + pat_str=tik_enc._pat_str, + mergeable_ranks=tik_enc._mergeable_ranks, + special_tokens=tik_enc._special_tokens, + ) + tokenizers["tokendagger"] = enc.encode_batch + print("Loaded: tokendagger (native encode_batch)") + except (ImportError, Exception) as e: + print(f"tokendagger not available: {e}") + + return tokenizers + + +def run_benchmarks(tokenizers: dict) -> list[BenchmarkResult]: + """Run batch benchmarks with various batch sizes.""" + results = [] + + # Warmup all tokenizers + print("\nWarming up all tokenizers...") + warmup_texts = [SAMPLE_TEXT] * 100 + for name, encode_batch_fn in tokenizers.items(): + for _ in range(10): + encode_batch_fn(warmup_texts) + print("Warmup complete.") + + batch_sizes = [1, 10, 50, 100, 500, 1000] + + print("\n" + "=" * 70) + print("BATCH ENCODING BENCHMARKS") + print("=" * 70) + + for batch_size in batch_sizes: + texts = [SAMPLE_TEXT] * batch_size + total_bytes = sum(len(t.encode("utf-8")) for t in texts) + + print(f"\n--- Batch Size: {batch_size} ({total_bytes:,} bytes total) ---") + print(f"{'Tokenizer':<15} {'MB/s':>10} {'Ktok/s':>10} {'Latency':>12}") + print("-" * 50) + + for name, encode_batch_fn in tokenizers.items(): + result = benchmark_batch(name, encode_batch_fn, texts, batch_size) + results.append(result) + print( + f"{name:<15} {result.bytes_per_second / 1e6:>10.2f} " + f"{result.tokens_per_second / 1e3:>10.2f} " + f"{result.latency_ms:>10.2f} ms" + ) + + return results + + +def generate_chart(results: list[BenchmarkResult], output_path: str): + """Generate batch encoding comparison chart.""" + + names = list(dict.fromkeys(r.name for r in results)) + batch_sizes = list(dict.fromkeys(r.batch_size for r in results)) + + fig, ax = plt.subplots(figsize=(12, 7)) + + x = np.arange(len(batch_sizes)) + width = 0.8 / len(names) + + for i, name in enumerate(names): + throughputs = [] + for batch_size in batch_sizes: + for r in results: + if r.name == name and r.batch_size == batch_size: + throughputs.append(r.bytes_per_second / 1e6) + break + + offset = i * width - width * len(names) / 2 + width / 2 + bars = ax.bar( + x + offset, + throughputs, + width, + label=name, + color=TOKENIZER_COLORS.get(name, "#95a5a6"), + ) + + # Add value labels + for bar, val in zip(bars, throughputs): + height = bar.get_height() + ax.annotate( + f'{val:.0f}', + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha='center', + va='bottom', + fontsize=8, + ) + + ax.set_xlabel("Batch Size (number of texts)", fontsize=12) + ax.set_ylabel("Throughput (MB/s)", fontsize=12) + ax.set_title("Batch Encoding Throughput Comparison", fontsize=14, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels([str(bs) for bs in batch_sizes]) + ax.legend(loc="upper left") + ax.grid(axis="y", alpha=0.3) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"\nChart saved to: {output_path}") + plt.close() + + +def generate_speedup_chart(results: list[BenchmarkResult], output_path: str): + """Generate speedup vs tiktoken chart.""" + + if not any(r.name == "tiktoken" for r in results): + print("tiktoken not available for speedup chart") + return + + names = [n for n in dict.fromkeys(r.name for r in results) if n != "tiktoken"] + batch_sizes = list(dict.fromkeys(r.batch_size for r in results)) + + # Get tiktoken baseline + tiktoken_throughput = {} + for r in results: + if r.name == "tiktoken": + tiktoken_throughput[r.batch_size] = r.bytes_per_second + + fig, ax = plt.subplots(figsize=(10, 6)) + + x = np.arange(len(batch_sizes)) + width = 0.8 / len(names) + + for i, name in enumerate(names): + speedups = [] + for batch_size in batch_sizes: + for r in results: + if r.name == name and r.batch_size == batch_size: + speedup = r.bytes_per_second / tiktoken_throughput[batch_size] + speedups.append(speedup) + break + + offset = i * width - width * len(names) / 2 + width / 2 + ax.bar( + x + offset, + speedups, + width, + label=name, + color=TOKENIZER_COLORS.get(name, "#95a5a6"), + ) + + # Baseline line + ax.axhline(y=1.0, color="gray", linestyle="--", linewidth=1, label="tiktoken (baseline)") + + ax.set_xlabel("Batch Size", fontsize=12) + ax.set_ylabel("Speedup vs tiktoken", fontsize=12) + ax.set_title("Batch Encoding Speedup Relative to tiktoken", fontsize=14, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels([str(bs) for bs in batch_sizes]) + ax.legend() + ax.grid(axis="y", alpha=0.3) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Speedup chart saved to: {output_path}") + plt.close() + + +def main(): + print("=" * 70) + print("TOKENIZER BENCHMARK: BATCH ENCODING") + print("=" * 70) + + output_dir = Path(__file__).parent / "results" + output_dir.mkdir(exist_ok=True) + + tokenizers = load_tokenizers() + + if len(tokenizers) < 2: + print("\nNeed at least 2 tokenizers for comparison") + return + + results = run_benchmarks(tokenizers) + + generate_chart(results, str(output_dir / "benchmark_batch.png")) + generate_speedup_chart(results, str(output_dir / "benchmark_batch_speedup.png")) + + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/benchmark_single.py b/benchmarks/benchmark_single.py new file mode 100644 index 0000000..bb402bd --- /dev/null +++ b/benchmarks/benchmark_single.py @@ -0,0 +1,451 @@ +#!/usr/bin/env python3 +""" +Benchmark: Single Text Encoding Comparison +Compares tokenizer throughput across different text types and sizes. + +Usage: + python benchmarks/benchmark_single.py +""" + +import gc +import statistics +import time +from dataclasses import dataclass +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +# Sample texts for benchmarking +SAMPLE_TEXTS = { + "short": "Hello, world! This is a test.", + "medium": """The quick brown fox jumps over the lazy dog. + Machine learning models require tokenization to process text efficiently. + Tokenizers convert text into numerical representations that models can understand.""" * 10, + "long": """Artificial intelligence and machine learning have revolutionized + the way we process and understand natural language. Large language models (LLMs) + like GPT-4, Claude, and others rely heavily on efficient tokenization to handle + vast amounts of text data. The performance of tokenizers directly impacts the + overall throughput of these systems, making optimization crucial for production + deployments. BPE (Byte Pair Encoding) has become the de facto standard for + modern tokenizers due to its balance of vocabulary efficiency and handling of + out-of-vocabulary words.""" * 50, + "code": ''' +def fibonacci(n: int) -> int: + """Calculate the nth Fibonacci number.""" + if n <= 1: + return n + return fibonacci(n - 1) + fibonacci(n - 2) + +class TokenizerBenchmark: + def __init__(self, name: str): + self.name = name + self.results = [] + + def run(self, text: str, iterations: int = 100): + for _ in range(iterations): + tokens = self.encode(text) + self.results.append(len(tokens)) +''' * 20, + "multilingual": """ + English: The quick brown fox jumps over the lazy dog. + 中文: 快速的棕色狐狸跳过懒狗。 + 日本語: 素早い茶色の狐が怠惰な犬を飛び越える。 + 한국어: 빠른 갈색 여우가 게으른 개를 뛰어넘습니다. + العربية: الثعلب البني السريع يقفز فوق الكلب الكسول. + Русский: Быстрая коричневая лиса прыгает через ленивую собаку. + """ * 20, +} + +TOKENIZER_COLORS = { + "splintr": "#2ecc71", # Green + "tiktoken": "#3498db", # Blue + "huggingface": "#e74c3c", # Red + "tokendagger": "#9b59b6", # Purple +} + + +@dataclass +class BenchmarkResult: + name: str + text_type: str + bytes_per_second: float + tokens_per_second: float + num_tokens: int + num_bytes: int + latency_ms: float + latency_std_ms: float + + +def benchmark_encode( + name: str, + encode_fn, + text: str, + text_type: str, + warmup: int = 50, + iterations: int = 100, +) -> BenchmarkResult: + """Benchmark a single encode function.""" + num_bytes = len(text.encode("utf-8")) + + # Warmup - use more iterations to ensure thread pools are initialized + for _ in range(warmup): + encode_fn(text) + + # Force garbage collection before timing + gc.collect() + + # Benchmark + times = [] + num_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + tokens = encode_fn(text) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) # Convert to seconds + num_tokens = len(tokens) + + avg_time = statistics.mean(times) + std_time = statistics.stdev(times) if len(times) > 1 else 0 + bytes_per_second = num_bytes / avg_time + tokens_per_second = num_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=num_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + latency_std_ms=std_time * 1000, + ) + + +def load_tokenizers(): + """Load all available tokenizers.""" + tokenizers = {} + + # splintr + try: + import splintr + enc = splintr.Tokenizer.from_pretrained("cl100k_base") + tokenizers["splintr"] = enc.encode + print("Loaded: splintr") + except ImportError: + print("splintr not available") + + # tiktoken + try: + import tiktoken + enc = tiktoken.get_encoding("cl100k_base") + tokenizers["tiktoken"] = enc.encode + print("Loaded: tiktoken") + except ImportError: + print("tiktoken not available") + + # HuggingFace tokenizers + try: + from tokenizers import Tokenizer as HFTokenizer + hf_enc = HFTokenizer.from_pretrained("gpt2") + + def hf_encode(text): + return hf_enc.encode(text).ids + + tokenizers["huggingface"] = hf_encode + print("Loaded: huggingface") + except ImportError: + print("huggingface not available") + + # TokenDagger + try: + import tokendagger + import tiktoken + tik_enc = tiktoken.get_encoding("cl100k_base") + enc = tokendagger.Tokenizer( + name="cl100k_base", + pat_str=tik_enc._pat_str, + mergeable_ranks=tik_enc._mergeable_ranks, + special_tokens=tik_enc._special_tokens, + ) + tokenizers["tokendagger"] = enc.encode + print("Loaded: tokendagger") + except (ImportError, Exception) as e: + print(f"tokendagger not available: {e}") + + return tokenizers + + +def run_benchmarks(tokenizers: dict) -> list[BenchmarkResult]: + """Run benchmarks for all text types.""" + results = [] + + # Global warmup to initialize thread pools + print("\nWarming up all tokenizers...") + warmup_text = "This is a warmup text to initialize thread pools and caches." * 10 + for name, encode_fn in tokenizers.items(): + for _ in range(100): + encode_fn(warmup_text) + print("Warmup complete.") + + print("\n" + "=" * 70) + print("TEXT TYPE BENCHMARKS") + print("=" * 70) + + for text_type, text in SAMPLE_TEXTS.items(): + num_bytes = len(text.encode("utf-8")) + print(f"\n--- {text_type.upper()} ({num_bytes:,} bytes) ---") + print(f"{'Tokenizer':<15} {'MB/s':>10} {'Ktok/s':>10} {'Latency':>12} {'Std':>10}") + print("-" * 60) + + for name, encode_fn in tokenizers.items(): + result = benchmark_encode(name, encode_fn, text, text_type) + results.append(result) + print( + f"{name:<15} {result.bytes_per_second / 1e6:>10.2f} " + f"{result.tokens_per_second / 1e3:>10.2f} " + f"{result.latency_ms:>10.3f} ms " + f"{result.latency_std_ms:>8.3f} ms" + ) + + return results + + +def generate_chart(results: list[BenchmarkResult], output_path: str): + """Generate text type comparison chart.""" + + # Get unique tokenizers and text types + names = list(dict.fromkeys(r.name for r in results)) + text_types = list(dict.fromkeys(r.text_type for r in results)) + + # Create figure + fig, ax = plt.subplots(figsize=(12, 7)) + + x = np.arange(len(text_types)) + width = 0.8 / len(names) + + # Create bars for each tokenizer + for i, name in enumerate(names): + throughputs = [] + for text_type in text_types: + for r in results: + if r.name == name and r.text_type == text_type: + throughputs.append(r.bytes_per_second / 1e6) + break + + offset = i * width - width * len(names) / 2 + width / 2 + bars = ax.bar( + x + offset, + throughputs, + width, + label=name, + color=TOKENIZER_COLORS.get(name, "#95a5a6"), + ) + + # Add value labels on bars + for bar, val in zip(bars, throughputs): + height = bar.get_height() + ax.annotate( + f'{val:.1f}', + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha='center', + va='bottom', + fontsize=8, + ) + + # Add text size annotations below x-axis + text_sizes = [] + for text_type in text_types: + for r in results: + if r.text_type == text_type: + text_sizes.append(r.num_bytes) + break + + ax.set_xlabel("Text Type", fontsize=12) + ax.set_ylabel("Throughput (MB/s)", fontsize=12) + ax.set_title("Tokenizer Throughput by Text Type", fontsize=14, fontweight="bold") + ax.set_xticks(x) + + # Create x-tick labels with size info + xlabels = [f"{t.capitalize()}\n({text_sizes[i]:,} bytes)" for i, t in enumerate(text_types)] + ax.set_xticklabels(xlabels) + + ax.legend(loc="upper left") + ax.grid(axis="y", alpha=0.3) + + # Add a note about short text performance + ax.text( + 0.98, 0.02, + "Lower is worse for short texts due to fixed overhead", + transform=ax.transAxes, + fontsize=9, + ha='right', + va='bottom', + style='italic', + color='gray', + ) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"\nChart saved to: {output_path}") + plt.close() + + +def generate_latency_chart(results: list[BenchmarkResult], output_path: str): + """Generate latency comparison chart (good for seeing short text overhead).""" + + names = list(dict.fromkeys(r.name for r in results)) + text_types = list(dict.fromkeys(r.text_type for r in results)) + + fig, ax = plt.subplots(figsize=(12, 7)) + + x = np.arange(len(text_types)) + width = 0.8 / len(names) + + for i, name in enumerate(names): + latencies = [] + errors = [] + for text_type in text_types: + for r in results: + if r.name == name and r.text_type == text_type: + latencies.append(r.latency_ms) + errors.append(r.latency_std_ms) + break + + offset = i * width - width * len(names) / 2 + width / 2 + bars = ax.bar( + x + offset, + latencies, + width, + label=name, + color=TOKENIZER_COLORS.get(name, "#95a5a6"), + yerr=errors, + capsize=3, + ) + + # Add value labels + for bar, val in zip(bars, latencies): + height = bar.get_height() + ax.annotate( + f'{val:.2f}', + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha='center', + va='bottom', + fontsize=8, + ) + + ax.set_xlabel("Text Type", fontsize=12) + ax.set_ylabel("Latency (ms) - Lower is Better", fontsize=12) + ax.set_title("Tokenizer Latency by Text Type", fontsize=14, fontweight="bold") + ax.set_xticks(x) + + text_sizes = [] + for text_type in text_types: + for r in results: + if r.text_type == text_type: + text_sizes.append(r.num_bytes) + break + + xlabels = [f"{t.capitalize()}\n({text_sizes[i]:,} bytes)" for i, t in enumerate(text_types)] + ax.set_xticklabels(xlabels) + + ax.legend(loc="upper left") + ax.grid(axis="y", alpha=0.3) + ax.set_yscale("log") # Log scale to see small values + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Latency chart saved to: {output_path}") + plt.close() + + +def analyze_short_text_overhead(tokenizers: dict): + """Analyze why short texts are slow - measure fixed overhead.""" + + print("\n" + "=" * 70) + print("SHORT TEXT OVERHEAD ANALYSIS") + print("=" * 70) + + # First, do a global warmup for all tokenizers + print("\nWarming up all tokenizers...") + warmup_text = "This is a warmup text to initialize thread pools and caches." * 10 + for name, encode_fn in tokenizers.items(): + for _ in range(100): + encode_fn(warmup_text) + print("Warmup complete.\n") + + # Test with progressively shorter texts + test_texts = [ + ("1 char", "H"), + ("5 chars", "Hello"), + ("10 chars", "Hello worl"), + ("29 chars (short)", "Hello, world! This is a test."), + ("100 chars", "Hello, world! " * 7), + ("500 chars", "Hello, world! " * 35), + ] + + print(f"{'Text':<20} {'Size':>8} ", end="") + for name in tokenizers.keys(): + print(f"{name:>12}", end=" ") + print() + print("-" * (30 + 13 * len(tokenizers))) + + for label, text in test_texts: + num_bytes = len(text.encode("utf-8")) + print(f"{label:<20} {num_bytes:>6} B ", end="") + + for name, encode_fn in tokenizers.items(): + # Additional per-text warmup + for _ in range(20): + encode_fn(text) + + # Measure + gc.collect() + times = [] + for _ in range(100): + start = time.perf_counter_ns() + encode_fn(text) + end = time.perf_counter_ns() + times.append((end - start) / 1e6) # ms + + avg_ms = statistics.mean(times) + print(f"{avg_ms:>10.4f}ms", end=" ") + print() + + +def main(): + print("=" * 70) + print("TOKENIZER BENCHMARK: TEXT TYPES") + print("=" * 70) + + # Create output directory + output_dir = Path(__file__).parent / "results" + output_dir.mkdir(exist_ok=True) + + # Load tokenizers + tokenizers = load_tokenizers() + + if len(tokenizers) < 2: + print("\nNeed at least 2 tokenizers for comparison") + return + + # Run main benchmarks + results = run_benchmarks(tokenizers) + + # Generate charts + generate_chart(results, str(output_dir / "benchmark_single.png")) + generate_latency_chart(results, str(output_dir / "benchmark_single_latency.png")) + + # Analyze short text overhead + analyze_short_text_overhead(tokenizers) + + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/benchmark_splintr.py b/benchmarks/benchmark_splintr.py new file mode 100644 index 0000000..680874c --- /dev/null +++ b/benchmarks/benchmark_splintr.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +""" +Benchmark: Splintr Sequential vs Rayon (Parallel) Single Text Encoding +Finds the crossover point where Rayon parallelization becomes beneficial. + +Usage: + python benchmarks/benchmark_splintr.py +""" + +import gc +import statistics +import time +from dataclasses import dataclass +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +@dataclass +class BenchmarkResult: + size_bytes: int + sequential_ms: float + rayon_ms: float + sequential_throughput: float # MB/s + rayon_throughput: float # MB/s + speedup: float # rayon / sequential (>1 means rayon is faster) + + +def create_test_text(target_size: int) -> str: + """Create test text of approximately target size.""" + base = "The quick brown fox jumps over the lazy dog. " + repeat = max(1, target_size // len(base)) + text = base * repeat + return text[:target_size] if len(text) > target_size else text + + +def benchmark_size(size: int, warmup: int = 20, iterations: int = 50) -> BenchmarkResult: + """Benchmark both sequential and rayon encoding for a given size.""" + import splintr + + text = create_test_text(size) + actual_size = len(text.encode("utf-8")) + + enc = splintr.Tokenizer.from_pretrained("cl100k_base") + + # Warmup both + for _ in range(warmup): + enc.encode(text) + enc.encode_rayon(text) + + gc.collect() + + # Benchmark sequential (encode) + seq_times = [] + for _ in range(iterations): + start = time.perf_counter_ns() + enc.encode(text) + end = time.perf_counter_ns() + seq_times.append((end - start) / 1e6) # ms + + # Benchmark rayon (encode_rayon) + rayon_times = [] + for _ in range(iterations): + start = time.perf_counter_ns() + enc.encode_rayon(text) + end = time.perf_counter_ns() + rayon_times.append((end - start) / 1e6) # ms + + seq_avg = statistics.mean(seq_times) + rayon_avg = statistics.mean(rayon_times) + + seq_throughput = actual_size / seq_avg / 1000 # MB/s + rayon_throughput = actual_size / rayon_avg / 1000 # MB/s + + speedup = seq_avg / rayon_avg # >1 means rayon is faster + + return BenchmarkResult( + size_bytes=actual_size, + sequential_ms=seq_avg, + rayon_ms=rayon_avg, + sequential_throughput=seq_throughput, + rayon_throughput=rayon_throughput, + speedup=speedup, + ) + + +def check_rayon_available(): + """Check if encode_rayon is available.""" + try: + import splintr + enc = splintr.Tokenizer.from_pretrained("cl100k_base") + if not hasattr(enc, 'encode_rayon'): + print("ERROR: encode_rayon method not found!") + print("You need to add encode_rayon to the Tokenizer class.") + print("\nAdd this to src/core/tokenizer.rs:") + print(""" + /// Encode text using Rayon parallel processing. + /// Use this for very large texts (>100KB) where parallelization helps. + pub fn encode_rayon(&self, text: &str) -> Vec { + // ... parallel implementation + } +""") + return False + return True + except Exception as e: + print(f"Error: {e}") + return False + + +def run_benchmarks() -> list[BenchmarkResult]: + """Run benchmarks across various sizes.""" + results = [] + + # Test sizes from 100 bytes to 1MB + sizes = [ + 100, 200, 500, + 1000, 2000, 5000, + 10000, 20000, 50000, + 100000, 200000, 500000, + 1000000, + ] + + print("\n" + "=" * 80) + print("SEQUENTIAL vs RAYON BENCHMARK") + print("=" * 80) + print(f"\n{'Size':>12} {'Sequential':>12} {'Rayon':>12} {'Seq MB/s':>10} {'Rayon MB/s':>10} {'Speedup':>10}") + print("-" * 80) + + for size in sizes: + result = benchmark_size(size) + results.append(result) + + speedup_str = f"{result.speedup:.2f}x" + if result.speedup > 1.1: + speedup_str += " (rayon wins)" + elif result.speedup < 0.9: + speedup_str += " (seq wins)" + + print( + f"{result.size_bytes:>10} B " + f"{result.sequential_ms:>10.3f} ms " + f"{result.rayon_ms:>10.3f} ms " + f"{result.sequential_throughput:>10.1f} " + f"{result.rayon_throughput:>10.1f} " + f"{speedup_str:>14}" + ) + + return results + + +def find_crossover(results: list[BenchmarkResult]) -> int | None: + """Find the size where Rayon becomes consistently faster.""" + for i, r in enumerate(results): + # Check if rayon is faster for this and subsequent sizes + if r.speedup > 1.0: + # Verify it stays faster for larger sizes + remaining = results[i:] + if all(rr.speedup >= 0.95 for rr in remaining): # Allow 5% margin + return r.size_bytes + return None + + +def generate_chart(results: list[BenchmarkResult], output_path: str): + """Generate comparison chart.""" + sizes = [r.size_bytes for r in results] + seq_throughput = [r.sequential_throughput for r in results] + rayon_throughput = [r.rayon_throughput for r in results] + + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6)) + + # Chart 1: Throughput comparison + ax1.plot(sizes, seq_throughput, 'o-', label='Sequential', color='#2ecc71', linewidth=2, markersize=6) + ax1.plot(sizes, rayon_throughput, 's-', label='Rayon (parallel)', color='#e74c3c', linewidth=2, markersize=6) + + ax1.set_xscale('log') + ax1.set_xlabel('Text Size (bytes)', fontsize=12) + ax1.set_ylabel('Throughput (MB/s)', fontsize=12) + ax1.set_title('Sequential vs Rayon Throughput', fontsize=14, fontweight='bold') + ax1.legend() + ax1.grid(True, alpha=0.3) + + # Add crossover annotation + crossover = find_crossover(results) + if crossover: + ax1.axvline(x=crossover, color='gray', linestyle='--', alpha=0.7) + ax1.annotate( + f'Crossover\n~{crossover/1000:.0f}KB', + xy=(crossover, max(seq_throughput) * 0.8), + fontsize=10, + ha='center', + ) + + # Chart 2: Speedup ratio + speedups = [r.speedup for r in results] + + colors = ['#2ecc71' if s < 1 else '#e74c3c' for s in speedups] + ax2.bar(range(len(sizes)), speedups, color=colors) + ax2.axhline(y=1.0, color='black', linestyle='-', linewidth=1) + + ax2.set_xticks(range(len(sizes))) + ax2.set_xticklabels([f'{s/1000:.0f}K' if s >= 1000 else str(s) for s in sizes], rotation=45, ha='right') + ax2.set_xlabel('Text Size', fontsize=12) + ax2.set_ylabel('Speedup (Rayon / Sequential)', fontsize=12) + ax2.set_title('Rayon Speedup Ratio (>1 = Rayon faster)', fontsize=14, fontweight='bold') + ax2.grid(axis='y', alpha=0.3) + + # Add legend + from matplotlib.patches import Patch + legend_elements = [ + Patch(facecolor='#2ecc71', label='Sequential wins'), + Patch(facecolor='#e74c3c', label='Rayon wins'), + ] + ax2.legend(handles=legend_elements, loc='upper left') + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches='tight') + print(f"\nChart saved to: {output_path}") + plt.close() + + +def main(): + print("=" * 80) + print("SPLINTR: SEQUENTIAL vs RAYON SINGLE TEXT ENCODING") + print("=" * 80) + + if not check_rayon_available(): + return + + output_dir = Path(__file__).parent / "results" + output_dir.mkdir(exist_ok=True) + + # Warmup + print("\nWarming up...") + import splintr + enc = splintr.Tokenizer.from_pretrained("cl100k_base") + warmup_text = "warmup " * 1000 + for _ in range(50): + enc.encode(warmup_text) + enc.encode_rayon(warmup_text) + print("Warmup complete.") + + results = run_benchmarks() + + crossover = find_crossover(results) + print("\n" + "=" * 80) + if crossover: + print(f"CROSSOVER POINT: ~{crossover:,} bytes ({crossover/1024:.1f} KB)") + print(f"Recommendation: Use Rayon for texts > {crossover:,} bytes") + else: + print("No clear crossover found - Sequential is generally faster") + print("=" * 80) + + generate_chart(results, str(output_dir / "benchmark_splintr.png")) + + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/compare_tokenizers.py b/benchmarks/compare_tokenizers.py new file mode 100644 index 0000000..c814909 --- /dev/null +++ b/benchmarks/compare_tokenizers.py @@ -0,0 +1,509 @@ +#!/usr/bin/env python3 +""" +Benchmark comparison: splintr vs tiktoken vs HuggingFace Tokenizers vs TokenDagger + +Generates performance charts comparing encoding throughput across different tokenizers. + +Usage: + pip install tiktoken tokenizers matplotlib numpy + pip install tokendagger # optional + python benchmarks/compare_tokenizers.py +""" + +import gc +import json +import os +import statistics +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Callable + +# Sample texts for benchmarking +SAMPLE_TEXTS = { + "short": "Hello, world! This is a test.", + "medium": """The quick brown fox jumps over the lazy dog. + Machine learning models require tokenization to process text efficiently. + Tokenizers convert text into numerical representations that models can understand.""" * 10, + "long": """Artificial intelligence and machine learning have revolutionized + the way we process and understand natural language. Large language models (LLMs) + like GPT-4, Claude, and others rely heavily on efficient tokenization to handle + vast amounts of text data. The performance of tokenizers directly impacts the + overall throughput of these systems, making optimization crucial for production + deployments. BPE (Byte Pair Encoding) has become the de facto standard for + modern tokenizers due to its balance of vocabulary efficiency and handling of + out-of-vocabulary words.""" * 50, + "code": ''' +def fibonacci(n: int) -> int: + """Calculate the nth Fibonacci number.""" + if n <= 1: + return n + return fibonacci(n - 1) + fibonacci(n - 2) + +class TokenizerBenchmark: + def __init__(self, name: str): + self.name = name + self.results = [] + + def run(self, text: str, iterations: int = 100): + for _ in range(iterations): + tokens = self.encode(text) + self.results.append(len(tokens)) +''' * 20, + "multilingual": """ + English: The quick brown fox jumps over the lazy dog. + 中文: 快速的棕色狐狸跳过懒狗。 + 日本語: 素早い茶色の狐が怠惰な犬を飛び越える。 + 한국어: 빠른 갈색 여우가 게으른 개를 뛰어넘습니다. + العربية: الثعلب البني السريع يقفز فوق الكلب الكسول. + Русский: Быстрая коричневая лиса прыгает через ленивую собаку. + """ * 20, +} + + +@dataclass +class BenchmarkResult: + name: str + text_type: str + bytes_per_second: float + tokens_per_second: float + num_tokens: int + num_bytes: int + latency_ms: float + + +def benchmark_encode( + name: str, + encode_fn: Callable[[str], list], + text: str, + text_type: str, + warmup: int = 3, + iterations: int = 10, +) -> BenchmarkResult: + """Benchmark a single encode function.""" + num_bytes = len(text.encode("utf-8")) + + # Warmup + for _ in range(warmup): + encode_fn(text) + + # Force garbage collection before timing + gc.collect() + + # Benchmark + times = [] + num_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + tokens = encode_fn(text) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) # Convert to seconds + num_tokens = len(tokens) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = num_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=num_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def benchmark_batch_encode( + name: str, + encode_batch_fn: Callable[[list[str]], list], + texts: list[str], + text_type: str, + warmup: int = 2, + iterations: int = 5, +) -> BenchmarkResult: + """Benchmark batch encoding.""" + num_bytes = sum(len(t.encode("utf-8")) for t in texts) + + # Warmup + for _ in range(warmup): + encode_batch_fn(texts) + + gc.collect() + + times = [] + total_tokens = 0 + for _ in range(iterations): + start = time.perf_counter_ns() + results = encode_batch_fn(texts) + end = time.perf_counter_ns() + times.append((end - start) / 1e9) + total_tokens = sum(len(r) for r in results) + + avg_time = statistics.mean(times) + bytes_per_second = num_bytes / avg_time + tokens_per_second = total_tokens / avg_time + + return BenchmarkResult( + name=name, + text_type=text_type, + bytes_per_second=bytes_per_second, + tokens_per_second=tokens_per_second, + num_tokens=total_tokens, + num_bytes=num_bytes, + latency_ms=avg_time * 1000, + ) + + +def load_tokenizers(): + """Load all available tokenizers.""" + tokenizers = {} + + # splintr + try: + import splintr + + enc = splintr.Tokenizer.from_pretrained("cl100k_base") + tokenizers["splintr"] = { + "encode": enc.encode, + "encode_batch": enc.encode_batch, + "color": "#2ecc71", # Green + } + print("Loaded: splintr") + except ImportError: + print("splintr not available - run: maturin develop --release") + + # tiktoken + try: + import tiktoken + + tik_enc = tiktoken.get_encoding("cl100k_base") + + def tik_encode_batch(texts): + return tik_enc.encode_ordinary_batch(texts) + + tokenizers["tiktoken"] = { + "encode": tik_enc.encode, + "encode_batch": tik_encode_batch, + "color": "#3498db", # Blue + } + print("Loaded: tiktoken") + except ImportError: + print("tiktoken not available - run: pip install tiktoken") + + # HuggingFace tokenizers + try: + from tokenizers import Tokenizer as HFTokenizer + + # Use GPT-2 tokenizer (similar to cl100k but available) + hf_enc = HFTokenizer.from_pretrained("gpt2") + + def hf_encode(text): + return hf_enc.encode(text).ids + + def hf_encode_batch(texts): + return [e.ids for e in hf_enc.encode_batch(texts)] + + tokenizers["huggingface"] = { + "encode": hf_encode, + "encode_batch": hf_encode_batch, + "color": "#e74c3c", # Red + } + print("Loaded: huggingface tokenizers") + except ImportError: + print("HuggingFace tokenizers not available - run: pip install tokenizers") + + # TokenDagger (if available) + try: + import tokendagger + + # TokenDagger requires loading vocab from tiktoken + import tiktoken + tik_enc = tiktoken.get_encoding("cl100k_base") + enc = tokendagger.Tokenizer( + name="cl100k_base", + pat_str=tik_enc._pat_str, + mergeable_ranks=tik_enc._mergeable_ranks, + special_tokens=tik_enc._special_tokens, + ) + tokenizers["tokendagger"] = { + "encode": enc.encode, + "encode_batch": enc.encode_batch, + "color": "#9b59b6", # Purple + } + print("Loaded: tokendagger") + except (ImportError, Exception) as e: + print(f"tokendagger not available: {e}") + + return tokenizers + + +def run_benchmarks(tokenizers: dict, text_types: list[str] = None): + """Run all benchmarks.""" + if text_types is None: + text_types = list(SAMPLE_TEXTS.keys()) + + results = [] + + print("\n" + "=" * 60) + print("SINGLE TEXT ENCODING BENCHMARKS") + print("=" * 60) + + for text_type in text_types: + text = SAMPLE_TEXTS[text_type] + num_bytes = len(text.encode("utf-8")) + print(f"\n--- {text_type.upper()} ({num_bytes:,} bytes) ---") + + for name, tok in tokenizers.items(): + result = benchmark_encode(name, tok["encode"], text, text_type) + results.append(result) + print( + f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s " + f"{result.tokens_per_second / 1e3:8.2f} Ktok/s " + f"{result.latency_ms:8.3f} ms" + ) + + # Batch benchmarks + print("\n" + "=" * 60) + print("BATCH ENCODING BENCHMARKS (100 texts)") + print("=" * 60) + + for text_type in ["medium", "long"]: + texts = [SAMPLE_TEXTS[text_type]] * 100 + total_bytes = sum(len(t.encode("utf-8")) for t in texts) + print(f"\n--- {text_type.upper()} x100 ({total_bytes:,} bytes total) ---") + + for name, tok in tokenizers.items(): + result = benchmark_batch_encode( + f"{name}_batch", tok["encode_batch"], texts, f"{text_type}_batch" + ) + results.append(result) + print( + f"{name:15} {result.bytes_per_second / 1e6:8.2f} MB/s " + f"{result.tokens_per_second / 1e3:8.2f} Ktok/s " + f"{result.latency_ms:8.3f} ms" + ) + + return results + + +def generate_chart(results: list[BenchmarkResult], tokenizers: dict, output_path: str): + """Generate comparison bar chart.""" + try: + import matplotlib.pyplot as plt + import numpy as np + except ImportError: + print("matplotlib/numpy not available - run: pip install matplotlib numpy") + return + + # Filter for single-text benchmarks only + single_results = [r for r in results if "_batch" not in r.text_type] + + # Get unique tokenizers and text types + names = list(tokenizers.keys()) + text_types = list(dict.fromkeys(r.text_type for r in single_results)) + + # Create figure with two subplots + fig, axes = plt.subplots(1, 2, figsize=(14, 6)) + + # Chart 1: Throughput by text type + ax1 = axes[0] + x = np.arange(len(text_types)) + width = 0.8 / len(names) + + for i, name in enumerate(names): + throughputs = [] + for text_type in text_types: + for r in single_results: + if r.name == name and r.text_type == text_type: + throughputs.append(r.bytes_per_second / 1e6) + break + bars = ax1.bar( + x + i * width - width * len(names) / 2 + width / 2, + throughputs, + width, + label=name, + color=tokenizers[name]["color"], + ) + + ax1.set_xlabel("Text Type", fontsize=12) + ax1.set_ylabel("Throughput (MB/s)", fontsize=12) + ax1.set_title("Tokenizer Throughput Comparison", fontsize=14, fontweight="bold") + ax1.set_xticks(x) + ax1.set_xticklabels([t.capitalize() for t in text_types]) + ax1.legend() + ax1.grid(axis="y", alpha=0.3) + + # Chart 2: Batch encoding comparison + batch_results = [r for r in results if "_batch" in r.text_type] + if batch_results: + ax2 = axes[1] + batch_types = list(dict.fromkeys(r.text_type for r in batch_results)) + x2 = np.arange(len(batch_types)) + + for i, name in enumerate(names): + throughputs = [] + for text_type in batch_types: + for r in batch_results: + if r.name == f"{name}_batch" and r.text_type == text_type: + throughputs.append(r.bytes_per_second / 1e6) + break + else: + throughputs.append(0) + if any(t > 0 for t in throughputs): + ax2.bar( + x2 + i * width - width * len(names) / 2 + width / 2, + throughputs, + width, + label=name, + color=tokenizers[name]["color"], + ) + + ax2.set_xlabel("Batch Type", fontsize=12) + ax2.set_ylabel("Throughput (MB/s)", fontsize=12) + ax2.set_title( + "Batch Encoding (100 texts)", fontsize=14, fontweight="bold" + ) + ax2.set_xticks(x2) + ax2.set_xticklabels([t.replace("_batch", "").capitalize() for t in batch_types]) + ax2.legend() + ax2.grid(axis="y", alpha=0.3) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"\nChart saved to: {output_path}") + + # Also save as SVG for better quality + svg_path = output_path.replace(".png", ".svg") + plt.savefig(svg_path, format="svg", bbox_inches="tight") + print(f"SVG saved to: {svg_path}") + + plt.close() + + +def generate_speedup_chart(results: list[BenchmarkResult], tokenizers: dict, output_path: str): + """Generate speedup comparison chart (relative to tiktoken).""" + try: + import matplotlib.pyplot as plt + import numpy as np + except ImportError: + return + + if "tiktoken" not in tokenizers: + print("tiktoken not available for speedup comparison") + return + + # Filter for single-text benchmarks + single_results = [r for r in results if "_batch" not in r.text_type] + text_types = list(dict.fromkeys(r.text_type for r in single_results)) + names = [n for n in tokenizers.keys() if n != "tiktoken"] + + # Get tiktoken baseline + tiktoken_throughput = {} + for r in single_results: + if r.name == "tiktoken": + tiktoken_throughput[r.text_type] = r.bytes_per_second + + fig, ax = plt.subplots(figsize=(10, 6)) + + x = np.arange(len(text_types)) + width = 0.8 / len(names) + + for i, name in enumerate(names): + speedups = [] + for text_type in text_types: + for r in single_results: + if r.name == name and r.text_type == text_type: + if text_type in tiktoken_throughput: + speedup = r.bytes_per_second / tiktoken_throughput[text_type] + else: + speedup = 1.0 + speedups.append(speedup) + break + ax.bar( + x + i * width - width * len(names) / 2 + width / 2, + speedups, + width, + label=name, + color=tokenizers[name]["color"], + ) + + # Add baseline line at 1.0 + ax.axhline(y=1.0, color="gray", linestyle="--", linewidth=1, label="tiktoken (baseline)") + + ax.set_xlabel("Text Type", fontsize=12) + ax.set_ylabel("Speedup vs tiktoken", fontsize=12) + ax.set_title("Tokenizer Speedup Relative to tiktoken", fontsize=14, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels([t.capitalize() for t in text_types]) + ax.legend() + ax.grid(axis="y", alpha=0.3) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Speedup chart saved to: {output_path}") + plt.close() + + +def save_results_json(results: list[BenchmarkResult], output_path: str): + """Save benchmark results as JSON.""" + data = [ + { + "name": r.name, + "text_type": r.text_type, + "bytes_per_second": r.bytes_per_second, + "tokens_per_second": r.tokens_per_second, + "num_tokens": r.num_tokens, + "num_bytes": r.num_bytes, + "latency_ms": r.latency_ms, + } + for r in results + ] + with open(output_path, "w") as f: + json.dump(data, f, indent=2) + print(f"Results saved to: {output_path}") + + +def main(): + print("=" * 60) + print("TOKENIZER BENCHMARK COMPARISON") + print("splintr vs tiktoken vs HuggingFace vs TokenDagger") + print("=" * 60) + + # Create output directory + output_dir = Path(__file__).parent / "results" + output_dir.mkdir(exist_ok=True) + + # Load tokenizers + tokenizers = load_tokenizers() + + if len(tokenizers) < 2: + print("\nWarning: Less than 2 tokenizers available for comparison") + print("Install missing packages:") + print(" pip install tiktoken tokenizers matplotlib numpy") + print(" pip install tokendagger # optional") + if "splintr" not in tokenizers: + print(" maturin develop --release # for splintr") + + if not tokenizers: + print("No tokenizers available!") + return + + # Run benchmarks + results = run_benchmarks(tokenizers) + + # Generate outputs + print("\n" + "=" * 60) + print("GENERATING OUTPUTS") + print("=" * 60) + + generate_chart(results, tokenizers, str(output_dir / "benchmark_comparison.png")) + generate_speedup_chart(results, tokenizers, str(output_dir / "benchmark_speedup.png")) + save_results_json(results, str(output_dir / "benchmark_results.json")) + + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/images/benchmark_batch.png b/images/benchmark_batch.png new file mode 100644 index 0000000..eaba341 Binary files /dev/null and b/images/benchmark_batch.png differ diff --git a/images/benchmark_batch_speedup.png b/images/benchmark_batch_speedup.png new file mode 100644 index 0000000..e6a66d0 Binary files /dev/null and b/images/benchmark_batch_speedup.png differ diff --git a/images/benchmark_single.png b/images/benchmark_single.png new file mode 100644 index 0000000..e70d7d2 Binary files /dev/null and b/images/benchmark_single.png differ diff --git a/images/benchmark_single_latency.png b/images/benchmark_single_latency.png new file mode 100644 index 0000000..b70a403 Binary files /dev/null and b/images/benchmark_single_latency.png differ diff --git a/images/benchmark_speedup.png b/images/benchmark_speedup.png new file mode 100644 index 0000000..1dec6ab Binary files /dev/null and b/images/benchmark_speedup.png differ diff --git a/images/benchmark_splintr.png b/images/benchmark_splintr.png new file mode 100644 index 0000000..4e1090f Binary files /dev/null and b/images/benchmark_splintr.png differ diff --git a/pyproject.toml b/pyproject.toml index d52ff08..494538e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "splintr-rs" -version = "0.2.0" +version = "0.3.0" description = "Fast Rust BPE tokenizer with Python bindings" readme = "README.md" license = {text = "MIT"} diff --git a/src/core/tokenizer.rs b/src/core/tokenizer.rs index ad77d85..5cd022c 100644 --- a/src/core/tokenizer.rs +++ b/src/core/tokenizer.rs @@ -517,9 +517,40 @@ const DEFAULT_CACHE_SIZE: usize = 4096; /// High-performance tokenizer using PCRE2 with JIT and Rayon parallelism. /// -/// Key optimizations: +/// # Performance Characteristics +/// +/// This tokenizer is optimized for high throughput across different workloads: +/// +/// - **Single text encoding**: Uses sequential processing via [`encode`]. +/// Benchmarks show sequential is faster for texts up to ~1MB due to Rayon +/// thread pool overhead. Sequential achieves ~50 MB/s consistently. +/// +/// - **Batch encoding**: Uses Rayon parallelism via [`encode_batch`]. +/// Parallelizes across texts (not within a single text), achieving ~110 MB/s +/// on batch workloads - approximately 10-12x faster than tiktoken. +/// +/// - **Very large single texts (>1MB)**: Use [`encode_rayon`] for texts larger +/// than ~1MB where Rayon parallelization within the text becomes beneficial. +/// +/// # Design Decision: Sequential by Default +/// +/// The [`encode`] method uses sequential processing because Rayon parallel +/// overhead is significant for typical text sizes: +/// +/// | Text Size | Sequential | Rayon | Speedup | +/// |-----------|------------|-------|---------| +/// | 100 bytes | 42 MB/s | 3 MB/s | Sequential 12x faster | +/// | 10 KB | 50 MB/s | 26 MB/s | Sequential 2x faster | +/// | 100 KB | 54 MB/s | 41 MB/s | Sequential 1.3x faster | +/// | 1 MB | 44 MB/s | 47 MB/s | Rayon 1.07x faster | +/// +/// Rayon only becomes beneficial at ~1MB, which is rare in typical workloads. +/// For batch processing, use [`encode_batch`] which parallelizes across texts. +/// +/// # Key Optimizations +/// /// - PCRE2 with JIT compilation (2-4x faster than fancy-regex) -/// - Rayon parallelism for encoding multiple chunks +/// - Rayon parallelism for batch encoding (across texts, not within) /// - Linked-list BPE algorithm (avoids O(N²) on pathological inputs) /// - FxHashMap for fast lookups /// - Aho-Corasick for fast multi-pattern special token matching @@ -665,7 +696,23 @@ impl Tokenizer { /// Encode text to token IDs (ignores special tokens in input). /// - /// Uses Rayon to parallelize BPE encoding across regex-matched chunks. + /// Uses sequential processing, which is faster than parallel for texts up to ~1MB. + /// Achieves ~50 MB/s throughput, approximately 3x faster than tiktoken. + /// + /// # Why Sequential? + /// + /// Rayon parallel processing has significant thread pool overhead that only + /// pays off for very large texts (~1MB+). Benchmarks show: + /// - 100 bytes: Sequential is 12x faster than Rayon + /// - 10 KB: Sequential is 2x faster + /// - 100 KB: Sequential is 1.3x faster + /// - 1 MB: Rayon becomes ~7% faster + /// + /// # When to Use Other Methods + /// + /// - **Multiple texts**: Use [`encode_batch`] for parallel encoding across texts + /// - **Very large texts (>1MB)**: Use [`encode_rayon`] for parallel within-text encoding + /// - **Special tokens**: Use [`encode_with_special`] to recognize special tokens pub fn encode(&self, text: &str) -> Vec { let text_bytes = text.as_bytes(); @@ -681,7 +728,56 @@ impl Tokenizer { return vec![]; } - // Parallel BPE encoding using Rayon + // Sequential encoding - Rayon overhead not worth it for texts < 1MB + // See struct-level docs for benchmark data + let results: Vec> = chunks + .iter() + .map(|&(start, end)| { + let slice = &text_bytes[start..end]; + self.encode_chunk(slice) + }) + .collect(); + + // Flatten results + results.into_iter().flatten().collect() + } + + /// Encode text to token IDs using Rayon parallel processing. + /// + /// Parallelizes BPE encoding of individual regex-matched chunks using Rayon. + /// Only beneficial for very large texts (>1MB) where parallelization overhead + /// is amortized across many chunks. + /// + /// # Performance + /// + /// | Text Size | Sequential | Rayon | Winner | + /// |-----------|------------|-------|--------| + /// | < 500 KB | ~50 MB/s | ~40 MB/s | Sequential | + /// | ~1 MB | ~44 MB/s | ~47 MB/s | Rayon (1.07x) | + /// + /// # When to Use + /// + /// - Single texts larger than ~1MB (e.g., entire books, large documents) + /// - When processing time is more critical than thread pool overhead + /// + /// For most use cases, prefer [`encode`] (sequential) or [`encode_batch`] + /// (parallel across multiple texts). + pub fn encode_rayon(&self, text: &str) -> Vec { + let text_bytes = text.as_bytes(); + + // Collect regex matches (chunks to encode) + let chunks: Vec<(usize, usize)> = self + .regex + .find_iter(text_bytes) + .filter_map(|m| m.ok()) + .map(|m| (m.start(), m.end())) + .collect(); + + if chunks.is_empty() { + return vec![]; + } + + // Parallel encoding using Rayon - each chunk encoded in parallel let results: Vec> = chunks .par_iter() .map(|&(start, end)| { @@ -767,12 +863,31 @@ impl Tokenizer { /// Batch encode multiple texts in parallel. /// - /// Uses Rayon to parallelize across texts AND within each text's BPE encoding. + /// Uses Rayon to parallelize **across texts** (not within each text). + /// This is the most efficient approach for batch workloads because: + /// + /// 1. Each text is encoded sequentially (optimal for texts < 1MB) + /// 2. Multiple texts are processed in parallel across CPU cores + /// 3. No thread coordination overhead within individual texts + /// + /// # Performance + /// + /// Achieves ~110 MB/s throughput on batch workloads, approximately + /// 10-12x faster than tiktoken's `encode_ordinary_batch`. + /// + /// # Example + /// + /// ```ignore + /// let texts = vec!["Hello".to_string(), "World".to_string()]; + /// let token_ids = tokenizer.encode_batch(&texts); + /// ``` pub fn encode_batch(&self, texts: &[String]) -> Vec> { texts.par_iter().map(|text| self.encode(text)).collect() } /// Batch encode multiple texts with special token handling. + /// + /// Like [`encode_batch`], but recognizes special tokens in the input. pub fn encode_batch_with_special(&self, texts: &[String]) -> Vec> { texts .par_iter() diff --git a/src/python/bindings.rs b/src/python/bindings.rs index 4f22886..78a75a5 100644 --- a/src/python/bindings.rs +++ b/src/python/bindings.rs @@ -478,6 +478,7 @@ impl PyTokenizer { /// Encode text to token IDs. /// /// Special tokens in the input are treated as regular text. + /// This method uses sequential encoding which is optimal for most use cases. /// /// Args: /// text: Input text to encode @@ -488,6 +489,25 @@ impl PyTokenizer { self.inner.encode(text) } + /// Encode text to token IDs using Rayon parallel processing. + /// + /// This method parallelizes the BPE encoding of individual chunks using Rayon. + /// It has higher overhead than `encode()` due to thread pool coordination, + /// but can be faster for very large texts (typically >1MB) where the + /// parallelization benefit outweighs the overhead. + /// + /// For most use cases, prefer `encode()` (sequential) or `encode_batch()` + /// (parallel across multiple texts). + /// + /// Args: + /// text: Input text to encode + /// + /// Returns: + /// List of token IDs + fn encode_rayon(&self, text: &str) -> Vec { + self.inner.encode_rayon(text) + } + /// Encode text with special token handling. /// /// Special tokens in the input are encoded directly without BPE.