5 changes: 3 additions & 2 deletions Cargo.toml
@@ -16,6 +16,7 @@ crate-type = ["cdylib", "rlib"]
 
 [features]
 default = []
+python = ["dep:pyo3"]
 pcre2 = ["dep:pcre2"]
 
 [dependencies]
@@ -28,13 +29,13 @@ rustc-hash = "2.0"
 # Error handling
 thiserror = "2.0"
 # Python bindings
-pyo3 = { version = "0.23", features = ["extension-module"] }
+pyo3 = { version = "0.27", features = ["extension-module"], optional = true }
 # Base64 decoding for tiktoken vocab files
 base64 = "0.22"
 # Aho-Corasick for fast multi-pattern special token matching
 aho-corasick = "1.1"
 # LRU cache for frequent token sequences
-lru = "0.12"
+lru = "0.16"
 # regexr regex engine (default backend)
 regexr = { version = "0.1.0-beta.4", features = ["jit", "simd"] }
 
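What the new `python = ["dep:pyo3"]` entry buys: pyo3 is now compiled only when the `python` feature is requested, so a plain `cargo build` of the library leaves it out of the dependency graph entirely. Below is a minimal sketch of that compile-time behavior, assuming a crate that declares the same feature; the function names are illustrative, not splintr code.

```rust
// Compiled only when building with `--features python`; this is the same kind of
// gate the PR puts in front of splintr's `mod python` further down.
#[cfg(feature = "python")]
fn binding_backend() -> &'static str {
    "pyo3 bindings compiled in"
}

// Fallback for the default, pure-Rust build (`default = []`).
#[cfg(not(feature = "python"))]
fn binding_backend() -> &'static str {
    "pure Rust build, no pyo3"
}

fn main() {
    println!("{}", binding_backend());
}
```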
8 changes: 3 additions & 5 deletions pyproject.toml
@@ -7,12 +7,10 @@ name = "splintr-rs"
 version = "0.6.0"
 description = "Fast Rust BPE tokenizer with Python bindings"
 readme = "README.md"
-license = {text = "MIT"}
+license = { text = "MIT" }
 requires-python = ">=3.8"
 keywords = ["tokenizer", "bpe", "tiktoken", "gpt", "llm"]
-authors = [
-    {name = "Farhan"}
-]
+authors = [{ name = "Farhan" }]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
@@ -38,4 +36,4 @@ Documentation = "https://github.com/farhan/splintr#readme"
 python-source = "python"
 module-name = "splintr._core"
 bindings = "pyo3"
-features = ["pyo3/extension-module"]
+features = ["python"]
30 changes: 1 addition & 29 deletions src/lib.rs
@@ -1,36 +1,8 @@
 pub mod core;
+#[cfg(feature = "python")]
 mod python;
 
-use pyo3::prelude::*;
-
 pub use core::{
     ByteLevelStreamingDecoder, StreamingDecoder, Tokenizer, TokenizerError, CL100K_BASE_PATTERN,
     LLAMA3_PATTERN, O200K_BASE_PATTERN,
 };
-
-/// Splintr - Fast Rust BPE tokenizer with Python bindings
-///
-/// A high-performance tokenizer featuring:
-/// - Regexr with JIT and SIMD (default, pure Rust)
-/// - Optional PCRE2 with JIT (requires `pcre2` feature)
-/// - Rayon parallelism for multi-core encoding
-/// - Linked-list BPE algorithm (avoids O(N²) on pathological inputs)
-/// - FxHashMap for fast lookups
-/// - Aho-Corasick for fast special token matching
-/// - LRU cache for frequently encoded chunks
-/// - UTF-8 streaming decoder for LLM output
-/// - Agent tokens for chat/reasoning/tool-use applications
-#[pymodule]
-fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_class::<python::PyTokenizer>()?;
-    m.add_class::<python::PyStreamingDecoder>()?;
-    m.add_class::<python::PyByteLevelStreamingDecoder>()?;
-    m.add_class::<python::PyCL100KAgentTokens>()?;
-    m.add_class::<python::PyO200KAgentTokens>()?;
-    m.add_class::<python::PyLlama3AgentTokens>()?;
-    m.add_class::<python::PyDeepSeekV3AgentTokens>()?;
-    m.add("CL100K_BASE_PATTERN", CL100K_BASE_PATTERN)?;
-    m.add("O200K_BASE_PATTERN", O200K_BASE_PATTERN)?;
-    m.add("LLAMA3_PATTERN", LLAMA3_PATTERN)?;
-    Ok(())
-}
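With the `#[pymodule]` gone from lib.rs, a default build no longer touches pyo3 at all, while the Rust surface stays the same: the `core` re-exports above remain available. A hedged sketch of a downstream pure-Rust consumer follows; the crate/dependency name and the assumption that the `*_PATTERN` constants are string slices are mine, and no `Tokenizer` constructor appears because this diff does not show one.

```rust
// Assumed consumer Cargo.toml entry (crate name taken to be `splintr`):
//   splintr = "0.6"
// With `default = []`, this pulls in no pyo3 unless `features = ["python"]` is added.
use splintr::{Tokenizer, TokenizerError, CL100K_BASE_PATTERN};

// Holding a `&Tokenizer` works in a no-Python build, since the type is
// re-exported from `splintr::core` unconditionally.
#[allow(dead_code)]
fn takes_tokenizer(_tok: &Tokenizer) -> Result<(), TokenizerError> {
    Ok(())
}

fn main() {
    // Assumes the pattern constant is a string slice; the diff only shows its name.
    println!("cl100k split pattern: {}", CL100K_BASE_PATTERN);
}
```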
30 changes: 30 additions & 0 deletions src/python/mod.rs
@@ -1,6 +1,36 @@
 mod bindings;
 
+use crate::core::{CL100K_BASE_PATTERN, LLAMA3_PATTERN, O200K_BASE_PATTERN};
 pub use bindings::{
     PyByteLevelStreamingDecoder, PyCL100KAgentTokens, PyDeepSeekV3AgentTokens, PyLlama3AgentTokens,
     PyO200KAgentTokens, PyStreamingDecoder, PyTokenizer,
 };
+
+use pyo3::prelude::*;
+
+/// Splintr - Fast Rust BPE tokenizer with Python bindings
+///
+/// A high-performance tokenizer featuring:
+/// - Regexr with JIT and SIMD (default, pure Rust)
+/// - Optional PCRE2 with JIT (requires `pcre2` feature)
+/// - Rayon parallelism for multi-core encoding
+/// - Linked-list BPE algorithm (avoids O(N²) on pathological inputs)
+/// - FxHashMap for fast lookups
+/// - Aho-Corasick for fast special token matching
+/// - LRU cache for frequently encoded chunks
+/// - UTF-8 streaming decoder for LLM output
+/// - Agent tokens for chat/reasoning/tool-use applications
+#[pymodule]
+fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<PyTokenizer>()?;
+    m.add_class::<PyStreamingDecoder>()?;
+    m.add_class::<PyByteLevelStreamingDecoder>()?;
+    m.add_class::<PyCL100KAgentTokens>()?;
+    m.add_class::<PyO200KAgentTokens>()?;
+    m.add_class::<PyLlama3AgentTokens>()?;
+    m.add_class::<PyDeepSeekV3AgentTokens>()?;
+    m.add("CL100K_BASE_PATTERN", CL100K_BASE_PATTERN)?;
+    m.add("O200K_BASE_PATTERN", O200K_BASE_PATTERN)?;
+    m.add("LLAMA3_PATTERN", LLAMA3_PATTERN)?;
+    Ok(())
+}
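The relocated module initializer registers the `Py*` wrapper classes from `bindings.rs`, which this diff does not show. For orientation, here is a generic sketch of the kind of `#[pyclass]` an `add_class` call expects, written against the pyo3 Bound API; the struct, its Python-visible name, and its field are hypothetical, not splintr's actual bindings.

```rust
use pyo3::prelude::*;

// Hypothetical wrapper in the style of PyTokenizer; not the real bindings.rs.
#[pyclass(name = "Tokenizer")] // Python-side name is an assumption
struct PyTokenizerSketch {
    vocab_size: usize,
}

#[pymethods]
impl PyTokenizerSketch {
    #[new]
    fn new(vocab_size: usize) -> Self {
        Self { vocab_size }
    }

    /// Lets Python call len(...) on the wrapper.
    fn __len__(&self) -> usize {
        self.vocab_size
    }
}

// Registration in the module initializer would then mirror the calls above:
//     m.add_class::<PyTokenizerSketch>()?;
```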