From 298c2b6394bb7a53a73320fd584d33eebbbe8fe1 Mon Sep 17 00:00:00 2001
From: SamBroomy <36888606+SamBroomy@users.noreply.github.com>
Date: Mon, 8 Dec 2025 00:33:44 +0000
Subject: [PATCH] feat: python feature flag

---
 Cargo.toml        |  5 +++--
 pyproject.toml    |  8 +++-----
 src/lib.rs        | 30 +-----------------------------
 src/python/mod.rs | 30 ++++++++++++++++++++++++++++++
 4 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index adf14f1..1111093 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@ crate-type = ["cdylib", "rlib"]
 
 [features]
 default = []
+python = ["dep:pyo3"]
 pcre2 = ["dep:pcre2"]
 
 [dependencies]
@@ -28,13 +29,13 @@ rustc-hash = "2.0"
 # Error handling
 thiserror = "2.0"
 # Python bindings
-pyo3 = { version = "0.23", features = ["extension-module"] }
+pyo3 = { version = "0.27", features = ["extension-module"], optional = true }
 # Base64 decoding for tiktoken vocab files
 base64 = "0.22"
 # Aho-Corasick for fast multi-pattern special token matching
 aho-corasick = "1.1"
 # LRU cache for frequent token sequences
-lru = "0.12"
+lru = "0.16"
 # regexr regex engine (default backend)
 regexr = { version = "0.1.0-beta.4", features = ["jit", "simd"] }
 
diff --git a/pyproject.toml b/pyproject.toml
index 774903d..17361f6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,12 +7,10 @@ name = "splintr-rs"
 version = "0.6.0"
 description = "Fast Rust BPE tokenizer with Python bindings"
 readme = "README.md"
-license = {text = "MIT"}
+license = { text = "MIT" }
 requires-python = ">=3.8"
 keywords = ["tokenizer", "bpe", "tiktoken", "gpt", "llm"]
-authors = [
-    {name = "Farhan"}
-]
+authors = [{ name = "Farhan" }]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
@@ -38,4 +36,4 @@ Documentation = "https://github.com/farhan/splintr#readme"
 python-source = "python"
 module-name = "splintr._core"
 bindings = "pyo3"
-features = ["pyo3/extension-module"]
+features = ["python"]
diff --git a/src/lib.rs b/src/lib.rs
index 7dbc18a..48aebe7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,36 +1,8 @@
 pub mod core;
+#[cfg(feature = "python")]
 mod python;
 
-use pyo3::prelude::*;
-
 pub use core::{
     ByteLevelStreamingDecoder, StreamingDecoder, Tokenizer, TokenizerError, CL100K_BASE_PATTERN,
     LLAMA3_PATTERN, O200K_BASE_PATTERN,
 };
-
-/// Splintr - Fast Rust BPE tokenizer with Python bindings
-///
-/// A high-performance tokenizer featuring:
-/// - Regexr with JIT and SIMD (default, pure Rust)
-/// - Optional PCRE2 with JIT (requires `pcre2` feature)
-/// - Rayon parallelism for multi-core encoding
-/// - Linked-list BPE algorithm (avoids O(N²) on pathological inputs)
-/// - FxHashMap for fast lookups
-/// - Aho-Corasick for fast special token matching
-/// - LRU cache for frequently encoded chunks
-/// - UTF-8 streaming decoder for LLM output
-/// - Agent tokens for chat/reasoning/tool-use applications
-#[pymodule]
-fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_class::<PyTokenizer>()?;
-    m.add_class::<PyStreamingDecoder>()?;
-    m.add_class::<PyByteLevelStreamingDecoder>()?;
-    m.add_class::<PyCL100KAgentTokens>()?;
-    m.add_class::<PyO200KAgentTokens>()?;
-    m.add_class::<PyLlama3AgentTokens>()?;
-    m.add_class::<PyDeepSeekV3AgentTokens>()?;
-    m.add("CL100K_BASE_PATTERN", CL100K_BASE_PATTERN)?;
-    m.add("O200K_BASE_PATTERN", O200K_BASE_PATTERN)?;
-    m.add("LLAMA3_PATTERN", LLAMA3_PATTERN)?;
-    Ok(())
-}
diff --git a/src/python/mod.rs b/src/python/mod.rs
index a28d4f1..1f294e7 100644
--- a/src/python/mod.rs
+++ b/src/python/mod.rs
@@ -1,6 +1,36 @@
 mod bindings;
+use crate::core::{CL100K_BASE_PATTERN, LLAMA3_PATTERN, O200K_BASE_PATTERN};
 
 pub use bindings::{
     PyByteLevelStreamingDecoder, PyCL100KAgentTokens, PyDeepSeekV3AgentTokens, PyLlama3AgentTokens,
     PyO200KAgentTokens, PyStreamingDecoder, PyTokenizer,
 };
+
+use pyo3::prelude::*;
+
+/// Splintr - Fast Rust BPE tokenizer with Python bindings
+///
+/// A high-performance tokenizer featuring:
+/// - Regexr with JIT and SIMD (default, pure Rust)
+/// - Optional PCRE2 with JIT (requires `pcre2` feature)
+/// - Rayon parallelism for multi-core encoding
+/// - Linked-list BPE algorithm (avoids O(N²) on pathological inputs)
+/// - FxHashMap for fast lookups
+/// - Aho-Corasick for fast special token matching
+/// - LRU cache for frequently encoded chunks
+/// - UTF-8 streaming decoder for LLM output
+/// - Agent tokens for chat/reasoning/tool-use applications
+#[pymodule]
+fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<PyTokenizer>()?;
+    m.add_class::<PyStreamingDecoder>()?;
+    m.add_class::<PyByteLevelStreamingDecoder>()?;
+    m.add_class::<PyCL100KAgentTokens>()?;
+    m.add_class::<PyO200KAgentTokens>()?;
+    m.add_class::<PyLlama3AgentTokens>()?;
+    m.add_class::<PyDeepSeekV3AgentTokens>()?;
+    m.add("CL100K_BASE_PATTERN", CL100K_BASE_PATTERN)?;
+    m.add("O200K_BASE_PATTERN", O200K_BASE_PATTERN)?;
+    m.add("LLAMA3_PATTERN", LLAMA3_PATTERN)?;
+    Ok(())
+}
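
Note for reviewers unfamiliar with optional PyO3 bindings: below is a minimal, self-contained sketch of the gating pattern this patch applies. The crate layout and the `encode` function are hypothetical stand-ins, not splintr's real API; only the `python = ["dep:pyo3"]` feature, the `optional = true` dependency, the `#[cfg(feature = "python")]` gate, and the `_core` module shape come from the patch above.

// src/lib.rs of a hypothetical crate using the same pattern as this patch.
// Cargo.toml (mirroring the diff) declares:
//   [features]
//   python = ["dep:pyo3"]
//   [dependencies]
//   pyo3 = { version = "0.27", features = ["extension-module"], optional = true }

// Always compiled: the pure-Rust core, usable through the rlib with no pyo3.
pub mod core {
    /// Placeholder logic standing in for the real BPE encoder.
    pub fn encode(input: &str) -> Vec<u32> {
        input.bytes().map(u32::from).collect()
    }
}

// Compiled only with `--features python`; a plain `cargo build` never
// compiles pyo3, so Rust-only consumers get a lighter dependency tree.
#[cfg(feature = "python")]
mod python {
    use pyo3::prelude::*;

    /// Thin wrapper exposing the core function to Python.
    #[pyfunction]
    fn encode(input: &str) -> Vec<u32> {
        crate::core::encode(input)
    }

    /// Module init; the name `_core` must match `module-name` in pyproject.toml.
    #[pymodule]
    fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
        m.add_function(wrap_pyfunction!(encode, m)?)?;
        Ok(())
    }
}

Because [tool.maturin] now sets `features = ["python"]`, `maturin build` and `maturin develop` enable the flag automatically, while `cargo build` and `cargo test` stay pyo3-free by default.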