From 298c2b6394bb7a53a73320fd584d33eebbbe8fe1 Mon Sep 17 00:00:00 2001
From: SamBroomy <36888606+SamBroomy@users.noreply.github.com>
Date: Mon, 8 Dec 2025 00:33:44 +0000
Subject: [PATCH] feat: python feature flag

---
 Cargo.toml        |  5 +++--
 pyproject.toml    |  8 +++-----
 src/lib.rs        | 30 +-----------------------------
 src/python/mod.rs | 30 ++++++++++++++++++++++++++++++
 4 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index adf14f1..1111093 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@ crate-type = ["cdylib", "rlib"]
 
 [features]
 default = []
+python = ["dep:pyo3"]
 pcre2 = ["dep:pcre2"]
 
 [dependencies]
@@ -28,13 +29,13 @@ rustc-hash = "2.0"
 # Error handling
 thiserror = "2.0"
 # Python bindings
-pyo3 = { version = "0.23", features = ["extension-module"] }
+pyo3 = { version = "0.27", features = ["extension-module"], optional = true }
 # Base64 decoding for tiktoken vocab files
 base64 = "0.22"
 # Aho-Corasick for fast multi-pattern special token matching
 aho-corasick = "1.1"
 # LRU cache for frequent token sequences
-lru = "0.12"
+lru = "0.16"
 # regexr regex engine (default backend)
 regexr = { version = "0.1.0-beta.4", features = ["jit", "simd"] }
 
diff --git a/pyproject.toml b/pyproject.toml
index 774903d..17361f6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,12 +7,10 @@ name = "splintr-rs"
 version = "0.6.0"
 description = "Fast Rust BPE tokenizer with Python bindings"
 readme = "README.md"
-license = {text = "MIT"}
+license = { text = "MIT" }
 requires-python = ">=3.8"
 keywords = ["tokenizer", "bpe", "tiktoken", "gpt", "llm"]
-authors = [
-    {name = "Farhan"}
-]
+authors = [{ name = "Farhan" }]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
@@ -38,4 +36,4 @@ Documentation = "https://github.com/farhan/splintr#readme"
 python-source = "python"
 module-name = "splintr._core"
 bindings = "pyo3"
-features = ["pyo3/extension-module"]
+features = ["python"]
diff --git a/src/lib.rs b/src/lib.rs
index 7dbc18a..48aebe7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,36 +1,8 @@
 pub mod core;
+#[cfg(feature = "python")]
 mod python;
 
-use pyo3::prelude::*;
-
 pub use core::{
     ByteLevelStreamingDecoder, StreamingDecoder, Tokenizer, TokenizerError, CL100K_BASE_PATTERN,
     LLAMA3_PATTERN, O200K_BASE_PATTERN,
 };
-
-/// Splintr - Fast Rust BPE tokenizer with Python bindings
-///
-/// A high-performance tokenizer featuring:
-/// - Regexr with JIT and SIMD (default, pure Rust)
-/// - Optional PCRE2 with JIT (requires `pcre2` feature)
-/// - Rayon parallelism for multi-core encoding
-/// - Linked-list BPE algorithm (avoids O(N²) on pathological inputs)
-/// - FxHashMap for fast lookups
-/// - Aho-Corasick for fast special token matching
-/// - LRU cache for frequently encoded chunks
-/// - UTF-8 streaming decoder for LLM output
-/// - Agent tokens for chat/reasoning/tool-use applications
-#[pymodule]
-fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_class::<PyTokenizer>()?;
-    m.add_class::<PyStreamingDecoder>()?;
-    m.add_class::<PyByteLevelStreamingDecoder>()?;
-    m.add_class::<PyCL100KAgentTokens>()?;
-    m.add_class::<PyO200KAgentTokens>()?;
-    m.add_class::<PyLlama3AgentTokens>()?;
-    m.add_class::<PyDeepSeekV3AgentTokens>()?;
-    m.add("CL100K_BASE_PATTERN", CL100K_BASE_PATTERN)?;
-    m.add("O200K_BASE_PATTERN", O200K_BASE_PATTERN)?;
-    m.add("LLAMA3_PATTERN", LLAMA3_PATTERN)?;
-    Ok(())
-}
diff --git a/src/python/mod.rs b/src/python/mod.rs
index a28d4f1..1f294e7 100644
--- a/src/python/mod.rs
+++ b/src/python/mod.rs
@@ -1,6 +1,36 @@
 mod bindings;
+use crate::core::{CL100K_BASE_PATTERN, LLAMA3_PATTERN, O200K_BASE_PATTERN};
 
 pub use bindings::{
     PyByteLevelStreamingDecoder, PyCL100KAgentTokens, PyDeepSeekV3AgentTokens, PyLlama3AgentTokens,
     PyO200KAgentTokens, PyStreamingDecoder, PyTokenizer,
 };
+
+use pyo3::prelude::*;
+
+/// Splintr - Fast Rust BPE tokenizer with Python bindings
+///
+/// A high-performance tokenizer featuring:
+/// - Regexr with JIT and SIMD (default, pure Rust)
+/// - Optional PCRE2 with JIT (requires `pcre2` feature)
+/// - Rayon parallelism for multi-core encoding
+/// - Linked-list BPE algorithm (avoids O(N²) on pathological inputs)
+/// - FxHashMap for fast lookups
+/// - Aho-Corasick for fast special token matching
+/// - LRU cache for frequently encoded chunks
+/// - UTF-8 streaming decoder for LLM output
+/// - Agent tokens for chat/reasoning/tool-use applications
+#[pymodule]
+fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<PyTokenizer>()?;
+    m.add_class::<PyStreamingDecoder>()?;
+    m.add_class::<PyByteLevelStreamingDecoder>()?;
+    m.add_class::<PyCL100KAgentTokens>()?;
+    m.add_class::<PyO200KAgentTokens>()?;
+    m.add_class::<PyLlama3AgentTokens>()?;
+    m.add_class::<PyDeepSeekV3AgentTokens>()?;
+    m.add("CL100K_BASE_PATTERN", CL100K_BASE_PATTERN)?;
+    m.add("O200K_BASE_PATTERN", O200K_BASE_PATTERN)?;
+    m.add("LLAMA3_PATTERN", LLAMA3_PATTERN)?;
+    Ok(())
+}
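
Note for reviewers unfamiliar with optional PyO3 bindings: below is a minimal, self-contained sketch of the gating pattern this patch applies. The crate layout and the `encode` function are hypothetical stand-ins, not splintr's real API; only the `python = ["dep:pyo3"]` feature, the `optional = true` dependency, the `#[cfg(feature = "python")]` gate, and the `_core` module shape come from the patch above.

// src/lib.rs of a hypothetical crate using the same pattern as this patch.
// Cargo.toml (mirroring the diff) declares:
//   [features]
//   python = ["dep:pyo3"]
//   [dependencies]
//   pyo3 = { version = "0.27", features = ["extension-module"], optional = true }

// Always compiled: the pure-Rust core, usable through the rlib with no pyo3.
pub mod core {
    /// Placeholder logic standing in for the real BPE encoder.
    pub fn encode(input: &str) -> Vec<u32> {
        input.bytes().map(u32::from).collect()
    }
}

// Compiled only with `--features python`; a plain `cargo build` never
// compiles pyo3, so Rust-only consumers get a lighter dependency tree.
#[cfg(feature = "python")]
mod python {
    use pyo3::prelude::*;

    /// Thin wrapper exposing the core function to Python.
    #[pyfunction]
    fn encode(input: &str) -> Vec<u32> {
        crate::core::encode(input)
    }

    /// Module init; the name `_core` must match `module-name` in pyproject.toml.
    #[pymodule]
    fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
        m.add_function(wrap_pyfunction!(encode, m)?)?;
        Ok(())
    }
}

Because [tool.maturin] now sets `features = ["python"]`, `maturin build` and `maturin develop` enable the flag automatically, while `cargo build` and `cargo test` stay pyo3-free by default.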