From e5f30399248d0bbacd24b1f4c1fbbeca7e22ac60 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Sun, 22 Mar 2026 21:45:02 -0700 Subject: [PATCH 1/2] make tiktoken token counting optional via enable_token_counting override By default, use cheap len/4 estimate for input token counting (metrics and ratelimit). When enable_token_counting is set to true in overrides, use tiktoken BPE for exact counts. This eliminates ~80ms of per-request latency from tiktoken in the WASM filter while keeping metrics and ratelimit functional. Made-with: Cursor --- config/plano_config_schema.yaml | 3 +++ crates/common/src/configuration.rs | 1 + crates/llm_gateway/src/stream_context.rs | 23 +++++++++++++++-------- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index 5190fecf7..f7817a091 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -285,6 +285,9 @@ properties: agent_orchestration_model: type: string description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers." + enable_token_counting: + type: boolean + description: "Enable tiktoken-based input token counting for metrics and rate limiting. Default is false." 
system_prompt: type: string prompt_targets: diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index df1790594..2d7a7f22f 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -131,6 +131,7 @@ pub struct Overrides { pub use_agent_orchestrator: Option<bool>, pub llm_routing_model: Option<String>, pub agent_orchestration_model: Option<String>, + pub enable_token_counting: Option<bool>, } #[derive(Debug, Clone, Serialize, Deserialize, Default)] diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index f62631fab..20b8d3e60 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -48,7 +48,7 @@ pub struct StreamContext { ttft_time: Option, traceparent: Option, request_body_sent_time: Option, - _overrides: Rc<Option<Overrides>>, + overrides: Rc<Option<Overrides>>, user_message: Option, upstream_status_code: Option, binary_frame_decoder: Option>, @@ -66,7 +66,7 @@ impl StreamContext { ) -> Self { StreamContext { metrics, - _overrides: overrides, + overrides, ratelimit_selector: None, streaming_response: false, response_tokens: 0, @@ -269,22 +269,29 @@ impl StreamContext { model: &str, json_string: &str, ) -> Result<(), ratelimit::Error> { - // Tokenize and record token count. - let token_count = tokenizer::token_count(model, json_string).unwrap_or(0); + let use_tiktoken = (*self.overrides) + .as_ref() + .and_then(|o| o.enable_token_counting) + .unwrap_or(false); + + let token_count = if use_tiktoken { + tokenizer::token_count(model, json_string).unwrap_or(0) + } else { + json_string.len() / 4 + }; debug!( - "request_id={}: token count, model='{}' input_tokens={}", + "request_id={}: token count, model='{}' input_tokens={} method={}", self.request_identifier(), model, - token_count + token_count, + if use_tiktoken { "tiktoken" } else { "estimate" } ); - // Record the token count to metrics.
self.metrics .input_sequence_length .record(token_count as u64); - // Check if rate limiting needs to be applied. if let Some(selector) = self.ratelimit_selector.take() { info!( "request_id={}: ratelimit check, model='{}' selector='{}:{}'", From 20e8e0c51e928c6b630da3aef31cf67125975fe5 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 25 Mar 2026 05:35:27 +0000 Subject: [PATCH 2/2] replace enable_token_counting bool with token_counting_strategy enum (estimate|auto) --- config/plano_config_schema.yaml | 11 ++++++--- crates/common/src/configuration.rs | 11 ++++++++- crates/llm_gateway/src/stream_context.rs | 30 ++++++++++++++++-------- 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index f7817a091..a80d7d0e4 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -285,9 +285,14 @@ properties: agent_orchestration_model: type: string description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers." - enable_token_counting: - type: boolean - description: "Enable tiktoken-based input token counting for metrics and rate limiting. Default is false." + token_counting_strategy: + type: string + enum: [estimate, auto] + description: > + Strategy for counting input tokens used in metrics and rate limiting. + "estimate" (default): fast character-based approximation (~1 token per 4 chars). + "auto": uses the best available tokenizer for each provider (e.g., tiktoken for + OpenAI models), falling back to estimate for unsupported providers. 
system_prompt: type: string prompt_targets: diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 2d7a7f22f..a08f69db7 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -124,6 +124,15 @@ pub struct Configuration { pub state_storage: Option, } +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] +pub enum TokenCountingStrategy { + #[default] + #[serde(rename = "estimate")] + Estimate, + #[serde(rename = "auto")] + Auto, +} + #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct Overrides { pub prompt_target_intent_matching_threshold: Option, @@ -131,7 +140,7 @@ pub struct Overrides { pub use_agent_orchestrator: Option<bool>, pub llm_routing_model: Option<String>, pub agent_orchestration_model: Option<String>, - pub enable_token_counting: Option<bool>, + pub token_counting_strategy: Option<TokenCountingStrategy>, } #[derive(Debug, Clone, Serialize, Deserialize, Default)] diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 20b8d3e60..f8ad82516 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use crate::metrics::Metrics; -use common::configuration::{LlmProvider, LlmProviderType, Overrides}; +use common::configuration::{LlmProvider, LlmProviderType, Overrides, TokenCountingStrategy}; use common::consts::{ ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, HEALTHZ_PATH, RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, TRACE_PARENT_HEADER, @@ -269,15 +269,25 @@ impl StreamContext { model: &str, json_string: &str, ) -> Result<(), ratelimit::Error> { - let use_tiktoken = (*self.overrides) + let strategy = (*self.overrides) .as_ref() - .and_then(|o| o.enable_token_counting) - .unwrap_or(false); - - let token_count = if use_tiktoken { - tokenizer::token_count(model, json_string).unwrap_or(0) - } else { -
json_string.len() / 4 + .and_then(|o| o.token_counting_strategy.clone()) + .unwrap_or_default(); + + let (token_count, method) = match strategy { + TokenCountingStrategy::Auto => { + let provider_id = self.get_provider_id(); + match provider_id { + ProviderId::OpenAI => ( + tokenizer::token_count(model, json_string).unwrap_or(json_string.len() / 4), + "tiktoken", + ), + // Future: add provider-specific tokenizers here + // ProviderId::Mistral => (mistral_tokenizer::count(...), "mistral"), + _ => (json_string.len() / 4, "estimate"), + } + } + TokenCountingStrategy::Estimate => (json_string.len() / 4, "estimate"), }; debug!( @@ -285,7 +295,7 @@ impl StreamContext { self.request_identifier(), model, token_count, - if use_tiktoken { "tiktoken" } else { "estimate" } + method ); self.metrics