From e5f30399248d0bbacd24b1f4c1fbbeca7e22ac60 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Sun, 22 Mar 2026 21:45:02 -0700 Subject: [PATCH 1/2] make tiktoken token counting optional via enable_token_counting override By default, use cheap len/4 estimate for input token counting (metrics and ratelimit). When enable_token_counting is set to true in overrides, use tiktoken BPE for exact counts. This eliminates ~80ms of per-request latency from tiktoken in the WASM filter while keeping metrics and ratelimit functional. Made-with: Cursor --- config/plano_config_schema.yaml | 3 +++ crates/common/src/configuration.rs | 1 + crates/llm_gateway/src/stream_context.rs | 23 +++++++++++++++-------- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index 5190fecf7..f7817a091 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -285,6 +285,9 @@ properties: agent_orchestration_model: type: string description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers." + enable_token_counting: + type: boolean + description: "Enable tiktoken-based input token counting for metrics and rate limiting. Default is false." 
system_prompt: type: string prompt_targets: diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index df1790594..2d7a7f22f 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -131,6 +131,7 @@ pub struct Overrides { pub use_agent_orchestrator: Option<bool>, pub llm_routing_model: Option<String>, pub agent_orchestration_model: Option<String>, + pub enable_token_counting: Option<bool>, } #[derive(Debug, Clone, Serialize, Deserialize, Default)] diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index f62631fab..20b8d3e60 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -48,7 +48,7 @@ pub struct StreamContext { ttft_time: Option, traceparent: Option, request_body_sent_time: Option, - _overrides: Rc<Option<Overrides>>, + overrides: Rc<Option<Overrides>>, user_message: Option, upstream_status_code: Option, binary_frame_decoder: Option>, @@ -66,7 +66,7 @@ impl StreamContext { ) -> Self { StreamContext { metrics, - _overrides: overrides, + overrides, ratelimit_selector: None, streaming_response: false, response_tokens: 0, @@ -269,22 +269,29 @@ impl StreamContext { model: &str, json_string: &str, ) -> Result<(), ratelimit::Error> { - // Tokenize and record token count. - let token_count = tokenizer::token_count(model, json_string).unwrap_or(0); + let use_tiktoken = (*self.overrides) + .as_ref() + .and_then(|o| o.enable_token_counting) + .unwrap_or(false); + + let token_count = if use_tiktoken { + tokenizer::token_count(model, json_string).unwrap_or(0) + } else { + json_string.len() / 4 + }; debug!( - "request_id={}: token count, model='{}' input_tokens={}", + "request_id={}: token count, model='{}' input_tokens={} method={}", self.request_identifier(), model, - token_count + token_count, + if use_tiktoken { "tiktoken" } else { "estimate" } ); - // Record the token count to metrics.
self.metrics .input_sequence_length .record(token_count as u64); - // Check if rate limiting needs to be applied. if let Some(selector) = self.ratelimit_selector.take() { info!( "request_id={}: ratelimit check, model='{}' selector='{}:{}'", From 20e8e0c51e928c6b630da3aef31cf67125975fe5 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Wed, 25 Mar 2026 05:35:27 +0000 Subject: [PATCH 2/2] replace enable_token_counting bool with token_counting_strategy enum (estimate|auto) --- config/plano_config_schema.yaml | 11 ++++++--- crates/common/src/configuration.rs | 11 ++++++++- crates/llm_gateway/src/stream_context.rs | 30 ++++++++++++++++-------- 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index f7817a091..a80d7d0e4 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -285,9 +285,14 @@ properties: agent_orchestration_model: type: string description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers." - enable_token_counting: - type: boolean - description: "Enable tiktoken-based input token counting for metrics and rate limiting. Default is false." + token_counting_strategy: + type: string + enum: [estimate, auto] + description: > + Strategy for counting input tokens used in metrics and rate limiting. + "estimate" (default): fast character-based approximation (~1 token per 4 chars). + "auto": uses the best available tokenizer for each provider (e.g., tiktoken for + OpenAI models), falling back to estimate for unsupported providers. 
system_prompt: type: string prompt_targets: diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 2d7a7f22f..a08f69db7 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -124,6 +124,15 @@ pub struct Configuration { pub state_storage: Option, } +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] +pub enum TokenCountingStrategy { + #[default] + #[serde(rename = "estimate")] + Estimate, + #[serde(rename = "auto")] + Auto, +} + #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct Overrides { pub prompt_target_intent_matching_threshold: Option, @@ -131,7 +140,7 @@ pub struct Overrides { pub use_agent_orchestrator: Option<bool>, pub llm_routing_model: Option<String>, pub agent_orchestration_model: Option<String>, - pub enable_token_counting: Option<bool>, + pub token_counting_strategy: Option<TokenCountingStrategy>, } #[derive(Debug, Clone, Serialize, Deserialize, Default)] diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 20b8d3e60..f8ad82516 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use crate::metrics::Metrics; -use common::configuration::{LlmProvider, LlmProviderType, Overrides}; +use common::configuration::{LlmProvider, LlmProviderType, Overrides, TokenCountingStrategy}; use common::consts::{ ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, HEALTHZ_PATH, RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, TRACE_PARENT_HEADER, @@ -269,15 +269,25 @@ impl StreamContext { model: &str, json_string: &str, ) -> Result<(), ratelimit::Error> { - let use_tiktoken = (*self.overrides) + let strategy = (*self.overrides) .as_ref() - .and_then(|o| o.enable_token_counting) - .unwrap_or(false); - - let token_count = if use_tiktoken { - tokenizer::token_count(model, json_string).unwrap_or(0) - } else { -
json_string.len() / 4 + .and_then(|o| o.token_counting_strategy.clone()) + .unwrap_or_default(); + + let (token_count, method) = match strategy { + TokenCountingStrategy::Auto => { + let provider_id = self.get_provider_id(); + match provider_id { + ProviderId::OpenAI => ( + tokenizer::token_count(model, json_string).unwrap_or(json_string.len() / 4), + "tiktoken", + ), + // Future: add provider-specific tokenizers here + // ProviderId::Mistral => (mistral_tokenizer::count(...), "mistral"), + _ => (json_string.len() / 4, "estimate"), + } + } + TokenCountingStrategy::Estimate => (json_string.len() / 4, "estimate"), }; debug!( @@ -285,7 +295,7 @@ impl StreamContext { self.request_identifier(), model, token_count, - if use_tiktoken { "tiktoken" } else { "estimate" } + method ); self.metrics