Skip to content
102 changes: 102 additions & 0 deletions config/plano_config_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ properties:
- 0.1-beta
- 0.2.0
- v0.3.0
- v0.4.0

agents:
type: array
Expand Down Expand Up @@ -470,6 +471,107 @@ properties:
additionalProperties: false
required:
- jailbreak
routing_preferences:
type: array
items:
type: object
properties:
name:
type: string
description:
type: string
models:
type: array
items:
type: string
minItems: 1
selection_policy:
type: object
properties:
prefer:
type: string
enum:
- cheapest
- fastest
- random
- none
additionalProperties: false
required:
- prefer
additionalProperties: false
required:
- name
- description
- models
- selection_policy

model_metrics_sources:
type: array
items:
oneOf:
- type: object
properties:
type:
type: string
const: cost_metrics
url:
type: string
refresh_interval:
type: integer
minimum: 1
auth:
type: object
properties:
type:
type: string
enum:
- bearer
token:
type: string
required:
- type
- token
additionalProperties: false
required:
- type
- url
additionalProperties: false
- type: object
properties:
type:
type: string
const: prometheus_metrics
url:
type: string
query:
type: string
refresh_interval:
type: integer
minimum: 1
description: "Refresh interval in seconds"
required:
- type
- url
- query
additionalProperties: false
- type: object
properties:
type:
type: string
const: digitalocean_pricing
refresh_interval:
type: integer
minimum: 1
description: "Refresh interval in seconds"
model_aliases:
type: object
description: "Map DO catalog keys (lowercase(creator)/model_id) to Plano model names used in routing_preferences. Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'"
additionalProperties:
type: string
required:
- type
additionalProperties: false

additionalProperties: false
required:
- version
Expand Down
26 changes: 12 additions & 14 deletions crates/brightstaff/src/handlers/llm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ async fn llm_chat_inner(
temperature,
tool_names,
user_message_preview,
inline_routing_policy,
inline_routing_preferences,
client_api,
provider_id,
} = parsed;
Expand Down Expand Up @@ -261,7 +261,7 @@ async fn llm_chat_inner(
&traceparent,
&request_path,
&request_id,
inline_routing_policy,
inline_routing_preferences,
)
.await
}
Expand Down Expand Up @@ -323,7 +323,7 @@ struct PreparedRequest {
temperature: Option<f32>,
tool_names: Option<Vec<String>>,
user_message_preview: Option<String>,
inline_routing_policy: Option<Vec<common::configuration::ModelUsagePreference>>,
inline_routing_preferences: Option<Vec<common::configuration::TopLevelRoutingPreference>>,
client_api: Option<SupportedAPIsFromClient>,
provider_id: hermesllm::ProviderId,
}
Expand Down Expand Up @@ -352,16 +352,14 @@ async fn parse_and_validate_request(
"request body received"
);

// Extract routing_policy from request body if present
let (chat_request_bytes, inline_routing_policy) =
crate::handlers::routing_service::extract_routing_policy(&raw_bytes, false).map_err(
|err| {
warn!(error = %err, "failed to parse request JSON");
let mut r = Response::new(full(format!("Failed to parse request: {}", err)));
*r.status_mut() = StatusCode::BAD_REQUEST;
r
},
)?;
// Extract routing_preferences from request body if present
let (chat_request_bytes, inline_routing_preferences) =
crate::handlers::routing_service::extract_routing_policy(&raw_bytes).map_err(|err| {
warn!(error = %err, "failed to parse request JSON");
let mut r = Response::new(full(format!("Failed to parse request: {}", err)));
*r.status_mut() = StatusCode::BAD_REQUEST;
r
})?;

let api_type = SupportedAPIsFromClient::from_endpoint(request_path).ok_or_else(|| {
warn!(path = %request_path, "unsupported endpoint");
Expand Down Expand Up @@ -439,7 +437,7 @@ async fn parse_and_validate_request(
temperature,
tool_names,
user_message_preview,
inline_routing_policy,
inline_routing_preferences,
client_api,
provider_id,
})
Expand Down
36 changes: 11 additions & 25 deletions crates/brightstaff/src/handlers/llm/model_selection.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use common::configuration::ModelUsagePreference;
use common::configuration::TopLevelRoutingPreference;
use hermesllm::clients::endpoints::SupportedUpstreamAPIs;
use hermesllm::{ProviderRequest, ProviderRequestType};
use hermesllm::ProviderRequestType;
use hyper::StatusCode;
use std::sync::Arc;
use tracing::{debug, info, warn};
Expand All @@ -10,7 +10,10 @@ use crate::streaming::truncate_message;
use crate::tracing::routing;

/// Outcome of routing a chat request to an upstream model.
pub struct RoutingResult {
    /// Primary model to use (first in the ranked list).
    pub model_name: String,
    /// Full ranked list — use subsequent entries as fallbacks on 429/5xx.
    pub models: Vec<String>,
    /// Name of the matched routing preference; `None` when no route matched
    /// (in which case `model_name` is the sentinel "none" — see the fallback path below).
    pub route_name: Option<String>,
}

Expand Down Expand Up @@ -39,11 +42,8 @@ pub async fn router_chat_get_upstream_model(
traceparent: &str,
request_path: &str,
request_id: &str,
inline_usage_preferences: Option<Vec<ModelUsagePreference>>,
inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
) -> Result<RoutingResult, RoutingError> {
// Clone metadata for routing before converting (which consumes client_request)
let routing_metadata = client_request.metadata().clone();

// Convert to ChatCompletionsRequest for routing (regardless of input type)
let chat_request = match ProviderRequestType::try_from((
client_request,
Expand Down Expand Up @@ -78,22 +78,6 @@ pub async fn router_chat_get_upstream_model(
"router request"
);

// Use inline preferences if provided, otherwise fall back to metadata extraction
let usage_preferences: Option<Vec<ModelUsagePreference>> = if inline_usage_preferences.is_some()
{
inline_usage_preferences
} else {
let usage_preferences_str: Option<String> =
routing_metadata.as_ref().and_then(|metadata| {
metadata
.get("plano_preference_config")
.map(|value| value.to_string())
});
usage_preferences_str
.as_ref()
.and_then(|s| serde_yaml::from_str(s).ok())
};

// Prepare log message with latest message from chat request
let latest_message_for_log = chat_request
.messages
Expand All @@ -107,7 +91,6 @@ pub async fn router_chat_get_upstream_model(
let latest_message_for_log = truncate_message(&latest_message_for_log, 50);

info!(
has_usage_preferences = usage_preferences.is_some(),
path = %request_path,
latest_message = %latest_message_for_log,
"processing router request"
Expand All @@ -121,7 +104,7 @@ pub async fn router_chat_get_upstream_model(
.determine_route(
&chat_request.messages,
traceparent,
usage_preferences,
inline_routing_preferences,
request_id,
)
.await;
Expand All @@ -132,10 +115,12 @@ pub async fn router_chat_get_upstream_model(

match routing_result {
Ok(route) => match route {
Some((route_name, model_name)) => {
Some((route_name, ranked_models)) => {
let model_name = ranked_models.first().cloned().unwrap_or_default();
current_span.record("route.selected_model", model_name.as_str());
Ok(RoutingResult {
model_name,
models: ranked_models,
route_name: Some(route_name),
})
}
Expand All @@ -147,6 +132,7 @@ pub async fn router_chat_get_upstream_model(

Ok(RoutingResult {
model_name: "none".to_string(),
models: vec!["none".to_string()],
route_name: None,
})
}
Expand Down
Loading
Loading