From cc22169955470c1a044df0cc7bef37374f04c30e Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 02:53:34 +0200 Subject: [PATCH] feat(telemetry): add OpenTelemetry support for request-to-token attribution - Add otel callback to LiteLLM success/failure callbacks - Add OTEL env vars: OTEL_SERVICE_NAME, OTEL_TRACER_NAME, OTEL_EXPORTER_OTLP_ENDPOINT - Add new variables: otel_exporter_endpoint, otel_service_name - Update planning doc with implementation details and upstream requirements --- docs/planning/request_to_token_attribution.md | 61 +++++++++++++++---- infra/modules/aigateway_aca/main.tf | 39 ++++++++++-- infra/modules/aigateway_aca/variables.tf | 13 ++++ 3 files changed, 96 insertions(+), 17 deletions(-) diff --git a/docs/planning/request_to_token_attribution.md b/docs/planning/request_to_token_attribution.md index 9e3674d..5d51ea8 100644 --- a/docs/planning/request_to_token_attribution.md +++ b/docs/planning/request_to_token_attribution.md @@ -29,20 +29,37 @@ Document the current architecture: ## What Can Be Done in This Repo -### Phase 1: LiteLLM Custom Callback (Core Implementation) +### Phase 1: OpenTelemetry Integration (Core Implementation) -**Files to modify:** +**Implementation Approach:** -- `infra/modules/aigateway_aca/main.tf` - Add custom callback config -- Create new custom callback module +Instead of a custom callback (which requires a custom LiteLLM image), we're using LiteLLM's built-in OpenTelemetry support. This provides structured traces with token telemetry out of the box. 
-**Approach:** Use LiteLLM's CustomLogger class or success_callback to emit structured token telemetry +**Changes Made:** -**Challenge:** LiteLLM runs as a container image - need to either: +- Added `otel` to LiteLLM `success_callback` and `failure_callback` in `infra/modules/aigateway_aca/main.tf` +- Added OTEL environment variables: + - `OTEL_SERVICE_NAME` - service name for traces + - `OTEL_TRACER_NAME` - tracer name + - `OTEL_EXPORTER_OTLP_ENDPOINT` - OTLP collector endpoint + - `OTEL_EXPORTER_OTLP_PROTOCOL` - protocol (http/json) +- Added new variables in `infra/modules/aigateway_aca/variables.tf`: + - `otel_exporter_endpoint` - OTLP collector URL + - `otel_service_name` - custom service name -1. Build a custom image with callback baked in -2. Use environment variables + config-based callback -3. Add a sidecar container +**How It Works:** + +LiteLLM's OTEL callback automatically emits spans with: + +- Model name, provider, deployment +- Token usage (prompt_tokens, completion_tokens, total_tokens) +- Duration +- Request/response metadata + +**Files Modified:** + +- `infra/modules/aigateway_aca/main.tf` - Added OTEL callback and env vars +- `infra/modules/aigateway_aca/variables.tf` - Added OTEL configuration variables ### Phase 2: Correlation ID Propagation @@ -59,7 +76,27 @@ Document the current architecture: ### 1. cognitive-mesh (Upstream Caller) -Required: Must pass correlation headers when calling gateway: +**Required:** Must pass correlation headers when calling gateway. 
There are two methods: + +**Method A: Via Request Metadata (Recommended)** +Pass correlation IDs in the request body `metadata` field: + +```json +{ + "model": "gpt-5.3-codex", + "messages": [{ "role": "user", "content": "Hello" }], + "metadata": { + "request_id": "req_123", + "session_id": "sess_456", + "workflow": "manual_orchestration", + "stage": "writer", + "endpoint": "/api/manual-orchestration/sessions/start", + "user_id": "user_abc" + } +} +``` + +**Method B: Via HTTP Headers** - x-request-id - x-session-id @@ -68,9 +105,11 @@ Required: Must pass correlation headers when calling gateway: - x-stage-name - x-user-id +_Note: Method B requires additional LiteLLM configuration or middleware._ + ### 2. pvc-costops-analytics (Downstream Analytics) -Required: KQL queries and dashboards to: +**Required:** KQL queries and dashboards to: - Join requests table to token events by operation_Id/request_id - Aggregate rollups by endpoint, workflow, stage, model, deployment diff --git a/infra/modules/aigateway_aca/main.tf b/infra/modules/aigateway_aca/main.tf index 53e2fca..412723c 100644 --- a/infra/modules/aigateway_aca/main.tf +++ b/infra/modules/aigateway_aca/main.tf @@ -41,6 +41,7 @@ locals { # Features enabled here: # - JSON structured logging → Log Analytics Workspace via Container Apps stdout # - Prometheus /metrics endpoint (built-in, no extra infra) + # - OpenTelemetry tracing for request-to-token attribution # - Langfuse tracing (when both langfuse_public_key and langfuse_secret_key are provided) # - Redis semantic caching (when enable_redis_cache = true) # - Global budget / rate limits (when set above 0) @@ -73,16 +74,13 @@ locals { num_retries: 2 json_logs: true # Prometheus /metrics: token usage, latency and error rate at /metrics + # OpenTelemetry: structured traces with token telemetry for attribution success_callback: - prometheus - %{if var.langfuse_public_key != "" && var.langfuse_secret_key != ""~} - - langfuse - %{endif} + - otel failure_callback: - 
prometheus - %{if var.langfuse_public_key != "" && var.langfuse_secret_key != ""~} - - langfuse - %{endif} + - otel %{if var.enable_redis_cache~} # Redis: deduplicate identical requests to reduce Azure OpenAI token spend cache: true @@ -402,6 +400,35 @@ resource "azurerm_container_app" "ca" { } } + # OpenTelemetry configuration for request-to-token attribution + # OTEL_EXPORTER_OTLP_ENDPOINT should be set to your OTLP collector URL + # e.g., https://your-collector.eastus.azure.com:4318 + env { + name = "OTEL_SERVICE_NAME" + value = "${var.otel_service_name}-${var.env}" + } + + env { + name = "OTEL_TRACER_NAME" + value = "litellm" + } + + dynamic "env" { + for_each = var.otel_exporter_endpoint != "" ? [var.otel_exporter_endpoint] : [] + content { + name = "OTEL_EXPORTER_OTLP_ENDPOINT" + value = env.value + } + } + + dynamic "env" { + for_each = var.otel_exporter_endpoint != "" ? ["http/json"] : [] + content { + name = "OTEL_EXPORTER_OTLP_PROTOCOL" + value = env.value + } + } + # LiteLLM commonly listens on 4000; set port as needed } } diff --git a/infra/modules/aigateway_aca/variables.tf b/infra/modules/aigateway_aca/variables.tf index d5a5d87..377b408 100644 --- a/infra/modules/aigateway_aca/variables.tf +++ b/infra/modules/aigateway_aca/variables.tf @@ -211,3 +211,16 @@ variable "tpm_limit" { description = "Global tokens-per-minute cap across all API keys (0 = no limit)." default = 0 } + +# OpenTelemetry configuration for request-to-token attribution +variable "otel_exporter_endpoint" { + type = string + description = "OpenTelemetry OTLP exporter endpoint (e.g., https://collector.example.com:4318). Leave empty to disable OTEL tracing." + default = "" +} + +variable "otel_service_name" { + type = string + description = "Base OpenTelemetry service name for traces; the deployment environment is appended as a suffix (e.g., ai-gateway-dev)." + default = "ai-gateway" +}