diff --git a/scripts/telemetry/llama-stack-inference-metrics.json b/scripts/telemetry/llama-stack-inference-metrics.json
new file mode 100644
index 0000000000..caef48f12b
--- /dev/null
+++ b/scripts/telemetry/llama-stack-inference-metrics.json
@@ -0,0 +1,1463 @@
+{
+    "annotations": {
+      "list": [
+        {
+          "builtIn": 1,
+          "datasource": {
+            "type": "grafana",
+            "uid": "-- Grafana --"
+          },
+          "enable": true,
+          "hide": true,
+          "iconColor": "rgba(0, 211, 255, 1)",
+          "name": "Annotations & Alerts",
+          "type": "dashboard"
+        }
+      ]
+    },
+    "editable": true,
+    "fiscalYearStartMonth": 0,
+    "graphTooltip": 0,
+    "id": null,
+    "links": [],
+    "liveNow": false,
+    "panels": [
+      {
+        "collapsed": false,
+        "gridPos": {
+          "h": 1,
+          "w": 24,
+          "x": 0,
+          "y": 0
+        },
+        "id": 20,
+        "title": "Overview",
+        "type": "row"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "short"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 4,
+          "w": 6,
+          "x": 0,
+          "y": 1
+        },
+        "id": 1,
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "justifyMode": "auto",
+          "orientation": "auto",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ],
+            "fields": "",
+            "values": false
+          },
+          "textMode": "auto"
+        },
+        "pluginVersion": "10.2.3",
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "sum(llama_stack_llama_stack_inference_duration_seconds_count)",
+            "legendFormat": "__auto",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "Total Inference Requests",
+        "type": "stat"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 1
+                },
+                {
+                  "color": "red",
+                  "value": 5
+                }
+              ]
+            },
+            "unit": "s"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 4,
+          "w": 6,
+          "x": 6,
+          "y": 1
+        },
+        "id": 2,
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "justifyMode": "auto",
+          "orientation": "auto",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ],
+            "fields": "",
+            "values": false
+          },
+          "textMode": "auto"
+        },
+        "pluginVersion": "10.2.3",
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[5m])))",
+            "legendFormat": "__auto",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "P95 Inference Duration",
+        "type": "stat"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 0.5
+                },
+                {
+                  "color": "red",
+                  "value": 2
+                }
+              ]
+            },
+            "unit": "s"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 4,
+          "w": 6,
+          "x": 12,
+          "y": 1
+        },
+        "id": 3,
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "justifyMode": "auto",
+          "orientation": "auto",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ],
+            "fields": "",
+            "values": false
+          },
+          "textMode": "auto"
+        },
+        "pluginVersion": "10.2.3",
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[5m])))",
+            "legendFormat": "__auto",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "P95 Time to First Token",
+        "type": "stat"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "red",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 20
+                },
+                {
+                  "color": "green",
+                  "value": 50
+                }
+              ]
+            },
+            "unit": "none"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 4,
+          "w": 6,
+          "x": 18,
+          "y": 1
+        },
+        "id": 4,
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "justifyMode": "auto",
+          "orientation": "auto",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ],
+            "fields": "",
+            "values": false
+          },
+          "textMode": "auto"
+        },
+        "pluginVersion": "10.2.3",
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.50, sum by (le) (rate(llama_stack_llama_stack_inference_tokens_per_second_bucket[5m])))",
+            "legendFormat": "__auto",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "P50 Tokens per Second",
+        "type": "stat"
+      },
+      {
+        "collapsed": false,
+        "gridPos": {
+          "h": 1,
+          "w": 24,
+          "x": 0,
+          "y": 5
+        },
+        "id": 21,
+        "title": "Inference Duration",
+        "type": "row"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisCenteredZero": false,
+              "axisColorMode": "text",
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "tooltip": false,
+                "viz": false,
+                "legend": false
+              },
+              "lineInterpolation": "smooth",
+              "lineWidth": 2,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "auto",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "s"
+          },
+          "overrides": [
+            {
+              "matcher": {
+                "id": "byRegexp",
+                "options": ".*p50.*"
+              },
+              "properties": [
+                {
+                  "id": "color",
+                  "value": {
+                    "fixedColor": "green",
+                    "mode": "fixed"
+                  }
+                }
+              ]
+            },
+            {
+              "matcher": {
+                "id": "byRegexp",
+                "options": ".*p95.*"
+              },
+              "properties": [
+                {
+                  "id": "color",
+                  "value": {
+                    "fixedColor": "yellow",
+                    "mode": "fixed"
+                  }
+                }
+              ]
+            },
+            {
+              "matcher": {
+                "id": "byRegexp",
+                "options": ".*p99.*"
+              },
+              "properties": [
+                {
+                  "id": "color",
+                  "value": {
+                    "fixedColor": "red",
+                    "mode": "fixed"
+                  }
+                }
+              ]
+            }
+          ]
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 6
+        },
+        "id": 5,
+        "options": {
+          "legend": {
+            "calcs": [
+              "last",
+              "max"
+            ],
+            "displayMode": "table",
+            "placement": "bottom",
+            "showLegend": true
+          },
+          "tooltip": {
+            "mode": "multi",
+            "sort": "none"
+          }
+        },
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.50, sum by (le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))",
+            "legendFormat": "All models (p50)",
+            "range": true,
+            "refId": "A"
+          },
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))",
+            "hide": false,
+            "legendFormat": "All models (p95)",
+            "range": true,
+            "refId": "B"
+          },
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.99, sum by (le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))",
+            "hide": false,
+            "legendFormat": "All models (p99)",
+            "range": true,
+            "refId": "C"
+          }
+        ],
+        "title": "Inference Duration (P50, P95, P99)",
+        "type": "timeseries"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisCenteredZero": false,
+              "axisColorMode": "text",
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "tooltip": false,
+                "viz": false,
+                "legend": false
+              },
+              "lineInterpolation": "smooth",
+              "lineWidth": 2,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "auto",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "s"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 6
+        },
+        "id": 6,
+        "options": {
+          "legend": {
+            "calcs": [
+              "last",
+              "max"
+            ],
+            "displayMode": "table",
+            "placement": "bottom",
+            "showLegend": true
+          },
+          "tooltip": {
+            "mode": "multi",
+            "sort": "none"
+          }
+        },
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.95, sum by (model, le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))",
+            "legendFormat": "{{model}}",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "P95 Inference Duration by Model",
+        "type": "timeseries"
+      },
+      {
+        "collapsed": false,
+        "gridPos": {
+          "h": 1,
+          "w": 24,
+          "x": 0,
+          "y": 14
+        },
+        "id": 22,
+        "title": "Time to First Token",
+        "type": "row"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisCenteredZero": false,
+              "axisColorMode": "text",
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "tooltip": false,
+                "viz": false,
+                "legend": false
+              },
+              "lineInterpolation": "smooth",
+              "lineWidth": 2,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "auto",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "s"
+          },
+          "overrides": [
+            {
+              "matcher": {
+                "id": "byRegexp",
+                "options": ".*p50.*"
+              },
+              "properties": [
+                {
+                  "id": "color",
+                  "value": {
+                    "fixedColor": "green",
+                    "mode": "fixed"
+                  }
+                }
+              ]
+            },
+            {
+              "matcher": {
+                "id": "byRegexp",
+                "options": ".*p95.*"
+              },
+              "properties": [
+                {
+                  "id": "color",
+                  "value": {
+                    "fixedColor": "yellow",
+                    "mode": "fixed"
+                  }
+                }
+              ]
+            },
+            {
+              "matcher": {
+                "id": "byRegexp",
+                "options": ".*p99.*"
+              },
+              "properties": [
+                {
+                  "id": "color",
+                  "value": {
+                    "fixedColor": "red",
+                    "mode": "fixed"
+                  }
+                }
+              ]
+            }
+          ]
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 15
+        },
+        "id": 7,
+        "options": {
+          "legend": {
+            "calcs": [
+              "last",
+              "max"
+            ],
+            "displayMode": "table",
+            "placement": "bottom",
+            "showLegend": true
+          },
+          "tooltip": {
+            "mode": "multi",
+            "sort": "none"
+          }
+        },
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.50, sum by (le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[1m])))",
+            "legendFormat": "All models (p50)",
+            "range": true,
+            "refId": "A"
+          },
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[1m])))",
+            "hide": false,
+            "legendFormat": "All models (p95)",
+            "range": true,
+            "refId": "B"
+          },
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.99, sum by (le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[1m])))",
+            "hide": false,
+            "legendFormat": "All models (p99)",
+            "range": true,
+            "refId": "C"
+          }
+        ],
+        "title": "Time to First Token (P50, P95, P99)",
+        "type": "timeseries"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisCenteredZero": false,
+              "axisColorMode": "text",
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "tooltip": false,
+                "viz": false,
+                "legend": false
+              },
+              "lineInterpolation": "smooth",
+              "lineWidth": 2,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "auto",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "s"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 15
+        },
+        "id": 8,
+        "options": {
+          "legend": {
+            "calcs": [
+              "last",
+              "max"
+            ],
+            "displayMode": "table",
+            "placement": "bottom",
+            "showLegend": true
+          },
+          "tooltip": {
+            "mode": "multi",
+            "sort": "none"
+          }
+        },
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.95, sum by (model, le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[1m])))",
+            "legendFormat": "{{model}}",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "P95 Time to First Token by Model",
+        "type": "timeseries"
+      },
+      {
+        "collapsed": false,
+        "gridPos": {
+          "h": 1,
+          "w": 24,
+          "x": 0,
+          "y": 23
+        },
+        "id": 23,
+        "title": "Throughput",
+        "type": "row"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisCenteredZero": false,
+              "axisColorMode": "text",
+              "axisLabel": "tokens/s",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "tooltip": false,
+                "viz": false,
+                "legend": false
+              },
+              "lineInterpolation": "smooth",
+              "lineWidth": 2,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "auto",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "none"
+          },
+          "overrides": [
+            {
+              "matcher": {
+                "id": "byRegexp",
+                "options": ".*p50.*"
+              },
+              "properties": [
+                {
+                  "id": "color",
+                  "value": {
+                    "fixedColor": "green",
+                    "mode": "fixed"
+                  }
+                }
+              ]
+            },
+            {
+              "matcher": {
+                "id": "byRegexp",
+                "options": ".*p95.*"
+              },
+              "properties": [
+                {
+                  "id": "color",
+                  "value": {
+                    "fixedColor": "yellow",
+                    "mode": "fixed"
+                  }
+                }
+              ]
+            }
+          ]
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 24
+        },
+        "id": 9,
+        "options": {
+          "legend": {
+            "calcs": [
+              "last",
+              "max"
+            ],
+            "displayMode": "table",
+            "placement": "bottom",
+            "showLegend": true
+          },
+          "tooltip": {
+            "mode": "multi",
+            "sort": "none"
+          }
+        },
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.50, sum by (le) (rate(llama_stack_llama_stack_inference_tokens_per_second_bucket[1m])))",
+            "legendFormat": "All models (p50)",
+            "range": true,
+            "refId": "A"
+          },
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_tokens_per_second_bucket[1m])))",
+            "hide": false,
+            "legendFormat": "All models (p95)",
+            "range": true,
+            "refId": "B"
+          }
+        ],
+        "title": "Tokens per Second (P50, P95)",
+        "type": "timeseries"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisCenteredZero": false,
+              "axisColorMode": "text",
+              "axisLabel": "tokens/s",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "tooltip": false,
+                "viz": false,
+                "legend": false
+              },
+              "lineInterpolation": "smooth",
+              "lineWidth": 2,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "auto",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "none"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 24
+        },
+        "id": 10,
+        "options": {
+          "legend": {
+            "calcs": [
+              "last",
+              "max"
+            ],
+            "displayMode": "table",
+            "placement": "bottom",
+            "showLegend": true
+          },
+          "tooltip": {
+            "mode": "multi",
+            "sort": "none"
+          }
+        },
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.50, sum by (model, le) (rate(llama_stack_llama_stack_inference_tokens_per_second_bucket[1m])))",
+            "legendFormat": "{{model}}",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "Median Tokens/s by Model",
+        "type": "timeseries"
+      },
+      {
+        "collapsed": false,
+        "gridPos": {
+          "h": 1,
+          "w": 24,
+          "x": 0,
+          "y": 32
+        },
+        "id": 24,
+        "title": "Provider Comparison",
+        "type": "row"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisCenteredZero": false,
+              "axisColorMode": "text",
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "tooltip": false,
+                "viz": false,
+                "legend": false
+              },
+              "lineInterpolation": "smooth",
+              "lineWidth": 2,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "auto",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "s"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 33
+        },
+        "id": 11,
+        "options": {
+          "legend": {
+            "calcs": [
+              "last",
+              "max"
+            ],
+            "displayMode": "table",
+            "placement": "bottom",
+            "showLegend": true
+          },
+          "tooltip": {
+            "mode": "multi",
+            "sort": "none"
+          }
+        },
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "histogram_quantile(0.95, sum by (provider, le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))",
+            "legendFormat": "{{provider}}",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "P95 Inference Duration by Provider",
+        "type": "timeseries"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "short"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 33
+        },
+        "id": 12,
+        "options": {
+          "displayLabels": [
+            "percent"
+          ],
+          "legend": {
+            "displayMode": "table",
+            "placement": "right",
+            "showLegend": true,
+            "values": [
+              "value",
+              "percent"
+            ]
+          },
+          "pieType": "donut",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ],
+            "fields": "",
+            "values": false
+          },
+          "tooltip": {
+            "mode": "single",
+            "sort": "none"
+          }
+        },
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "sum by (provider) (llama_stack_llama_stack_inference_duration_seconds_count)",
+            "legendFormat": "{{provider}}",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "Request Distribution by Provider",
+        "type": "piechart"
+      },
+      {
+        "collapsed": false,
+        "gridPos": {
+          "h": 1,
+          "w": 24,
+          "x": 0,
+          "y": 41
+        },
+        "id": 25,
+        "title": "Errors & Details",
+        "type": "row"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            },
+            "unit": "short"
+          },
+          "overrides": []
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 42
+        },
+        "id": 13,
+        "options": {
+          "displayLabels": [
+            "percent"
+          ],
+          "legend": {
+            "displayMode": "table",
+            "placement": "right",
+            "showLegend": true,
+            "values": [
+              "value",
+              "percent"
+            ]
+          },
+          "pieType": "pie",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ],
+            "fields": "",
+            "values": false
+          },
+          "tooltip": {
+            "mode": "single",
+            "sort": "none"
+          }
+        },
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "expr": "sum by (status) (llama_stack_llama_stack_inference_duration_seconds_count)",
+            "legendFormat": "{{status}}",
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "Success vs Error Rate",
+        "type": "piechart"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "custom": {
+              "align": "auto",
+              "cellOptions": {
+                "type": "auto"
+              },
+              "inspect": false
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                }
+              ]
+            }
+          },
+          "overrides": [
+            {
+              "matcher": {
+                "id": "byName",
+                "options": "Value"
+              },
+              "properties": [
+                {
+                  "id": "displayName",
+                  "value": "Requests"
+                }
+              ]
+            }
+          ]
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 42
+        },
+        "id": 14,
+        "options": {
+          "cellHeight": "sm",
+          "footer": {
+            "countRows": false,
+            "fields": "",
+            "reducer": [
+              "sum"
+            ],
+            "show": false
+          },
+          "showHeader": true
+        },
+        "pluginVersion": "10.2.3",
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "prometheus"
+            },
+            "editorMode": "code",
+            "exemplar": false,
+            "expr": "sum by (model, provider, stream, status) (llama_stack_llama_stack_inference_duration_seconds_count)",
+            "format": "table",
+            "instant": true,
+            "legendFormat": "__auto",
+            "range": false,
+            "refId": "A"
+          }
+        ],
+        "title": "Inference Request Details",
+        "transformations": [
+          {
+            "id": "organize",
+            "options": {
+              "excludeByName": {
+                "Time": true
+              },
+              "indexByName": {},
+              "renameByName": {
+                "model": "Model",
+                "provider": "Provider",
+                "status": "Status",
+                "stream": "Stream"
+              }
+            }
+          }
+        ],
+        "type": "table"
+      }
+    ],
+    "refresh": "5s",
+    "schemaVersion": 38,
+    "style": "dark",
+    "tags": [
+      "llama-stack",
+      "metrics",
+      "inference"
+    ],
+    "templating": {
+      "list": []
+    },
+    "time": {
+      "from": "now-15m",
+      "to": "now"
+    },
+    "timepicker": {},
+    "timezone": "",
+    "title": "Llama Stack - Inference Metrics",
+    "uid": "llama-stack-inference-metrics",
+    "version": 0,
+    "weekStart": ""
+  }
diff --git a/scripts/telemetry/setup_telemetry.sh b/scripts/telemetry/setup_telemetry.sh
index e9ce64e023..1d121f64bf 100755
--- a/scripts/telemetry/setup_telemetry.sh
+++ b/scripts/telemetry/setup_telemetry.sh
@@ -141,6 +141,7 @@ $CONTAINER_RUNTIME run -d --name grafana \
   -v "$SCRIPT_DIR/llama-stack-vector-io-metrics.json:/etc/grafana/provisioning/dashboards/llama-stack-vector-io-metrics.json:Z" \
   -v "$SCRIPT_DIR/llama-stack-request-metrics.json:/etc/grafana/provisioning/dashboards/llama-stack-request-metrics.json:Z" \
   -v "$SCRIPT_DIR/llama-stack-responses-metrics.json:/etc/grafana/provisioning/dashboards/llama-stack-responses-metrics.json:Z" \
+  -v "$SCRIPT_DIR/llama-stack-inference-metrics.json:/etc/grafana/provisioning/dashboards/llama-stack-inference-metrics.json:Z" \
   docker.io/grafana/grafana:11.0.0
 
 # Wait for services to start
diff --git a/src/llama_stack/core/routers/inference.py b/src/llama_stack/core/routers/inference.py
index e3f14fc056..2a72cd084d 100644
--- a/src/llama_stack/core/routers/inference.py
+++ b/src/llama_stack/core/routers/inference.py
@@ -19,6 +19,12 @@
 from llama_stack.core.request_headers import get_authenticated_user
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
+from llama_stack.telemetry.inference_metrics import (
+    create_inference_metric_attributes,
+    inference_duration,
+    inference_time_to_first_token,
+    inference_tokens_per_second,
+)
 from llama_stack_api import (
     GetChatCompletionRequest,
     HealthResponse,
@@ -208,9 +214,40 @@ async def openai_chat_completion(
                 messages=params.messages,
             )
 
-        response = await self._nonstream_openai_chat_completion(provider, params)
+        start_time = time.perf_counter()
+        status = "success"
+        try:
+            response = await self._nonstream_openai_chat_completion(provider, params)
+        except asyncio.CancelledError:
+            status = "error"
+            raise
+        except Exception:
+            status = "error"
+            raise
+        finally:
+            duration = time.perf_counter() - start_time
+            attrs = create_inference_metric_attributes(
+                model=request_model_id,
+                provider=provider.__provider_id__,
+                stream=False,
+                status=status,
+            )
+            inference_duration.record(duration, attributes=attrs)
+
         response.model = request_model_id
 
+        if response.usage and response.usage.completion_tokens and duration > 0:
+            tokens_per_sec = response.usage.completion_tokens / duration
+            inference_tokens_per_second.record(
+                tokens_per_sec,
+                attributes=create_inference_metric_attributes(
+                    model=request_model_id,
+                    provider=provider.__provider_id__,
+                    stream=False,
+                    status="success",
+                ),
+            )
+
         # Store the response with the ID that will be returned to the client
         if self.store:
             asyncio.create_task(self.store.store_chat_completion(response, params.messages))
@@ -295,6 +332,10 @@ async def stream_tokens_and_compute_metrics_openai_chat(
         id = None
         created = None
         choices_data: dict[int, dict[str, Any]] = {}
+        start_time = time.perf_counter()
+        first_token_time: float | None = None
+        completion_tokens: int | None = None
+        status = "success"
 
         try:
             async for chunk in response:
@@ -326,6 +367,8 @@ async def stream_tokens_and_compute_metrics_openai_chat(
                         if choice_delta.delta:
                             delta = choice_delta.delta
                             if delta.content:
+                                if first_token_time is None:
+                                    first_token_time = time.perf_counter()
                                 current_choice_data["content_parts"].append(delta.content)
                             if delta.tool_calls:
                                 for tool_call_delta in delta.tool_calls:
@@ -380,6 +423,10 @@ async def stream_tokens_and_compute_metrics_openai_chat(
                             choice_delta.logprobs.content = converted_logprobs
                             current_choice_data["logprobs_content_parts"].extend(converted_logprobs)
 
+                # Capture usage from the final chunk (providers send usage in the last chunk)
+                if chunk.usage and chunk.usage.completion_tokens:
+                    completion_tokens = chunk.usage.completion_tokens
+
                 # Compute metrics on final chunk
                 if chunk.choices and chunk.choices[0].finish_reason:
                     completion_text = ""
@@ -387,7 +434,30 @@ async def stream_tokens_and_compute_metrics_openai_chat(
                         completion_text += "".join(choice_data["content_parts"])
 
                 yield chunk
+        except asyncio.CancelledError:
+            status = "error"
+            raise
+        except Exception:
+            status = "error"
+            raise
         finally:
+            duration = time.perf_counter() - start_time
+            attrs = create_inference_metric_attributes(
+                model=fully_qualified_model_id,
+                provider=provider_id,
+                stream=True,
+                status=status,
+            )
+            inference_duration.record(duration, attributes=attrs)
+
+            if first_token_time is not None:
+                ttft = first_token_time - start_time
+                inference_time_to_first_token.record(ttft, attributes=attrs)
+
+            if completion_tokens and duration > 0:
+                tokens_per_sec = completion_tokens / duration
+                inference_tokens_per_second.record(tokens_per_sec, attributes=attrs)
+
             # Store the final assembled completion
             if id and self.store and messages:
                 assembled_choices: list[OpenAIChoice] = []
diff --git a/src/llama_stack/telemetry/constants.py b/src/llama_stack/telemetry/constants.py
index 8a1e692410..858565f475 100644
--- a/src/llama_stack/telemetry/constants.py
+++ b/src/llama_stack/telemetry/constants.py
@@ -58,6 +58,14 @@
 REQUEST_DURATION_SECONDS = f"{REQUEST_PREFIX}_duration_seconds"
 CONCURRENT_REQUESTS = f"{llama_stack_prefix}.concurrent_requests"
 
+# Inference Metrics
+# These constants define the names for OpenTelemetry metrics tracking inference operations
+INFERENCE_PREFIX = f"{llama_stack_prefix}.inference"
+
+INFERENCE_DURATION = f"{INFERENCE_PREFIX}.duration_seconds"
+INFERENCE_TIME_TO_FIRST_TOKEN = f"{INFERENCE_PREFIX}.time_to_first_token_seconds"
+INFERENCE_TOKENS_PER_SECOND = f"{INFERENCE_PREFIX}.tokens_per_second"
+
 # Responses API Metrics
 RESPONSES_PREFIX = f"{llama_stack_prefix}.responses"
 RESPONSES_PARAMETER_USAGE_TOTAL = f"{RESPONSES_PREFIX}.parameter_usage_total"
diff --git a/src/llama_stack/telemetry/inference_metrics.py b/src/llama_stack/telemetry/inference_metrics.py
new file mode 100644
index 0000000000..6c4f3cff72
--- /dev/null
+++ b/src/llama_stack/telemetry/inference_metrics.py
@@ -0,0 +1,77 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+OpenTelemetry metrics for llama-stack inference operations.
+
+This module provides centralized metric definitions for tracking:
+- Inference duration (end-to-end latency for chat completions)
+- Time to first token (streaming requests only)
+- Tokens per second (output throughput)
+
+All metrics follow OpenTelemetry semantic conventions and use the llama_stack prefix
+for consistent naming across the telemetry stack.
+"""
+
+from opentelemetry import metrics
+from opentelemetry.metrics import Histogram
+
+from .constants import (
+    INFERENCE_DURATION,
+    INFERENCE_TIME_TO_FIRST_TOKEN,
+    INFERENCE_TOKENS_PER_SECOND,
+)
+
+# Meter named "llama_stack.inference"; the three histograms below are created on it at import time.
+meter = metrics.get_meter("llama_stack.inference", version="1.0.0")
+
+inference_duration: Histogram = meter.create_histogram(
+    name=INFERENCE_DURATION,
+    description="Duration of inference requests from start to completion",
+    unit="s",  # seconds
+)
+
+inference_time_to_first_token: Histogram = meter.create_histogram(
+    name=INFERENCE_TIME_TO_FIRST_TOKEN,
+    description="Time from request start to first content token (streaming only)",
+    unit="s",  # seconds
+)
+
+inference_tokens_per_second: Histogram = meter.create_histogram(
+    name=INFERENCE_TOKENS_PER_SECOND,
+    description="Output token throughput (completion tokens / duration)",
+)  # NOTE(review): no unit is passed here, unlike the two histograms above - confirm this is intended
+
+
+def create_inference_metric_attributes(
+    model: str | None = None,
+    provider: str | None = None,
+    stream: bool | None = None,
+    status: str | None = None,
+) -> dict[str, str]:
+    """Build the label set attached to an inference metric data point.
+
+    Arguments left as ``None`` are dropped, so callers only emit the labels
+    they actually know about.
+
+    Args:
+        model: Fully qualified model ID (e.g., "openai/gpt-4o-mini")
+        provider: Provider ID (e.g., "openai")
+        stream: Whether this is a streaming request; emitted as "true"/"false"
+        status: Request outcome ("success", "error")
+
+    Returns:
+        Mapping of label name to string value, in model/provider/stream/status order
+    """
+    candidates: dict[str, str | None] = {
+        "model": model,
+        "provider": provider,
+        # Label values are strings, so the flag is rendered as "true"/"false".
+        "stream": None if stream is None else str(stream).lower(),
+        "status": status,
+    }
+    # Keep only the labels the caller supplied.
+    return {label: value for label, value in candidates.items() if value is not None}
diff --git a/tests/unit/telemetry/test_inference_metrics.py b/tests/unit/telemetry/test_inference_metrics.py
new file mode 100644
index 0000000000..245870c5ef
--- /dev/null
+++ b/tests/unit/telemetry/test_inference_metrics.py
@@ -0,0 +1,391 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Unit tests for inference metrics."""
+
+import time
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from llama_stack.core.routers.inference import InferenceRouter
+from llama_stack.telemetry.inference_metrics import (
+    create_inference_metric_attributes,
+    inference_duration,
+    inference_time_to_first_token,
+    inference_tokens_per_second,
+)
+from llama_stack_api import (
+    ModelType,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIChatCompletionResponseMessage,
+    OpenAIChoice,
+)
+
+
+class TestInferenceMetricAttributes:
+    """Checks for the helper that builds inference metric attribute dicts."""
+
+    def test_all_fields(self):
+        # Every argument supplied: all four labels appear, stream as "true".
+        expected = {
+            "model": "openai/gpt-4o-mini",
+            "provider": "openai",
+            "stream": "true",
+            "status": "success",
+        }
+        result = create_inference_metric_attributes(
+            model="openai/gpt-4o-mini",
+            provider="openai",
+            stream=True,
+            status="success",
+        )
+        assert result == expected
+
+    def test_partial_fields(self):
+        # Arguments that are not passed must not show up as keys at all,
+        # rather than appearing with a placeholder value.
+        result = create_inference_metric_attributes(model="openai/gpt-4o-mini", status="error")
+        assert result == {"model": "openai/gpt-4o-mini", "status": "error"}
+        assert "provider" not in result
+        assert "stream" not in result
+
+    def test_empty(self):
+        # No arguments yields an empty dict.
+        result = create_inference_metric_attributes()
+        assert result == {}
+
+    def test_stream_false(self):
+        # False is a supplied value, not a missing one; it is rendered as "false".
+        result = create_inference_metric_attributes(stream=False)
+        assert result == {"stream": "false"}
+
+
+class TestInferenceMetricInstruments:
+    """Smoke tests for the module-level inference histogram instruments."""
+
+    @staticmethod
+    def _labels(stream):
+        """Return a complete success label set for the given ``stream`` flag."""
+        return create_inference_metric_attributes(
+            model="openai/gpt-4o-mini",
+            provider="openai",
+            stream=stream,
+            status="success",
+        )
+
+    def test_inference_duration_exists(self):
+        # The instrument is a module attribute, so importing it is enough
+        # to obtain an object exposing record().
+        assert inference_duration is not None
+        assert hasattr(inference_duration, "record")
+
+    def test_inference_time_to_first_token_exists(self):
+        # Same existence check for the time-to-first-token histogram.
+        assert inference_time_to_first_token is not None
+        assert hasattr(inference_time_to_first_token, "record")
+
+    def test_inference_tokens_per_second_exists(self):
+        # Same existence check for the throughput histogram.
+        assert inference_tokens_per_second is not None
+        assert hasattr(inference_tokens_per_second, "record")
+
+    def test_inference_duration_can_record(self):
+        # record() takes the value and the labels positionally; the test
+        # passes as long as the call does not raise.
+        inference_duration.record(1.234, self._labels(stream=False))
+
+    def test_inference_time_to_first_token_can_record(self):
+        # Streaming label set, since this metric only applies to streams.
+        inference_time_to_first_token.record(0.123, self._labels(stream=True))
+
+    def test_inference_tokens_per_second_can_record(self):
+        # Throughput is recorded as a plain float.
+        inference_tokens_per_second.record(42.5, self._labels(stream=True))
+
+
+class TestInferenceMetricsConstants:
+    """Naming-convention checks for the inference metric name constants."""
+
+    def test_metric_names_follow_convention(self):
+        from llama_stack.telemetry import constants
+
+        duration = constants.INFERENCE_DURATION
+        ttft = constants.INFERENCE_TIME_TO_FIRST_TOKEN
+        throughput = constants.INFERENCE_TOKENS_PER_SECOND
+
+        # All three names live under the llama_stack namespace ...
+        for name in (duration, ttft, throughput):
+            assert name.startswith("llama_stack.")
+        # ... and identify themselves as inference metrics.
+        for name in (duration, ttft, throughput):
+            assert "inference" in name
+
+        # Only the two time-valued metrics carry the unit suffix.
+        for name in (duration, ttft):
+            assert name.endswith("_seconds")
+
+
+def _make_router_and_provider():
+    """Return an InferenceRouter wired to mocks, plus its mock provider."""
+    # Provider mock; its __provider_id__ is what the router reports as the provider label.
+    provider = AsyncMock()
+    provider.__provider_id__ = "openai"
+
+    # Model entry the routing table hands back for any identifier.
+    model = MagicMock(
+        identifier="openai/gpt-4o-mini",
+        model_type=ModelType.llm,
+        provider_resource_id="gpt-4o-mini",
+    )
+
+    table = MagicMock()
+    table.get_object_by_identifier = AsyncMock(return_value=model)
+    table.get_provider_impl = AsyncMock(return_value=provider)
+    return InferenceRouter(routing_table=table), provider
+
+
+def _make_chat_params(**kwargs):
+    """Build a minimal chat completion request; keyword args override defaults."""
+    fields = {
+        "model": "openai/gpt-4o-mini",
+        "messages": [{"role": "user", "content": "Hello"}],
+        **kwargs,
+    }
+    return OpenAIChatCompletionRequestWithExtraBody(**fields)
+
+
+def _make_completion_response(**kwargs):
+    """Build a minimal non-streaming chat completion; keyword args override defaults."""
+    reply = OpenAIChatCompletionResponseMessage(
+        role="assistant",
+        content="Hello!",
+    )
+    choice = OpenAIChoice(
+        index=0,
+        finish_reason="stop",
+        message=reply,
+    )
+    fields = {
+        "id": "chatcmpl-123",
+        "choices": [choice],
+        "created": int(time.time()),
+        "model": "gpt-4o-mini",
+        "object": "chat.completion",
+        **kwargs,
+    }
+    return OpenAIChatCompletion(**fields)
+
+
+class TestNonStreamingInferenceMetrics:
+    """Metric recording for non-streaming chat completions routed through InferenceRouter."""
+
+    async def test_records_duration_on_success(self):
+        router, provider = _make_router_and_provider()
+        provider.openai_chat_completion = AsyncMock(return_value=_make_completion_response())
+        request = _make_chat_params(stream=False)
+
+        with patch.object(inference_duration, "record") as record:
+            await router.openai_chat_completion(request)
+
+        # One sample with a positive duration and the expected label values.
+        record.assert_called_once()
+        assert record.call_args.args[0] > 0
+        labels = record.call_args.kwargs["attributes"]
+        assert labels["model"] == "openai/gpt-4o-mini"
+        assert labels["provider"] == "openai"
+        assert labels["stream"] == "false"
+        assert labels["status"] == "success"
+
+    async def test_records_duration_on_error(self):
+        router, provider = _make_router_and_provider()
+        provider.openai_chat_completion = AsyncMock(side_effect=RuntimeError("provider error"))
+        request = _make_chat_params(stream=False)
+
+        with patch.object(inference_duration, "record") as record:
+            with pytest.raises(RuntimeError, match="provider error"):
+                await router.openai_chat_completion(request)
+
+        # The failure still produces one sample, labelled as an error.
+        record.assert_called_once()
+        assert record.call_args.kwargs["attributes"]["status"] == "error"
+
+    async def test_records_tokens_per_second_when_usage_present(self):
+        from llama_stack_api.inference.models import OpenAIChatCompletionUsage
+
+        router, provider = _make_router_and_provider()
+        usage = OpenAIChatCompletionUsage(completion_tokens=50, prompt_tokens=10, total_tokens=60)
+        completion = _make_completion_response(usage=usage)
+        provider.openai_chat_completion = AsyncMock(return_value=completion)
+        request = _make_chat_params(stream=False)
+
+        with patch.object(inference_tokens_per_second, "record") as record:
+            await router.openai_chat_completion(request)
+
+        # A response carrying completion-token usage yields one positive
+        # throughput sample with the success label.
+        record.assert_called_once()
+        assert record.call_args.args[0] > 0
+        assert record.call_args.kwargs["attributes"]["status"] == "success"
+
+    async def test_no_tokens_per_second_without_usage(self):
+        router, provider = _make_router_and_provider()
+        provider.openai_chat_completion = AsyncMock(return_value=_make_completion_response(usage=None))
+        request = _make_chat_params(stream=False)
+
+        with patch.object(inference_tokens_per_second, "record") as record:
+            await router.openai_chat_completion(request)
+
+        record.assert_not_called()
+
+
+async def _make_streaming_chunks(chunks):
+    """Yield each item of ``chunks`` in order, as an async generator."""
+    for item in chunks:
+        yield item
+
+
+def _make_chunk(
+    chunk_id="chatcmpl-123",
+    content=None,
+    finish_reason=None,
+    usage=None,
+):
+    """Build a single-choice streaming chunk with optional content, finish reason and usage."""
+    from llama_stack_api.inference.models import OpenAIChoiceDelta, OpenAIChunkChoice
+
+    # The assistant role is only set on deltas that carry (truthy) content.
+    role = "assistant" if content else None
+    delta = OpenAIChoiceDelta(content=content, role=role)
+    return OpenAIChatCompletionChunk(
+        id=chunk_id,
+        choices=[OpenAIChunkChoice(index=0, delta=delta, finish_reason=finish_reason)],
+        created=int(time.time()),
+        model="gpt-4o-mini",
+        object="chat.completion.chunk",
+        usage=usage,
+    )
+
+
+class TestStreamingInferenceMetrics:
+    """Metric recording for streaming chat completions routed through InferenceRouter."""
+
+    @staticmethod
+    async def _start(router, provider, source):
+        """Make the provider return ``source`` and open a streaming request on the router."""
+        # Only the request is issued here; iterating the result is left to the caller.
+        provider.openai_chat_completion = AsyncMock(return_value=source)
+        return await router.openai_chat_completion(_make_chat_params(stream=True))
+
+    async def test_records_duration(self):
+        router, provider = _make_router_and_provider()
+        parts = [
+            _make_chunk(content="Hello"),
+            _make_chunk(content=" world"),
+            _make_chunk(finish_reason="stop"),
+        ]
+
+        with patch.object(inference_duration, "record") as record:
+            routed = await self._start(router, provider, _make_streaming_chunks(parts))
+            async for _ in routed:
+                pass
+
+        # The fully consumed stream yields exactly one duration sample,
+        # labelled as a successful streaming request.
+        record.assert_called_once()
+        assert record.call_args.args[0] > 0
+        labels = record.call_args.kwargs["attributes"]
+        assert labels["stream"] == "true"
+        assert labels["status"] == "success"
+
+    async def test_records_ttft_on_first_content(self):
+        router, provider = _make_router_and_provider()
+        parts = [
+            _make_chunk(content="Hello"),
+            _make_chunk(content=" world"),
+            _make_chunk(finish_reason="stop"),
+        ]
+
+        with patch.object(inference_time_to_first_token, "record") as record:
+            routed = await self._start(router, provider, _make_streaming_chunks(parts))
+            async for _ in routed:
+                pass
+
+        # Two content chunks were streamed, yet only a single
+        # time-to-first-token sample may be recorded.
+        record.assert_called_once()
+        assert record.call_args.args[0] >= 0
+        assert record.call_args.kwargs["attributes"]["stream"] == "true"
+
+    async def test_no_ttft_without_content(self):
+        router, provider = _make_router_and_provider()
+        # The only chunk carries a finish reason and no content.
+        parts = [_make_chunk(finish_reason="stop")]
+
+        with patch.object(inference_time_to_first_token, "record") as record:
+            routed = await self._start(router, provider, _make_streaming_chunks(parts))
+            async for _ in routed:
+                pass
+
+        # Without any content there is no first token to time.
+        record.assert_not_called()
+
+    async def test_records_tokens_per_second_from_usage(self):
+        from llama_stack_api.inference.models import OpenAIChatCompletionUsage
+
+        router, provider = _make_router_and_provider()
+        usage = OpenAIChatCompletionUsage(completion_tokens=100, prompt_tokens=10, total_tokens=110)
+        parts = [
+            _make_chunk(content="Hello"),
+            _make_chunk(finish_reason="stop", usage=usage),
+        ]
+
+        with patch.object(inference_tokens_per_second, "record") as record:
+            routed = await self._start(router, provider, _make_streaming_chunks(parts))
+            async for _ in routed:
+                pass
+
+        # Usage attached to the final chunk produces one positive
+        # throughput sample.
+        record.assert_called_once()
+        assert record.call_args.args[0] > 0
+
+    async def test_no_tokens_per_second_without_usage(self):
+        router, provider = _make_router_and_provider()
+        parts = [
+            _make_chunk(content="Hello"),
+            _make_chunk(finish_reason="stop"),
+        ]
+
+        with patch.object(inference_tokens_per_second, "record") as record:
+            routed = await self._start(router, provider, _make_streaming_chunks(parts))
+            async for _ in routed:
+                pass
+
+        # No chunk reported usage, so there is no token count to
+        # derive a throughput sample from.
+        record.assert_not_called()
+
+    async def test_records_error_status_on_exception(self):
+        router, provider = _make_router_and_provider()
+
+        # Yields one content chunk, then fails.
+        async def failing_stream():
+            yield _make_chunk(content="Hello")
+            raise RuntimeError("stream error")
+
+        with patch.object(inference_duration, "record") as record:
+            routed = await self._start(router, provider, failing_stream())
+            with pytest.raises(RuntimeError, match="stream error"):
+                async for _ in routed:
+                    pass
+
+        # The mid-stream failure still produces one duration sample,
+        # and that sample is labelled as an error.
+        record.assert_called_once()
+        assert record.call_args.kwargs["attributes"]["status"] == "error"