diff --git a/scripts/telemetry/llama-stack-inference-metrics.json b/scripts/telemetry/llama-stack-inference-metrics.json new file mode 100644 index 0000000000..caef48f12b --- /dev/null +++ b/scripts/telemetry/llama-stack-inference-metrics.json @@ -0,0 +1,1463 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 20, + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(llama_stack_llama_stack_inference_duration_seconds_count)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Inference Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[5m])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "P95 Inference Duration", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[5m])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "P95 Time to First Token", + "type": "stat" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 50 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (le) (rate(llama_stack_llama_stack_inference_tokens_per_second_bucket[5m])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "P50 Tokens Per Second", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 21, + "title": "Inference Duration", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*p50.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*p95.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*p99.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))", + "legendFormat": "All models (p50)", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))", + "hide": false, + "legendFormat": "All models (p95)", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))", + "hide": false, + "legendFormat": "All models (p99)", + "range": true, + "refId": "C" + } + ], + "title": "Inference Duration (P50, P95, P99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (model, le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))", + "legendFormat": "{{model}}", + "range": true, + "refId": "A" + } + ], + "title": "P95 Inference Duration by Model", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 22, + "title": "Time to First Token", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + 
"hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*p50.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*p95.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*p99.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[1m])))", + "legendFormat": "All models (p50)", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[1m])))", + "hide": false, + "legendFormat": "All models (p95)", + "range": true, + "refId": "B" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[1m])))", + "hide": false, + "legendFormat": "All models (p99)", + "range": true, + "refId": "C" + } + ], + "title": "Time to First Token (P50, P95, P99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (model, le) (rate(llama_stack_llama_stack_inference_time_to_first_token_seconds_bucket{stream=\"true\"}[1m])))", + "legendFormat": "{{model}}", + "range": true, + "refId": "A" + } + ], + "title": "P95 Time to First Token by Model", + "type": "timeseries" + }, + { + "collapsed": false, + 
"gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 23, + "title": "Throughput", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "tokens/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*p50.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*p95.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (le) (rate(llama_stack_llama_stack_inference_tokens_per_second_bucket[1m])))", + "legendFormat": "All models (p50)", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(llama_stack_llama_stack_inference_tokens_per_second_bucket[1m])))", + "hide": false, + "legendFormat": "All models (p95)", + "range": true, + "refId": "B" + } + ], + "title": "Tokens per Second (P50, P95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "tokens/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (model, le) (rate(llama_stack_llama_stack_inference_tokens_per_second_bucket[1m])))", + "legendFormat": "{{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Median Tokens/s by Model", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 24, + "title": "Provider Comparison", 
+ "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (provider, le) (rate(llama_stack_llama_stack_inference_duration_seconds_bucket[1m])))", + "legendFormat": "{{provider}}", + "range": true, + "refId": "A" + } + ], + "title": "P95 Inference Duration by Provider", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 12, + "options": { + "displayLabels": [ + "percent" 
+ ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (provider) (llama_stack_llama_stack_inference_duration_seconds_count)", + "legendFormat": "{{provider}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Distribution by Provider", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 25, + "title": "Errors & Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 13, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (status) (llama_stack_llama_stack_inference_duration_seconds_count)", + "legendFormat": "{{status}}", + "range": true, + "refId": "A" + } + ], + "title": "Success vs Error Rate", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Requests" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 14, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (model, provider, stream, status) (llama_stack_llama_stack_inference_duration_seconds_count)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Inference Request Details", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": {}, + "renameByName": { + "model": "Model", + "provider": "Provider", + "status": "Status", + "stream": "Stream" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "llama-stack", + "metrics", + "inference" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Llama Stack - Inference Metrics", + "uid": "llama-stack-inference-metrics", + "version": 0, + "weekStart": "" + } diff --git a/scripts/telemetry/setup_telemetry.sh b/scripts/telemetry/setup_telemetry.sh index e9ce64e023..1d121f64bf 100755 --- a/scripts/telemetry/setup_telemetry.sh +++ b/scripts/telemetry/setup_telemetry.sh @@ -141,6 +141,7 @@ 
$CONTAINER_RUNTIME run -d --name grafana \ -v "$SCRIPT_DIR/llama-stack-vector-io-metrics.json:/etc/grafana/provisioning/dashboards/llama-stack-vector-io-metrics.json:Z" \ -v "$SCRIPT_DIR/llama-stack-request-metrics.json:/etc/grafana/provisioning/dashboards/llama-stack-request-metrics.json:Z" \ -v "$SCRIPT_DIR/llama-stack-responses-metrics.json:/etc/grafana/provisioning/dashboards/llama-stack-responses-metrics.json:Z" \ + -v "$SCRIPT_DIR/llama-stack-inference-metrics.json:/etc/grafana/provisioning/dashboards/llama-stack-inference-metrics.json:Z" \ docker.io/grafana/grafana:11.0.0 # Wait for services to start diff --git a/src/llama_stack/core/routers/inference.py b/src/llama_stack/core/routers/inference.py index e3f14fc056..2a72cd084d 100644 --- a/src/llama_stack/core/routers/inference.py +++ b/src/llama_stack/core/routers/inference.py @@ -19,6 +19,12 @@ from llama_stack.core.request_headers import get_authenticated_user from llama_stack.log import get_logger from llama_stack.providers.utils.inference.inference_store import InferenceStore +from llama_stack.telemetry.inference_metrics import ( + create_inference_metric_attributes, + inference_duration, + inference_time_to_first_token, + inference_tokens_per_second, +) from llama_stack_api import ( GetChatCompletionRequest, HealthResponse, @@ -208,9 +214,40 @@ async def openai_chat_completion( messages=params.messages, ) - response = await self._nonstream_openai_chat_completion(provider, params) + start_time = time.perf_counter() + status = "success" + try: + response = await self._nonstream_openai_chat_completion(provider, params) + except asyncio.CancelledError: + status = "error" + raise + except Exception: + status = "error" + raise + finally: + duration = time.perf_counter() - start_time + attrs = create_inference_metric_attributes( + model=request_model_id, + provider=provider.__provider_id__, + stream=False, + status=status, + ) + inference_duration.record(duration, attributes=attrs) + response.model = 
request_model_id + if response.usage and response.usage.completion_tokens and duration > 0: + tokens_per_sec = response.usage.completion_tokens / duration + inference_tokens_per_second.record( + tokens_per_sec, + attributes=create_inference_metric_attributes( + model=request_model_id, + provider=provider.__provider_id__, + stream=False, + status="success", + ), + ) + # Store the response with the ID that will be returned to the client if self.store: asyncio.create_task(self.store.store_chat_completion(response, params.messages)) @@ -295,6 +332,10 @@ async def stream_tokens_and_compute_metrics_openai_chat( id = None created = None choices_data: dict[int, dict[str, Any]] = {} + start_time = time.perf_counter() + first_token_time: float | None = None + completion_tokens: int | None = None + status = "success" try: async for chunk in response: @@ -326,6 +367,8 @@ async def stream_tokens_and_compute_metrics_openai_chat( if choice_delta.delta: delta = choice_delta.delta if delta.content: + if first_token_time is None: + first_token_time = time.perf_counter() current_choice_data["content_parts"].append(delta.content) if delta.tool_calls: for tool_call_delta in delta.tool_calls: @@ -380,6 +423,10 @@ async def stream_tokens_and_compute_metrics_openai_chat( choice_delta.logprobs.content = converted_logprobs current_choice_data["logprobs_content_parts"].extend(converted_logprobs) + # Capture usage from the final chunk (providers send usage in the last chunk) + if chunk.usage and chunk.usage.completion_tokens: + completion_tokens = chunk.usage.completion_tokens + # Compute metrics on final chunk if chunk.choices and chunk.choices[0].finish_reason: completion_text = "" @@ -387,7 +434,30 @@ async def stream_tokens_and_compute_metrics_openai_chat( completion_text += "".join(choice_data["content_parts"]) yield chunk + except asyncio.CancelledError: + status = "error" + raise + except Exception: + status = "error" + raise finally: + duration = time.perf_counter() - start_time + 
attrs = create_inference_metric_attributes( + model=fully_qualified_model_id, + provider=provider_id, + stream=True, + status=status, + ) + inference_duration.record(duration, attributes=attrs) + + if first_token_time is not None: + ttft = first_token_time - start_time + inference_time_to_first_token.record(ttft, attributes=attrs) + + if completion_tokens and duration > 0: + tokens_per_sec = completion_tokens / duration + inference_tokens_per_second.record(tokens_per_sec, attributes=attrs) + # Store the final assembled completion if id and self.store and messages: assembled_choices: list[OpenAIChoice] = [] diff --git a/src/llama_stack/telemetry/constants.py b/src/llama_stack/telemetry/constants.py index 8a1e692410..858565f475 100644 --- a/src/llama_stack/telemetry/constants.py +++ b/src/llama_stack/telemetry/constants.py @@ -58,6 +58,14 @@ REQUEST_DURATION_SECONDS = f"{REQUEST_PREFIX}_duration_seconds" CONCURRENT_REQUESTS = f"{llama_stack_prefix}.concurrent_requests" +# Inference Metrics +# These constants define the names for OpenTelemetry metrics tracking inference operations +INFERENCE_PREFIX = f"{llama_stack_prefix}.inference" + +INFERENCE_DURATION = f"{INFERENCE_PREFIX}.duration_seconds" +INFERENCE_TIME_TO_FIRST_TOKEN = f"{INFERENCE_PREFIX}.time_to_first_token_seconds" +INFERENCE_TOKENS_PER_SECOND = f"{INFERENCE_PREFIX}.tokens_per_second" + # Responses API Metrics RESPONSES_PREFIX = f"{llama_stack_prefix}.responses" RESPONSES_PARAMETER_USAGE_TOTAL = f"{RESPONSES_PREFIX}.parameter_usage_total" diff --git a/src/llama_stack/telemetry/inference_metrics.py b/src/llama_stack/telemetry/inference_metrics.py new file mode 100644 index 0000000000..6c4f3cff72 --- /dev/null +++ b/src/llama_stack/telemetry/inference_metrics.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +""" +OpenTelemetry metrics for llama-stack inference operations. + +This module provides centralized metric definitions for tracking: +- Inference duration (end-to-end latency for chat completions) +- Time to first token (streaming requests only) +- Tokens per second (output throughput) + +All metrics are OpenTelemetry histograms created on the "llama_stack.inference" meter +and use the llama_stack prefix for consistent naming across the telemetry stack. +""" + +from opentelemetry import metrics +from opentelemetry.metrics import Histogram + +from .constants import ( + INFERENCE_DURATION, + INFERENCE_TIME_TO_FIRST_TOKEN, + INFERENCE_TOKENS_PER_SECOND, +) + +# Get or create meter for llama_stack.inference +meter = metrics.get_meter("llama_stack.inference", version="1.0.0") + +inference_duration: Histogram = meter.create_histogram( + name=INFERENCE_DURATION, + description="Duration of inference requests from start to completion", + unit="s", +) + +inference_time_to_first_token: Histogram = meter.create_histogram( + name=INFERENCE_TIME_TO_FIRST_TOKEN, + description="Time from request start to first content token (streaming only)", + unit="s", +) + +inference_tokens_per_second: Histogram = meter.create_histogram( + name=INFERENCE_TOKENS_PER_SECOND, + description="Output token throughput (completion tokens / duration)", +) + + +def create_inference_metric_attributes( + model: str | None = None, + provider: str | None = None, + stream: bool | None = None, + status: str | None = None, +) -> dict[str, str]: + """Create a consistent attribute dictionary for inference metrics. 
+ + Args: + model: Fully qualified model ID (e.g., "openai/gpt-4o-mini") + provider: Provider ID (e.g., "openai") + stream: Whether this is a streaming request (emitted as "true" or "false") + status: Request outcome ("success", "error") + + Returns: + Dictionary holding only the arguments that were not None, keyed by argument name + """ + attributes: dict[str, str] = {} + + if model is not None: + attributes["model"] = model + if provider is not None: + attributes["provider"] = provider + if stream is not None: + attributes["stream"] = str(stream).lower() + if status is not None: + attributes["status"] = status + + return attributes diff --git a/tests/unit/telemetry/test_inference_metrics.py b/tests/unit/telemetry/test_inference_metrics.py new file mode 100644 index 0000000000..245870c5ef --- /dev/null +++ b/tests/unit/telemetry/test_inference_metrics.py @@ -0,0 +1,391 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
"""Unit tests for inference metrics."""

import time
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from llama_stack.core.routers.inference import InferenceRouter
from llama_stack.telemetry.inference_metrics import (
    create_inference_metric_attributes,
    inference_duration,
    inference_time_to_first_token,
    inference_tokens_per_second,
)
from llama_stack_api import (
    ModelType,
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
    OpenAIChatCompletionRequestWithExtraBody,
    OpenAIChatCompletionResponseMessage,
    OpenAIChoice,
)

# Identifiers shared by the fixtures and the assertions below.
_MODEL_ID = "openai/gpt-4o-mini"
_PROVIDER_ID = "openai"
_PROVIDER_MODEL_ID = "gpt-4o-mini"


def _assert_is_instrument(instrument):
    """Check that a metric instrument exists and exposes ``record``."""
    assert instrument is not None
    assert hasattr(instrument, "record")


def _success_attributes(stream):
    """Attributes describing a successful request against the test model."""
    return create_inference_metric_attributes(
        model=_MODEL_ID,
        provider=_PROVIDER_ID,
        stream=stream,
        status="success",
    )


def _recorded_value(record_spy):
    """First positional argument of the most recent ``record`` call."""
    return record_spy.call_args.args[0]


def _recorded_attributes(record_spy):
    """``attributes`` keyword argument of the most recent ``record`` call."""
    return record_spy.call_args.kwargs["attributes"]


async def _drain(stream):
    """Iterate an async stream to exhaustion, discarding every item."""
    async for _ in stream:
        pass


class TestInferenceMetricAttributes:
    """Test metric attribute creation utility."""

    def test_all_fields(self):
        expected = {
            "model": "openai/gpt-4o-mini",
            "provider": "openai",
            "stream": "true",
            "status": "success",
        }
        built = create_inference_metric_attributes(
            model=_MODEL_ID,
            provider=_PROVIDER_ID,
            stream=True,
            status="success",
        )
        assert built == expected

    def test_partial_fields(self):
        built = create_inference_metric_attributes(model=_MODEL_ID, status="error")
        assert built == {
            "model": "openai/gpt-4o-mini",
            "status": "error",
        }
        # Arguments that were not supplied must be omitted entirely.
        assert "provider" not in built
        assert "stream" not in built

    def test_empty(self):
        assert create_inference_metric_attributes() == {}

    def test_stream_false(self):
        assert create_inference_metric_attributes(stream=False) == {"stream": "false"}


class TestInferenceMetricInstruments:
    """Test that metric instruments are properly defined."""

    def test_inference_duration_exists(self):
        _assert_is_instrument(inference_duration)

    def test_inference_time_to_first_token_exists(self):
        _assert_is_instrument(inference_time_to_first_token)

    def test_inference_tokens_per_second_exists(self):
        _assert_is_instrument(inference_tokens_per_second)

    def test_inference_duration_can_record(self):
        inference_duration.record(1.234, _success_attributes(stream=False))

    def test_inference_time_to_first_token_can_record(self):
        inference_time_to_first_token.record(0.123, _success_attributes(stream=True))

    def test_inference_tokens_per_second_can_record(self):
        inference_tokens_per_second.record(42.5, _success_attributes(stream=True))


class TestInferenceMetricsConstants:
    """Test that metric constants are properly defined."""

    def test_metric_names_follow_convention(self):
        from llama_stack.telemetry.constants import (
            INFERENCE_DURATION,
            INFERENCE_TIME_TO_FIRST_TOKEN,
            INFERENCE_TOKENS_PER_SECOND,
        )

        every_name = (
            INFERENCE_DURATION,
            INFERENCE_TIME_TO_FIRST_TOKEN,
            INFERENCE_TOKENS_PER_SECOND,
        )
        time_based_names = (INFERENCE_DURATION, INFERENCE_TIME_TO_FIRST_TOKEN)

        for metric_name in every_name:
            assert metric_name.startswith("llama_stack.")

        for metric_name in every_name:
            assert "inference" in metric_name

        for metric_name in time_based_names:
            assert metric_name.endswith("_seconds")


def _make_router_and_provider():
    """Build an InferenceRouter backed by a mocked routing table.

    Returns the router together with the provider mock so that each test can
    install its own ``openai_chat_completion`` behaviour.
    """
    model_entry = MagicMock(
        identifier=_MODEL_ID,
        model_type=ModelType.llm,
        provider_resource_id=_PROVIDER_MODEL_ID,
    )

    provider = AsyncMock()
    provider.__provider_id__ = _PROVIDER_ID

    routing_table = MagicMock()
    routing_table.get_object_by_identifier = AsyncMock(return_value=model_entry)
    routing_table.get_provider_impl = AsyncMock(return_value=provider)

    return InferenceRouter(routing_table=routing_table), provider


def _make_chat_params(**kwargs):
    """Build a minimal chat completion request; ``kwargs`` override defaults."""
    fields = {
        "model": _MODEL_ID,
        "messages": [{"role": "user", "content": "Hello"}],
        **kwargs,
    }
    return OpenAIChatCompletionRequestWithExtraBody(**fields)


def _make_completion_response(**kwargs):
    """Build a minimal non-streaming response; ``kwargs`` override defaults."""
    assistant_reply = OpenAIChatCompletionResponseMessage(
        role="assistant",
        content="Hello!",
    )
    fields = {
        "id": "chatcmpl-123",
        "choices": [OpenAIChoice(index=0, finish_reason="stop", message=assistant_reply)],
        "created": int(time.time()),
        "model": _PROVIDER_MODEL_ID,
        "object": "chat.completion",
        **kwargs,
    }
    return OpenAIChatCompletion(**fields)


def _non_streaming_router(**provider_behaviour):
    """Router whose provider mock is an AsyncMock configured with the given kwargs."""
    router, provider = _make_router_and_provider()
    provider.openai_chat_completion = AsyncMock(**provider_behaviour)
    return router


class TestNonStreamingInferenceMetrics:
    """Test that non-streaming chat completions record metrics."""

    async def test_records_duration_on_success(self):
        router = _non_streaming_router(return_value=_make_completion_response())

        with patch.object(inference_duration, "record") as record_spy:
            await router.openai_chat_completion(_make_chat_params(stream=False))

        record_spy.assert_called_once()
        attributes = _recorded_attributes(record_spy)
        assert _recorded_value(record_spy) > 0
        assert attributes["model"] == "openai/gpt-4o-mini"
        assert attributes["provider"] == "openai"
        assert attributes["stream"] == "false"
        assert attributes["status"] == "success"

    async def test_records_duration_on_error(self):
        router = _non_streaming_router(side_effect=RuntimeError("provider error"))
        request = _make_chat_params(stream=False)

        with (
            patch.object(inference_duration, "record") as record_spy,
            pytest.raises(RuntimeError, match="provider error"),
        ):
            await router.openai_chat_completion(request)

        record_spy.assert_called_once()
        assert _recorded_attributes(record_spy)["status"] == "error"

    async def test_records_tokens_per_second_when_usage_present(self):
        from llama_stack_api.inference.models import OpenAIChatCompletionUsage

        token_usage = OpenAIChatCompletionUsage(completion_tokens=50, prompt_tokens=10, total_tokens=60)
        router = _non_streaming_router(return_value=_make_completion_response(usage=token_usage))

        with patch.object(inference_tokens_per_second, "record") as record_spy:
            await router.openai_chat_completion(_make_chat_params(stream=False))

        record_spy.assert_called_once()
        assert _recorded_value(record_spy) > 0
        assert _recorded_attributes(record_spy)["status"] == "success"

    async def test_no_tokens_per_second_without_usage(self):
        router = _non_streaming_router(return_value=_make_completion_response(usage=None))

        with patch.object(inference_tokens_per_second, "record") as record_spy:
            await router.openai_chat_completion(_make_chat_params(stream=False))

        record_spy.assert_not_called()


async def _make_streaming_chunks(chunks):
    """Expose a list of chunks as an async iterator."""
    for item in chunks:
        yield item


def _make_chunk(
    chunk_id="chatcmpl-123",
    content=None,
    finish_reason=None,
    usage=None,
):
    """Build a minimal streaming chunk containing a single choice."""
    from llama_stack_api.inference.models import OpenAIChoiceDelta, OpenAIChunkChoice

    # The role is only set when the delta actually carries content.
    role = "assistant" if content else None
    only_choice = OpenAIChunkChoice(
        index=0,
        delta=OpenAIChoiceDelta(content=content, role=role),
        finish_reason=finish_reason,
    )

    return OpenAIChatCompletionChunk(
        id=chunk_id,
        choices=[only_choice],
        created=int(time.time()),
        model=_PROVIDER_MODEL_ID,
        object="chat.completion.chunk",
        usage=usage,
    )


def _streaming_router(chunks):
    """Router whose provider mock returns ``chunks`` as an async stream."""
    router, provider = _make_router_and_provider()
    provider.openai_chat_completion = AsyncMock(return_value=_make_streaming_chunks(chunks))
    return router


async def _run_streaming_request(router):
    """Issue a streaming chat completion and consume the whole stream."""
    stream = await router.openai_chat_completion(_make_chat_params(stream=True))
    await _drain(stream)


class TestStreamingInferenceMetrics:
    """Test that streaming chat completions record metrics."""

    async def test_records_duration(self):
        router = _streaming_router(
            [
                _make_chunk(content="Hello"),
                _make_chunk(content=" world"),
                _make_chunk(finish_reason="stop"),
            ]
        )

        with patch.object(inference_duration, "record") as record_spy:
            await _run_streaming_request(router)

        record_spy.assert_called_once()
        attributes = _recorded_attributes(record_spy)
        assert _recorded_value(record_spy) > 0
        assert attributes["stream"] == "true"
        assert attributes["status"] == "success"

    async def test_records_ttft_on_first_content(self):
        router = _streaming_router(
            [
                _make_chunk(content="Hello"),
                _make_chunk(content=" world"),
                _make_chunk(finish_reason="stop"),
            ]
        )

        with patch.object(inference_time_to_first_token, "record") as record_spy:
            await _run_streaming_request(router)

        record_spy.assert_called_once()
        assert _recorded_value(record_spy) >= 0
        assert _recorded_attributes(record_spy)["stream"] == "true"

    async def test_no_ttft_without_content(self):
        router = _streaming_router([_make_chunk(finish_reason="stop")])

        with patch.object(inference_time_to_first_token, "record") as record_spy:
            await _run_streaming_request(router)

        record_spy.assert_not_called()

    async def test_records_tokens_per_second_from_usage(self):
        from llama_stack_api.inference.models import OpenAIChatCompletionUsage

        token_usage = OpenAIChatCompletionUsage(completion_tokens=100, prompt_tokens=10, total_tokens=110)
        router = _streaming_router(
            [
                _make_chunk(content="Hello"),
                _make_chunk(finish_reason="stop", usage=token_usage),
            ]
        )

        with patch.object(inference_tokens_per_second, "record") as record_spy:
            await _run_streaming_request(router)

        record_spy.assert_called_once()
        assert _recorded_value(record_spy) > 0

    async def test_no_tokens_per_second_without_usage(self):
        router = _streaming_router(
            [
                _make_chunk(content="Hello"),
                _make_chunk(finish_reason="stop"),
            ]
        )

        with patch.object(inference_tokens_per_second, "record") as record_spy:
            await _run_streaming_request(router)

        record_spy.assert_not_called()

    async def test_records_error_status_on_exception(self):
        async def failing_stream():
            yield _make_chunk(content="Hello")
            raise RuntimeError("stream error")

        router, provider = _make_router_and_provider()
        provider.openai_chat_completion = AsyncMock(return_value=failing_stream())
        request = _make_chat_params(stream=True)

        with patch.object(inference_duration, "record") as record_spy:
            stream = await router.openai_chat_completion(request)
            # Only consuming the stream is expected to raise, not creating it.
            with pytest.raises(RuntimeError, match="stream error"):
                await _drain(stream)

        record_spy.assert_called_once()
        assert _recorded_attributes(record_spy)["status"] == "error"