diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index dedcaf7..411818c 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -150,4 +150,31 @@ All session context fields fall back to `"not in session"` (use `SessionLogging. - [ ] New iRacing SDK event handler → structured log with `domain="iracing"` - [ ] `iracing_incident` / `incident_detected` log → full uniqueness signature (`unique_user_id`, start/end frame, camera) -**Canonical reference:** [docs/RULES-ActionCoverage.md](../docs/RULES-ActionCoverage.md) \ No newline at end of file +**Canonical reference:** [docs/RULES-ActionCoverage.md](../docs/RULES-ActionCoverage.md) + +--- + +## Grafana Alert Covenant + +Every behavioral change to the plugin, dashboard, or LLM integration MUST include a Grafana alert review. **Alert silence ≠ alert passing.** + +### Change → Domain quick-reference + +| Change type | Domain to check | +|---|---| +| New `DispatchAction` branch | Domain 3 — `action-failure-streak` thresholds | +| New iRacing SDK event | Domains 3 + 7 — session/replay rules | +| New Claude API integration | Domains 4 + 5 — session health + cost | +| New MCP tool | Domain 4 — `mcp-service-errors`, `tool-loop-detected` | +| Log event renamed/removed | Search alert YAMLs — alert will go **silent**, not fire | +| New log event/field | Consider whether a new alert rule is warranted | +| Sentinel code change | Domain 6 — self-health rules | + +### PR Checklist addition + +- [ ] Reviewed impacted Grafana alert domains (see table above) +- [ ] Verified no alert queries break silently if log events were renamed/removed +- [ ] Considered new alert rule if new log events were added + +**Alert YAML files:** `observability/local/grafana/provisioning/alerting/` (46 rules, 8 domains) +**Canonical reference:** [docs/RULES-GrafanaAlerts.md](../docs/RULES-GrafanaAlerts.md) \ No newline at end of file diff --git a/.playwright-mcp/console-2026-03-26T21-06-47-084Z.log b/.playwright-mcp/console-2026-03-26T21-06-47-084Z.log 
new file mode 100644 index 0000000..c4778cd --- /dev/null +++ b/.playwright-mcp/console-2026-03-26T21-06-47-084Z.log @@ -0,0 +1,2 @@ +[ 885ms] [WARNING] is deprecated. Please include @ http://localhost:3000/login:0 +[ 23600ms] [ERROR] Failed to load resource: the server responded with a status of 401 (Unauthorized) @ http://localhost:3000/login:0 diff --git a/.playwright-mcp/console-2026-03-26T21-07-25-658Z.log b/.playwright-mcp/console-2026-03-26T21-07-25-658Z.log new file mode 100644 index 0000000..6b1e95a --- /dev/null +++ b/.playwright-mcp/console-2026-03-26T21-07-25-658Z.log @@ -0,0 +1,3 @@ +[ 292ms] [WARNING] is deprecated. Please include @ http://localhost:3000/login:0 +[ 15683ms] [ERROR] Failed to load resource: the server responded with a status of 401 (Unauthorized) @ http://localhost:3000/login:0 +[ 49060ms] [ERROR] Failed to load resource: the server responded with a status of 401 (Unauthorized) @ http://localhost:3000/login:0 diff --git a/.playwright-mcp/console-2026-03-26T21-11-17-962Z.log b/.playwright-mcp/console-2026-03-26T21-11-17-962Z.log new file mode 100644 index 0000000..1b67327 --- /dev/null +++ b/.playwright-mcp/console-2026-03-26T21-11-17-962Z.log @@ -0,0 +1,2 @@ +[ 602ms] [WARNING] is deprecated. Please include @ http://localhost:3000/login:0 +[ 18682ms] [WARNING] is deprecated. Please include @ http://localhost:3000/d/claude-token-cost?orgId=1&from=now-7d&to=now&kiosk:0 diff --git a/.playwright-mcp/console-2026-03-26T21-11-55-773Z.log b/.playwright-mcp/console-2026-03-26T21-11-55-773Z.log new file mode 100644 index 0000000..a28d881 --- /dev/null +++ b/.playwright-mcp/console-2026-03-26T21-11-55-773Z.log @@ -0,0 +1 @@ +[ 648ms] [WARNING] is deprecated. 
Please include @ http://localhost:3000/d/claude-cache-context?orgId=1&from=now-7d&to=now:0 diff --git a/.playwright-mcp/console-2026-03-26T21-12-23-787Z.log b/.playwright-mcp/console-2026-03-26T21-12-23-787Z.log new file mode 100644 index 0000000..c11747f --- /dev/null +++ b/.playwright-mcp/console-2026-03-26T21-12-23-787Z.log @@ -0,0 +1,2 @@ +[ 636ms] [WARNING] is deprecated. Please include @ http://localhost:3000/d/simsteward-log-sentinel?orgId=1&from=now-6h&to=now:0 +[ 59082ms] [ERROR] WebSocket connection to 'ws://localhost:3000/api/live/ws' failed: Connection closed before receiving a handshake response @ http://localhost:3000/public/build/3855.c53eb219979d7cb3b2d4.js:1312 diff --git a/cache-context-dashboard.png b/cache-context-dashboard.png new file mode 100644 index 0000000..000cc30 Binary files /dev/null and b/cache-context-dashboard.png differ diff --git a/docs/RULES-GrafanaAlerts.md b/docs/RULES-GrafanaAlerts.md new file mode 100644 index 0000000..7692acd --- /dev/null +++ b/docs/RULES-GrafanaAlerts.md @@ -0,0 +1,93 @@ +# Grafana Alert Rules — Development Covenant + +Every behavioral change to the plugin, dashboard, or LLM integration **must include a +corresponding Grafana alert review**. Silence is not the same as passing. + +**Canonical spec:** `docs/superpowers/specs/2026-03-30-grafana-alerts-design.md` +**Alert YAML files:** `observability/local/grafana/provisioning/alerting/` + +--- + +## Change → Domain Mapping + +| Change type | Domain(s) to review | +|---|---| +| New action handler in `DispatchAction` | Domain 3 — check `action-failure-streak` thresholds | +| New iRacing SDK event handler | Domain 3 and/or Domain 7 — check incident/replay rules | +| New Claude API integration | Domains 4 + 5 — session health and cost rules | +| New MCP tool added | Domain 4 — `mcp-service-errors`, `tool-loop-detected` | +| New log event or field added | Check all domains — does it need a new alert? 
| +| Removing or renaming a log event | Search alert YAMLs for old name — alert will go **silent**, not fire | +| Changing cost fields in token metrics | Domain 5 — all cost threshold alerts | +| Changing session lifecycle events | Domains 3, 4, 8 — session start/end correlation | +| Sentinel code change | Domain 6 — self-health rules | +| Grafana dashboard change | Domain 8 — cross-stream rules may need annotation updates | + +--- + +## Alert Silence ≠ Alert Passing + +When you rename or remove a log event: +- The alert query will return **no data** (not 0) +- If `noDataState: OK` — the alert silently stops firing +- This is a **silent regression** — harder to detect than a real alert + +Always check `noDataState` when modifying events that existing alerts depend on. + +--- + +## Testing New Alerts + +To verify an alert fires correctly before relying on it: + +1. **Write a test event to Loki** via the gateway: + ```bash + curl -X POST http://localhost:3500/loki/api/v1/push \ + -H "Content-Type: application/json" \ + -d '{ + "streams": [{ + "stream": {"app": "sim-steward", "env": "local", "level": "ERROR"}, + "values": [["'"$(date +%s%N)"'", "{\"level\":\"ERROR\",\"event\":\"test\",\"message\":\"test alert\"}"]] + }] + }' + ``` + +2. **Temporarily lower the threshold** in the alert rule to `0` and set the evaluation interval to `10s` in Grafana UI (do not commit this change). + +3. **Verify the alert fires** in Grafana UI → Alerting → Alert Rules within the evaluation window. + +4. **Verify the `/trigger` webhook** receives the payload: + ```bash + # Check log-sentinel logs + docker compose logs log-sentinel --tail=20 + ``` + +5. **Restore the threshold** before committing any YAML changes. 
+ +--- + +## Alert Catalog Summary + +| File | Domains | Count | +|---|---|---| +| `rules-infrastructure.yml` | 1+2: Infrastructure & Deploy Quality | 10 | +| `rules-iracing.yml` | 3+7: iRacing Session + Replay | 10 | +| `rules-claude-sessions.yml` | 4: Claude Code Session Health | 7 | +| `rules-token-cost.yml` | 5: Token & Cost Budget | 7 | +| `rules-sentinel-health.yml` | 6: Sentinel Self-Health | 7 | +| `rules-cross-stream.yml` | 8: Cross-Stream Correlation | 5 | +| **Total** | | **46** | + +T2-tier alerts (skip `needs_t2` gate, escalate immediately): +`subagent-explosion`, `tool-loop-detected`, `session-cost-critical`, `daily-spend-critical`, +`ws-claude-coinflict`, `session-token-abandon`, `action-fail-session-fail`, `deploy-triple-signal` + +--- + +## PR Checklist Addition + +For any PR modifying plugin behavior, add to the review checklist: + +- [ ] Reviewed Grafana alert domains for impacted change type (see table above) +- [ ] If log events were renamed/removed: verified no alert queries silently break +- [ ] If new log events added: considered whether a new alert rule is warranted diff --git a/docs/superpowers/specs/2026-03-30-grafana-alerts-design.md b/docs/superpowers/specs/2026-03-30-grafana-alerts-design.md new file mode 100644 index 0000000..88c365c --- /dev/null +++ b/docs/superpowers/specs/2026-03-30-grafana-alerts-design.md @@ -0,0 +1,218 @@ +# Grafana Alerts Design — Log Sentinel Layer 0 +**Date:** 2026-03-30 +**Status:** Approved + +--- + +## Context + +The log-sentinel V2 LLM investigation pipeline (T1 triage + T2 agentic tool loop) is expensive to run continuously — qwen3:8b T1 scan on a 6700 XT takes 60-90 seconds, T2 takes 3-4 minutes. Running this on a fixed hourly poll means real incidents can sit undetected for up to 60 minutes, and the models waste cycles on quiet periods. + +Grafana Alerts solves this as **Layer 0**: always-on, no GPU cost, fires webhooks only when something is actually wrong. 
The sentinel switches from polling to event-driven. When Grafana fires, it delivers structured alert context (labels, values, timeframe) directly in the webhook payload — T1 skips cold-start gathering for the relevant domain and goes straight to targeted investigation. + +**Layer 0 (Grafana Alerts) → Layer 1 (T1 fast triage) → Layer 2 (T2 agentic tool loop)** + +--- + +## Alert Architecture + +### Transport: Webhook-Only +Grafana alert notifications route exclusively to log-sentinel's `/trigger` HTTP endpoint. No email, Slack, or PagerDuty at this stage. The sentinel logs every trigger, runs the appropriate tier, and emits findings to Loki (queryable by Grafana dashboards). + +### Provisioning Structure +All alerts are provisioned as code — no manual UI configuration: +``` +observability/local/grafana/provisioning/alerting/ + contact-points.yml # webhook endpoint definition + notification-policies.yml # routing: all alerts → webhook + rules-infrastructure.yml # Domains 1+2 + rules-iracing.yml # Domain 3+7 + rules-claude-sessions.yml # Domain 4 + rules-token-cost.yml # Domain 5 + rules-sentinel-health.yml # Domain 6 + rules-cross-stream.yml # Domain 8 +``` + +### Trigger Tier Labeling +Every alert rule carries a `trigger_tier` label (`t1` or `t2`). The sentinel's `/trigger` endpoint reads this label and routes accordingly — T1 for most alerts, T2 for critical multi-signal correlations. 
+ +--- + +## Alert Catalog + +### Domain 1+2: Infrastructure & Deploy Quality (10 alerts) + +| Alert ID | LogQL / Condition | Severity | Tier | +|---|---|---|---| +| `bridge-start-failed` | `count_over_time({app="sim-steward"} \| json \| event="plugin_lifecycle" \| level="ERROR" [5m]) > 0` | critical | T1 | +| `plugin-never-ready` | plugin_lifecycle start, no ready within 60s | warn | T1 | +| `sentinel-cycle-stalled` | No `sentinel_cycle` event in 90 min | critical | T1 | +| `ollama-unreachable` | `sentinel_health` event with `ollama_reachable=false` | critical | T1 | +| `loki-circuit-open` | `sentinel_health` with `loki_circuit_open=true` | critical | T1 | +| `post-deploy-warn-rate` | WARN rate > 5/min in 10 min after lifecycle event | warn | T1 | +| `bridge-failure-post-deploy` | ERROR in sim-steward within 15 min of plugin_start | critical | T1 | +| `plugin-slow-start` | Time from plugin_lifecycle start → ready > 30s | warn | T1 | +| `error-spike-post-deploy` | Error count doubles vs prior 15 min window after deploy | warn | T1 | +| `error-spike-general` | `count_over_time({app="sim-steward"} \| json \| level="ERROR" [10m]) > 10` | warn | T1 | + +### Domain 3: iRacing Session Behavior (5 alerts) + +| Alert ID | Condition | Severity | Tier | +|---|---|---|---| +| `session-no-actions` | Session active 15+ min, zero `action_dispatched` events | warn | T1 | +| `session-no-end` | `iracing_session_start` with no `iracing_session_end` within 4h | warn | T1 | +| `action-failure-streak` | 3+ consecutive `action_result` errors in same session | critical | T1 | +| `websocket-disconnect-spike` | 3+ `websocket_disconnect` events in 5 min | warn | T1 | +| `incident-detection-zero` | iRacing session > 30 min, zero `iracing_incident` events | warn | T1 | + +### Domain 4: Claude Code Session Health (7 alerts) + +| Alert ID | Condition | Severity | Tier | +|---|---|---|---| +| `session-abandoned` | Session start, no completion token entry, no activity for 30 min | warn | T1 | +| 
`claude-error-spike` | 5+ ERROR entries in claude-dev-logging in 5 min | warn | T1 | +| `permission-flood` | 10+ permission-related log entries in 5 min | warn | T1 | +| `subagent-explosion` | Subagent spawn count > 20 in single session | warn | T2 | +| `mcp-service-errors` | MCP call failures > 5 in 10 min | warn | T1 | +| `tool-loop-detected` | Same tool called 5+ times in same session without progress | warn | T2 | +| `session-zero-output` | Session completes (token entry exists), zero assistant messages logged | warn | T1 | + +### Domain 5: Token/Cost Budget (7 alerts) + +| Alert ID | Condition | Severity | Tier | +|---|---|---|---| +| `session-cost-spike` | Single session cost > $1.00 | warn | T1 | +| `session-cost-critical` | Single session cost > $3.00 | critical | T2 | +| `daily-spend-warning` | Rolling 24h spend > $10.00 | warn | T1 | +| `daily-spend-critical` | Rolling 24h spend > $25.00 | critical | T2 | +| `tool-use-flood` | Tool calls per session > 100 | warn | T1 | +| `unexpected-model` | Model field not in approved set (claude-opus-4, claude-sonnet-4-6, etc.) 
| warn | T1 | +| `cache-hit-rate-low` | Cache hit rate < 20% over 1h (when sessions active) | info | T1 | + +### Domain 6: Sentinel Self-Health (7 alerts) + +| Alert ID | Condition | Severity | Tier | +|---|---|---|---| +| `sentinel-cycle-stalled` | No `sentinel_cycle` event in 90 min | critical | T1 | +| `detector-error-rate` | Detector errors > 3 in single cycle | warn | T1 | +| `t1-slow` | T1 inference duration > 120s | warn | T1 | +| `t2-slow` | T2 tool loop duration > 300s | warn | T1 | +| `sentry-flood` | Sentry-worthy findings > 5 in 1h | warn | T1 | +| `findings-flood` | Total findings > 20 in single cycle | warn | T1 | +| `zero-findings-48h` | No findings at all in 48h (system may be suppressing) | info | T1 | + +### Domain 7: Replay & Incident Investigation (5 alerts) + +| Alert ID | Condition | Severity | Tier | +|---|---|---|---| +| `replay-no-seeks` | Replay started, zero `iracing_replay_seek` in 5 min | warn | T1 | +| `incident-detection-stall` | iRacing session active > 30 min, zero `iracing_incident` events in replay mode | warn | T1 | +| `incident-camera-stuck` | Same `camera_view` on 3+ consecutive incidents | info | T1 | +| `replay-session-no-close` | Replay session start, no session_end within 2h | warn | T1 | +| `action-incident-gap` | Incident detected, no `action_dispatched` within 10 min | info | T1 | + +### Domain 8: Cross-Stream Correlation (5 alerts) +*Implemented as multi-query rules using Grafana `math` expressions — fires only when both conditions true simultaneously.* + +| Alert ID | Streams | Condition | Severity | Tier | +|---|---|---|---|---| +| `ws-claude-coinflict` | sim-steward + claude-dev-logging | WebSocket disconnect + Claude ERROR in same 5-min window | warn | T2 | +| `session-token-abandon` | claude-dev-logging + claude-token-metrics | Session ERROR + no token entry for that session_id | warn | T2 | +| `action-fail-session-fail` | sim-steward + claude-dev-logging | `action_result` errors + Claude session ERROR within 10 
min | critical | T2 | +| `deploy-triple-signal` | all 3 streams | 2+ streams show elevated error rate within 15 min of plugin lifecycle event | critical | T2 | +| `cost-spike-tool-flood` | claude-dev-logging + claude-token-metrics | Tool call count spike + session cost spike in same cycle | warn | T1 | + +**Total: 46 alerts across 8 domains.** + +--- + +## `/trigger` Endpoint Design + +The log-sentinel app gains a new HTTP endpoint: + +``` +POST /trigger +Content-Type: application/json + +{ + "alerts": [{ + "labels": { + "alertname": "ws-claude-coinflict", + "trigger_tier": "t2", + "severity": "warn" + }, + "annotations": { + "summary": "WebSocket disconnects co-occurring with Claude errors", + "description": "3 ws_disconnect events and 2 Claude ERROR entries in 5-min window ending 14:32:00" + }, + "startsAt": "2026-03-30T14:32:00Z", + "endsAt": "0001-01-01T00:00:00Z" + }] +} +``` + +Sentinel behavior on receipt: +1. Parse alert labels — extract `alertname`, `trigger_tier`, `severity` +2. Derive lookback window from `startsAt` (default: 30 min before alert fired) +3. If `trigger_tier=t1`: run T1 with alert context injected into summary prompt +4. If `trigger_tier=t2`: run T1 (for context) then immediately run T2 — skip the `needs_t2` gate +5. Deduplicate: if the same `alertname` triggered within `SENTINEL_DEDUP_WINDOW_SEC`, skip +6. Log `sentinel_trigger` event to Loki with alert metadata + +Alert context injection into T1 prompt: +``` +ALERT CONTEXT (from Grafana): + Alert: ws-claude-coinflict (warn) + Fired: 2026-03-30 14:32:00 UTC + Description: 3 ws_disconnect events and 2 Claude ERROR entries in 5-min window + → Focus investigation on this signal. Do not suppress even if recent history is quiet. 
+``` + +--- + +## Alert Covenant (Living Document) + +**Every behavioral change to the plugin, dashboard, or LLM integration must include a corresponding Grafana alert review.** + +When adding or changing: +- A new action handler → check Domain 3 (action-failure-streak thresholds) +- A new Claude integration → check Domain 4 + 5 +- A new log event or field → check if it should trigger an alert in the relevant domain +- Removing a log event → check if any alert depends on it (alert will go silent, not fire) + +Alert silence ≠ alert passing. Test new alerts by writing a test event to Loki via the gateway and verifying the alert fires within its evaluation window. + +**Canonical reference: `docs/RULES-GrafanaAlerts.md`** (to be added to CLAUDE.md) + +--- + +## Implementation Files + +### New files +- `observability/local/grafana/provisioning/alerting/contact-points.yml` +- `observability/local/grafana/provisioning/alerting/notification-policies.yml` +- `observability/local/grafana/provisioning/alerting/rules-infrastructure.yml` +- `observability/local/grafana/provisioning/alerting/rules-iracing.yml` +- `observability/local/grafana/provisioning/alerting/rules-claude-sessions.yml` +- `observability/local/grafana/provisioning/alerting/rules-token-cost.yml` +- `observability/local/grafana/provisioning/alerting/rules-sentinel-health.yml` +- `observability/local/grafana/provisioning/alerting/rules-cross-stream.yml` +- `docs/RULES-GrafanaAlerts.md` + +### Modified files +- `observability/local/log-sentinel/app.py` — add `POST /trigger` endpoint +- `observability/local/log-sentinel/sentinel.py` — add `trigger_cycle()` method (alert-context-aware T1/T2 dispatch) +- `observability/local/log-sentinel/config.py` — no new fields needed (uses existing dedup window) +- `observability/local/docker-compose.yml` — no changes needed (grafana already provisioned, port 3000) +- `.claude/CLAUDE.md` — add alert covenant reference + +--- + +## Verification + +1. 
**Provisioning loads**: `docker compose up grafana` — check Grafana UI → Alerting → Alert Rules shows all 46 rules +2. **Webhook fires**: Manually set an alert rule to always-firing in Grafana UI, verify `/trigger` receives POST and logs `sentinel_trigger` event to Loki +3. **T1 trigger path**: Confirm T1 runs after a non-critical alert fires, `sentinel_analyst_run` appears in logs with `trigger_source=grafana_alert` +4. **T2 direct trigger**: Confirm T2 runs immediately (skipping `needs_t2` gate) when `trigger_tier=t2` alert fires +5. **Dedup**: Fire same alert twice within dedup window, verify second is silently skipped +6. **Cross-stream rule**: Write test events to both sim-steward and claude-dev-logging streams via Loki push API, verify `ws-claude-coinflict` fires diff --git a/log-sentinel-dashboard.png b/log-sentinel-dashboard.png new file mode 100644 index 0000000..533bef0 Binary files /dev/null and b/log-sentinel-dashboard.png differ diff --git a/observability/local/docker-compose.yml b/observability/local/docker-compose.yml index df097fa..6164380 100644 --- a/observability/local/docker-compose.yml +++ b/observability/local/docker-compose.yml @@ -60,3 +60,33 @@ services: interval: 10s timeout: 5s retries: 5 + + log-sentinel: + build: ./log-sentinel + depends_on: + loki: + condition: service_healthy + ports: + - "8081:8081" + environment: + - LOKI_URL=http://loki:3100 + - GRAFANA_URL=http://grafana:3000 + - GRAFANA_USER=${GRAFANA_ADMIN_USER:-admin} + - GRAFANA_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} + - OLLAMA_URL=http://host.docker.internal:11434 + - OLLAMA_MODEL_FAST=qwen3:8b + - OLLAMA_MODEL_DEEP=qwen3:32b + - SENTINEL_POLL_INTERVAL_SEC=3600 + - SENTINEL_LOOKBACK_SEC=3600 + - SENTINEL_T2_ENABLED=true + - SENTINEL_T2_PROACTIVE_INTERVAL_SEC=3600 + - SENTINEL_DEDUP_WINDOW_SEC=300 + - SENTINEL_SENTRY_DSN=${SENTINEL_SENTRY_DSN:-} + - SIMSTEWARD_LOG_ENV=${SIMSTEWARD_LOG_ENV:-local} + volumes: + - 
${GRAFANA_STORAGE_PATH:-S:/sim-steward-grafana-storage}/log-sentinel:/data + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8081/health').read()"] + interval: 10s + timeout: 5s + retries: 5 diff --git a/observability/local/grafana/provisioning/alerting/contact-points.yml b/observability/local/grafana/provisioning/alerting/contact-points.yml new file mode 100644 index 0000000..4414ef7 --- /dev/null +++ b/observability/local/grafana/provisioning/alerting/contact-points.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +contactPoints: + - orgId: 1 + name: log-sentinel-webhook + receivers: + - uid: log-sentinel-webhook-recv + type: webhook + settings: + url: http://log-sentinel:8081/trigger + httpMethod: POST + disableResolveMessage: true diff --git a/observability/local/grafana/provisioning/alerting/notification-policies.yml b/observability/local/grafana/provisioning/alerting/notification-policies.yml new file mode 100644 index 0000000..f1d6e22 --- /dev/null +++ b/observability/local/grafana/provisioning/alerting/notification-policies.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +policies: + - orgId: 1 + receiver: log-sentinel-webhook + group_by: ['alertname'] + group_wait: 0s + group_interval: 1m + repeat_interval: 4h + routes: [] diff --git a/observability/local/grafana/provisioning/alerting/rules-claude-sessions.yml b/observability/local/grafana/provisioning/alerting/rules-claude-sessions.yml new file mode 100644 index 0000000..0e03d97 --- /dev/null +++ b/observability/local/grafana/provisioning/alerting/rules-claude-sessions.yml @@ -0,0 +1,246 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: Claude Code Session Health + folder: Log Sentinel + interval: 1m + rules: + + - uid: session-abandoned + title: Session Abandoned + condition: B + data: + - refId: A + relativeTimeRange: { from: 1800, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 
'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [30m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 30m + annotations: + summary: Claude session started but no activity for 30 minutes + description: Session start detected with no activity, errors, or completion events for 30 minutes + labels: + alertname: session-abandoned + severity: warn + trigger_tier: t1 + + - uid: claude-error-spike + title: Claude Error Spike + condition: B + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [5m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [4], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: 5+ ERROR entries in claude-dev-logging in 5 minutes + description: Elevated error rate in Claude session logging — possible API or tool failure + labels: + alertname: claude-error-spike + severity: warn + trigger_tier: t1 + + - uid: permission-flood + title: Permission Flood + condition: B + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-dev-logging"} | json | message=~"(?i).*permission.*" [5m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [9], type: gt } + 
operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: 10+ permission-related log entries in 5 minutes + description: Possible permission configuration problem or tool permission loop + labels: + alertname: permission-flood + severity: warn + trigger_tier: t1 + + - uid: subagent-explosion + title: Subagent Explosion + condition: B + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-dev-logging"} | json | message=~"(?i).*subagent.*spawn.*" [60m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [19], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Subagent spawn count exceeded 20 in single session + description: Unusually high subagent spawning — possible recursive agent loop or over-parallelization + labels: + alertname: subagent-explosion + severity: warn + trigger_tier: t2 + + - uid: mcp-service-errors + title: MCP Service Errors + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-dev-logging"} | json | message=~"(?i).*mcp.*error.*|.*error.*mcp.*" [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [4], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: 5+ MCP call failures in 10 minutes + description: MCP server appears to 
be failing — multiple call errors detected + labels: + alertname: mcp-service-errors + severity: warn + trigger_tier: t1 + + - uid: tool-loop-detected + title: Tool Loop Detected + condition: B + data: + - refId: A + relativeTimeRange: { from: 1800, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-dev-logging"} | json | level="WARN" | message=~"(?i).*tool.*loop.*|.*repeated.*tool.*" [30m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Same tool called 5+ times in session without progress + description: Possible stuck agent — repeated tool invocations without forward progress + labels: + alertname: tool-loop-detected + severity: warn + trigger_tier: t2 + + - uid: session-zero-output + title: Session Zero Output + condition: B + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-token-metrics"} | json [60m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Session completed with token entry but zero assistant messages logged + description: Session appears to have run but produced no output — possible silent failure + labels: + alertname: session-zero-output + severity: warn + trigger_tier: t1 diff --git a/observability/local/grafana/provisioning/alerting/rules-cross-stream.yml 
b/observability/local/grafana/provisioning/alerting/rules-cross-stream.yml new file mode 100644 index 0000000..3c63a78 --- /dev/null +++ b/observability/local/grafana/provisioning/alerting/rules-cross-stream.yml @@ -0,0 +1,267 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: Cross-Stream Correlation + folder: Log Sentinel + interval: 1m + rules: + + # ws-claude-coinflict: WebSocket disconnect + Claude ERROR in same 5-min window + - uid: ws-claude-coinflict + title: WebSocket + Claude Error Conflict + condition: D + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="websocket_disconnect" [5m])' + instant: true + refId: A + - refId: B + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [5m])' + instant: true + refId: B + - refId: C + datasourceUid: __expr__ + model: + type: math + expression: "$A > 0 && $B > 0" + refId: C + - refId: D + datasourceUid: __expr__ + model: + type: classic_conditions + refId: D + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [C] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: WebSocket disconnects co-occurring with Claude errors + description: WebSocket disconnect and Claude ERROR events detected in the same 5-minute window + labels: + alertname: ws-claude-coinflict + severity: warn + trigger_tier: t2 + + # session-token-abandon: Claude session ERROR + no token entry for that session + - uid: session-token-abandon + title: Session Error Without Token Entry + condition: D + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, 
uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [1h])' + instant: true + refId: A + - refId: B + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-token-metrics"} | json [1h])' + instant: true + refId: B + - refId: C + datasourceUid: __expr__ + model: + type: math + expression: "$A > 0 && $B == 0" + refId: C + - refId: D + datasourceUid: __expr__ + model: + type: classic_conditions + refId: D + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [C] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 5m + annotations: + summary: Session ERROR entries with no corresponding token metrics + description: Claude session errors present but no token/cost entry — session may have been abandoned or crashed before completion + labels: + alertname: session-token-abandon + severity: warn + trigger_tier: t2 + + # action-fail-session-fail: action_result errors + Claude session ERROR within 10 min + - uid: action-fail-session-fail + title: Action Failure + Session Failure + condition: D + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="action_result" | level="ERROR" [10m])' + instant: true + refId: A + - refId: B + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [10m])' + instant: true + refId: B + - refId: C + datasourceUid: __expr__ + model: + type: math + expression: "$A > 0 && $B > 0" + refId: C + - refId: D + datasourceUid: __expr__ + model: + type: classic_conditions + refId: 
D + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [C] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Plugin action failures coinciding with Claude session errors + description: action_result errors and Claude session errors detected within the same 10-minute window — possible cascading failure + labels: + alertname: action-fail-session-fail + severity: critical + trigger_tier: t2 + + # deploy-triple-signal: 2+ streams elevated error rate within 15 min of plugin lifecycle event + - uid: deploy-triple-signal + title: Deploy Triple Signal + condition: E + data: + - refId: A + relativeTimeRange: { from: 900, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | level="ERROR" [15m])' + instant: true + refId: A + - refId: B + relativeTimeRange: { from: 900, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [15m])' + instant: true + refId: B + - refId: C + relativeTimeRange: { from: 900, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="plugin_lifecycle" [15m])' + instant: true + refId: C + - refId: D + datasourceUid: __expr__ + model: + type: math + expression: "($A > 5) + ($B > 5) + ($C > 0)"
 + refId: D + - refId: E + datasourceUid: __expr__ + model: + type: classic_conditions + refId: E + conditions: + - evaluator: { params: [1], type: gt } + operator: { type: and } + query: { params: [D] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Multiple streams showing elevated errors within 15 min of deploy + description: Deploy triple signal — plugin lifecycle event plus 2+ streams with elevated error rates + labels: + alertname: deploy-triple-signal + severity: critical + trigger_tier: t2 + + # cost-spike-tool-flood: Tool call count spike + session cost spike in same cycle + - uid: cost-spike-tool-flood + title: Cost Spike + Tool Flood + condition: D + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [1h])' + instant: true + refId: A + - refId: B + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap tool_calls [1h])' + instant: true + refId: B + - refId: C + datasourceUid: __expr__ + model: + type: math + expression: "$A > 0.5 && $B > 50" + refId: C + - refId: D + datasourceUid: __expr__ + model: + type: classic_conditions + refId: D + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [C] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: High tool call count coinciding with elevated session cost + description: Tool use flood and cost spike occurring together — likely agentic loop with real cost impact + labels: + alertname: cost-spike-tool-flood + severity: warn + trigger_tier: t1 diff --git 
a/observability/local/grafana/provisioning/alerting/rules-infrastructure.yml b/observability/local/grafana/provisioning/alerting/rules-infrastructure.yml new file mode 100644 index 0000000..d5e471a --- /dev/null +++ b/observability/local/grafana/provisioning/alerting/rules-infrastructure.yml @@ -0,0 +1,348 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: Infrastructure & Deploy Quality + folder: Log Sentinel + interval: 1m + rules: + + - uid: bridge-start-failed + title: Bridge Start Failed + condition: B + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="plugin_lifecycle" | level="ERROR" [5m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Plugin lifecycle ERROR on bridge start + description: A plugin_lifecycle ERROR event was detected in the last 5 minutes + labels: + alertname: bridge-start-failed + severity: critical + trigger_tier: t1 + + - uid: plugin-never-ready + title: Plugin Never Ready + condition: B + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="plugin_lifecycle" | message=~".*ready.*" [60m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 1m + annotations: + summary: Plugin started but 
never reached ready state + description: Plugin lifecycle start event exists but no ready event within 60 minutes + labels: + alertname: plugin-never-ready + severity: warn + trigger_tier: t1 + + - uid: post-deploy-warn-rate + title: High WARN Rate After Deploy + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | level="WARN" [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [50], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 2m + annotations: + summary: Elevated WARN rate after deploy + description: More than 50 WARN entries in 10 minutes following a lifecycle event + labels: + alertname: post-deploy-warn-rate + severity: warn + trigger_tier: t1 + + - uid: bridge-failure-post-deploy + title: Bridge ERROR After Deploy + condition: B + data: + - refId: A + relativeTimeRange: { from: 900, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | level="ERROR" [15m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: ERROR in sim-steward within 15 min of plugin start + description: Bridge ERROR detected shortly after deploy — may indicate startup regression + labels: + alertname: bridge-failure-post-deploy + severity: critical + trigger_tier: t1 + + - uid: plugin-slow-start + title: Plugin Slow Start + condition: B + data: + 
- refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="plugin_lifecycle" | message=~".*start_duration.*" | __error__="" [5m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Plugin startup exceeded 30s threshold + description: Time from plugin_lifecycle start to ready exceeded 30 seconds + labels: + alertname: plugin-slow-start + severity: warn + trigger_tier: t1 + + - uid: error-spike-post-deploy + title: Error Spike After Deploy + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | level="ERROR" [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [5], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 2m + annotations: + summary: More than 5 ERROR entries in the 10 minutes after deploy + description: Error spike detected in 10-minute window following deploy event + labels: + alertname: error-spike-post-deploy + severity: warn + trigger_tier: t1 + + - uid: error-spike-general + title: General Error Spike + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | level="ERROR" [10m])' + instant: true + refId: 
A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [10], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 2m + annotations: + summary: More than 10 ERROR logs in 10-minute window + description: General error spike detected — not necessarily deploy-related + labels: + alertname: error-spike-general + severity: warn + trigger_tier: t1 + + - uid: ollama-unreachable + title: Ollama Unreachable + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_health" | ollama_reachable="false" [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Ollama is not reachable from log-sentinel + description: sentinel_health event recorded ollama_reachable=false + labels: + alertname: ollama-unreachable + severity: critical + trigger_tier: t1 + + - uid: loki-circuit-open + title: Loki Circuit Breaker Open + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_health" | loki_circuit_open="true" [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [A] } + 
reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Loki circuit breaker is open in log-sentinel + description: sentinel_health event recorded loki_circuit_open=true — Loki queries are failing + labels: + alertname: loki-circuit-open + severity: critical + trigger_tier: t1 + + - uid: sentinel-cycle-stalled + title: Sentinel Cycle Stalled + condition: B + data: + - refId: A + relativeTimeRange: { from: 5400, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_cycle" [90m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 5m + annotations: + summary: No sentinel_cycle event in 90 minutes + description: Log sentinel appears to be stalled — no analysis cycles have completed + labels: + alertname: sentinel-cycle-stalled + severity: critical + trigger_tier: t1 diff --git a/observability/local/grafana/provisioning/alerting/rules-iracing.yml b/observability/local/grafana/provisioning/alerting/rules-iracing.yml new file mode 100644 index 0000000..39b3ab4 --- /dev/null +++ b/observability/local/grafana/provisioning/alerting/rules-iracing.yml @@ -0,0 +1,354 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: iRacing Session Behavior + folder: Log Sentinel + interval: 1m + rules: + + - uid: session-no-actions + title: Session No Actions + condition: B + data: + - refId: A + relativeTimeRange: { from: 900, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="action_dispatched" [15m])' + instant: true + refId: A + - 
refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 15m + annotations: + summary: iRacing session active with zero action_dispatched events + description: Session has been active 15+ minutes with no user actions dispatched + labels: + alertname: session-no-actions + severity: warn + trigger_tier: t1 + + - uid: action-failure-streak + title: Action Failure Streak + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="action_result" | level="ERROR" [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [2], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: 3+ consecutive action_result errors in session + description: Multiple consecutive action failures detected — possible stuck state or feature regression + labels: + alertname: action-failure-streak + severity: critical + trigger_tier: t1 + + - uid: websocket-disconnect-spike + title: WebSocket Disconnect Spike + condition: B + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="websocket_disconnect" [5m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [2], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { 
type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: 3+ WebSocket disconnects in 5 minutes + description: Elevated WebSocket disconnect rate detected — dashboard connectivity unstable + labels: + alertname: websocket-disconnect-spike + severity: warn + trigger_tier: t1 + + - uid: incident-detection-zero + title: Incident Detection Zero + condition: B + data: + - refId: A + relativeTimeRange: { from: 1800, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="iracing_incident" [30m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 30m + annotations: + summary: iRacing session active 30+ min with zero incident events + description: Incident detection may be broken — no iracing_incident events despite active session + labels: + alertname: incident-detection-zero + severity: warn + trigger_tier: t1 + + - uid: session-no-end + title: Session No End Event + condition: B + data: + - refId: A + relativeTimeRange: { from: 14400, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="iracing_session_end" [4h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 4h + annotations: + summary: iracing_session_start with no iracing_session_end within 4 hours + description: Session end event not received — possible session 
lifecycle tracking failure + labels: + alertname: session-no-end + severity: warn + trigger_tier: t1 + + - orgId: 1 + name: Replay & Incident Investigation + folder: Log Sentinel + interval: 1m + rules: + + - uid: replay-no-seeks + title: Replay No Seeks + condition: B + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="iracing_replay_seek" [5m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 5m + annotations: + summary: Replay started but no seek events in 5 minutes + description: Replay mode active with zero iracing_replay_seek events — may indicate broken replay controls + labels: + alertname: replay-no-seeks + severity: warn + trigger_tier: t1 + + - uid: incident-detection-stall + title: Incident Detection Stall in Replay + condition: B + data: + - refId: A + relativeTimeRange: { from: 1800, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="iracing_incident" [30m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 30m + annotations: + summary: Replay session active 30+ min with zero incident events + description: No incidents detected during replay — detector may be broken in replay mode + labels: + alertname: incident-detection-stall + severity: warn + trigger_tier: t1 + + - uid: 
incident-camera-stuck + title: Incident Camera Stuck + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="iracing_incident" [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [2], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Same camera_view on 3+ consecutive incidents + description: Camera may be stuck — same camera_view repeated across multiple incident events + labels: + alertname: incident-camera-stuck + severity: info + trigger_tier: t1 + + - uid: replay-session-no-close + title: Replay Session No Close + condition: B + data: + - refId: A + relativeTimeRange: { from: 7200, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="iracing_session_end" | mode="replay" [2h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 2h + annotations: + summary: Replay session start with no session_end within 2 hours + description: Replay session lifecycle may be broken — no session end event received + labels: + alertname: replay-session-no-close + severity: warn + trigger_tier: t1 + + - uid: action-incident-gap + title: Action-Incident Gap + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + 
editorMode: code + expr: 'count_over_time({app="sim-steward"} | json | event="iracing_incident" [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 10m + annotations: + summary: Incident detected, no action_dispatched within 10 minutes + description: User may not have reviewed the incident — no action followed the incident event + labels: + alertname: action-incident-gap + severity: info + trigger_tier: t1 diff --git a/observability/local/grafana/provisioning/alerting/rules-sentinel-health.yml b/observability/local/grafana/provisioning/alerting/rules-sentinel-health.yml new file mode 100644 index 0000000..6488a25 --- /dev/null +++ b/observability/local/grafana/provisioning/alerting/rules-sentinel-health.yml @@ -0,0 +1,246 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: Sentinel Self-Health + folder: Log Sentinel + interval: 1m + rules: + + - uid: sentinel-stalled + title: Sentinel Cycle Stalled (Health) + condition: B + data: + - refId: A + relativeTimeRange: { from: 5400, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_cycle" [90m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 5m + annotations: + summary: No sentinel_cycle event in 90 minutes + description: Log sentinel appears stalled — no completed analysis cycles + labels: + alertname: sentinel-cycle-stalled-health + severity: critical + trigger_tier: t1 + + - 
uid: detector-error-rate + title: Detector Error Rate + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_analyst_run" | level="ERROR" [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [2], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: 3 or more analyst run errors in a 10-minute window + description: Multiple analysis errors detected — Ollama or Loki connectivity may be failing + labels: + alertname: detector-error-rate + severity: warn + trigger_tier: t1 + + - uid: t1-slow + title: T1 Inference Slow + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'max_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_analyst_run" | tier="t1" | unwrap duration_ms [10m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [120000], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: T1 inference duration exceeded 120 seconds + description: T1 triage is running slower than expected — GPU may be under load or model is too large + labels: + alertname: t1-slow + severity: warn + trigger_tier: t1 + + - uid: t2-slow + title: T2 Inference Slow + condition: B + data: + - refId: A + relativeTimeRange: { from: 1800, to: 0 } + datasourceUid: loki_local + model: + datasource: 
{ type: loki, uid: loki_local } + editorMode: code + expr: 'max_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_t2_investigation" | unwrap inference_duration_ms [30m])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [300000], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: T2 inference duration exceeded 300 seconds + description: T2 investigation is taking too long — deep model may be under heavy load + labels: + alertname: t2-slow + severity: warn + trigger_tier: t1 + + - uid: sentry-flood + title: Sentry Flood + condition: B + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_sentry_issue" [1h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [4], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: 5 or more Sentry-worthy findings in 1 hour + description: Sentinel is creating too many Sentry issues — possible false positive storm or real incident + labels: + alertname: sentry-flood + severity: warn + trigger_tier: t1 + + - uid: findings-flood + title: Findings Flood + condition: B + data: + - refId: A + relativeTimeRange: { from: 600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_finding" [10m])' + instant: true + refId: A + - refId: B 
+ datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [19], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: 20 or more findings in a 10-minute window + description: Finding flood detected — sentinel may be over-sensitive or a real incident is occurring + labels: + alertname: findings-flood + severity: warn + trigger_tier: t1 + + - uid: zero-findings-48h + title: Zero Findings 48h + condition: B + data: + - refId: A + relativeTimeRange: { from: 172800, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_finding" [48h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: Alerting + execErrState: Error + for: 1h + annotations: + summary: No sentinel findings at all in 48 hours + description: System may be suppressing findings or the sentinel is not running correctly + labels: + alertname: zero-findings-48h + severity: info + trigger_tier: t1 diff --git a/observability/local/grafana/provisioning/alerting/rules-token-cost.yml b/observability/local/grafana/provisioning/alerting/rules-token-cost.yml new file mode 100644 index 0000000..b2509ba --- /dev/null +++ b/observability/local/grafana/provisioning/alerting/rules-token-cost.yml @@ -0,0 +1,246 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: Token & Cost Budget + folder: Log Sentinel + interval: 1m + rules: + + - uid: session-cost-spike + title: Session Cost Spike + condition: B + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { 
type: loki, uid: loki_local } + editorMode: code + expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [1h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [1.0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Single session cost exceeded $1.00 + description: A Claude session cost more than $1.00 — review for efficiency + labels: + alertname: session-cost-spike + severity: warn + trigger_tier: t1 + + - uid: session-cost-critical + title: Session Cost Critical + condition: B + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [1h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [3.0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Single session cost exceeded $3.00 + description: Critical cost threshold exceeded — session may be in a runaway loop + labels: + alertname: session-cost-critical + severity: critical + trigger_tier: t2 + + - uid: daily-spend-warning + title: Daily Spend Warning + condition: B + data: + - refId: A + relativeTimeRange: { from: 86400, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'sum_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [24h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [10.0], type: gt } 
+ operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Rolling 24h spend exceeded $10.00 + description: Daily spend warning threshold hit — review recent session costs + labels: + alertname: daily-spend-warning + severity: warn + trigger_tier: t1 + + - uid: daily-spend-critical + title: Daily Spend Critical + condition: B + data: + - refId: A + relativeTimeRange: { from: 86400, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'sum_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [24h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [25.0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Rolling 24h spend exceeded $25.00 + description: Critical daily spend threshold hit — immediate review required + labels: + alertname: daily-spend-critical + severity: critical + trigger_tier: t2 + + - uid: tool-use-flood + title: Tool Use Flood + condition: B + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap tool_calls [1h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [100], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Tool calls per session exceeded 100 + description: Unusually high tool call count — possible agentic loop or over-tooling + labels: + alertname: tool-use-flood + 
severity: warn + trigger_tier: t1 + + - uid: unexpected-model + title: Unexpected Model Used + condition: B + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'count_over_time({app="claude-token-metrics"} | json | model!~"claude-opus-4.*|claude-sonnet-4.*|claude-haiku-4.*" [1h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0], type: gt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 0s + annotations: + summary: Unexpected model name in token metrics + description: A model field value outside the approved set was detected in claude-token-metrics + labels: + alertname: unexpected-model + severity: warn + trigger_tier: t1 + + - uid: cache-hit-rate-low + title: Cache Hit Rate Low + condition: B + data: + - refId: A + relativeTimeRange: { from: 3600, to: 0 } + datasourceUid: loki_local + model: + datasource: { type: loki, uid: loki_local } + editorMode: code + expr: 'avg_over_time({app="claude-token-metrics"} | json | unwrap cache_read_ratio [1h])' + instant: true + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: classic_conditions + refId: B + conditions: + - evaluator: { params: [0.2], type: lt } + operator: { type: and } + query: { params: [A] } + reducer: { type: last } + noDataState: OK + execErrState: Error + for: 15m + annotations: + summary: Cache hit rate below 20% over 1 hour + description: Low cache read ratio — context caching may be misconfigured or inactive + labels: + alertname: cache-hit-rate-low + severity: info + trigger_tier: t1 diff --git a/observability/local/grafana/provisioning/dashboards/claude-cache-context.json b/observability/local/grafana/provisioning/dashboards/claude-cache-context.json new file mode 100644 index 
0000000..4355693 --- /dev/null +++ b/observability/local/grafana/provisioning/dashboards/claude-cache-context.json @@ -0,0 +1,882 @@ +{ + "id": null, + "uid": "claude-cache-context", + "title": "Claude Code — Cache & Context Health", + "description": "Cache hit rates, context pressure signals, per-turn token burn, and token budget analysis for Claude Code sessions.", + "tags": [ + "claude-code", + "cache", + "context", + "observability" + ], + "timezone": "browser", + "editable": true, + "graphTooltip": 1, + "time": { + "from": "now-7d", + "to": "now" + }, + "refresh": "30s", + "schemaVersion": 39, + "fiscalYearStartMonth": 0, + "liveNow": false, + "style": "dark", + "templating": { + "list": [ + { + "name": "session_id", + "label": "Session", + "type": "query", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "query": "{app=\"claude-token-metrics\"} | json session_id", + "regex": "session_id\":\"([^\"]+)", + "refresh": 2, + "includeAll": true, + "multi": false, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "sort": 2 + }, + { + "name": "model", + "label": "Model", + "type": "query", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "query": "{app=\"claude-token-metrics\"} | json model", + "regex": "model\":\"([^\"]+)", + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "sort": 1 + }, + { + "name": "project", + "label": "Project", + "type": "query", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "query": "{app=\"claude-token-metrics\"} | json project", + "regex": "project\":\"([^\"]+)", + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "sort": 1 + }, + { + "name": "effort", + "label": "Effort", + "type": "query", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "query": "{app=\"claude-token-metrics\"} | 
json effort", + "regex": "effort\":\"([^\"]+)", + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "sort": 1 + } + ] + }, + "panels": [ + { + "type": "row", + "title": "Cache Health Summary", + "collapsed": false, + "gridPos": { "x": 0, "y": 0, "w": 24, "h": 1 } + }, + { + "id": 1, + "title": "Context from Cache", + "description": "% of all context tokens served from cache. High values (>90%) are expected — Claude Code reuses a large context window across turns. This is NOT a per-request hit/miss rate; every turn hits the cache.", + "type": "gauge", + "gridPos": { "x": 0, "y": 1, "w": 6, "h": 5 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval]))", + "legendFormat": "Cache Read", + "queryType": "range", + "hide": true + }, + { + "refId": "B", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__interval]))", + "legendFormat": "Cache Creation", + "queryType": "range", + "hide": true + }, + { + "refId": "D", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_input_tokens [$__interval]))", + "legendFormat": "Input", + 
"queryType": "range", + "hide": true + }, + { + "refId": "C", + "datasource": { "type": "__expr__", "uid": "__expr__" }, + "type": "math", + "expression": "$A / ($A + $B + $D) * 100", + "hide": false + } + ], + "options": { + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true, + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + } + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "noValue": "0", + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#F2495C" }, + { "value": 30, "color": "#FF9830" }, + { "value": 50, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 2, + "title": "Cache Read Tokens", + "type": "stat", + "gridPos": { "x": 6, "y": 1, "w": 6, "h": 5 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))", + "legendFormat": "Cache Read", + "queryType": "range" + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "textMode": "value", + "justifyMode": "center", + "orientation": "auto", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "values": false, + "calcs": ["sum"], + "fields": "" + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "unit": "short", + "decimals": 1, + "color": { "mode": "fixed", "fixedColor": "#B877D9" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "value": null, "color": "#B877D9" }] + } + }, + "overrides": [] + } + }, + { + "id": 3, + "title": "Cache Creation Tokens", + "type": "stat", + "gridPos": { "x": 12, "y": 1, "w": 6, "h": 5 }, + 
"datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))", + "legendFormat": "Cache Creation", + "queryType": "range" + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "textMode": "value", + "justifyMode": "center", + "orientation": "auto", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "values": false, + "calcs": ["sum"], + "fields": "" + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "unit": "short", + "decimals": 1, + "color": { "mode": "fixed", "fixedColor": "#FF9830" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "value": null, "color": "#FF9830" }] + } + }, + "overrides": [] + } + }, + { + "id": 4, + "title": "Cache Reuse Ratio", + "type": "stat", + "gridPos": { "x": 18, "y": 1, "w": 6, "h": 5 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__interval]))", + "legendFormat": "Creation", + "queryType": "range", + "hide": true + }, + { + "refId": "B", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval]))", + "legendFormat": 
"Read", + "queryType": "range", + "hide": true + }, + { + "refId": "C", + "datasource": { "type": "__expr__", "uid": "__expr__" }, + "type": "math", + "expression": "$B / $A", + "hide": false + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "textMode": "value", + "justifyMode": "center", + "orientation": "auto", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "decimals": 1, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#F2495C" }, + { "value": 2, "color": "#FF9830" }, + { "value": 5, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Cache Efficiency Over Time", + "collapsed": false, + "gridPos": { "x": 0, "y": 6, "w": 24, "h": 1 } + }, + { + "id": 5, + "title": "Context from Cache Trend", + "type": "timeseries", + "gridPos": { "x": 0, "y": 7, "w": 24, "h": 8 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval])) / (sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval])) + sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap 
total_cache_creation_tokens [$__interval])) + sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_input_tokens [$__interval]))) * 100", + "legendFormat": "Context from Cache %", + "queryType": "range" + } + ], + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "noValue": "0", + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "gradientMode": "opacity", + "lineWidth": 2, + "fillOpacity": 18, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": 3600000, + "axisLabel": "Hit Rate %", + "thresholdsStyle": { + "mode": "line" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#F2495C" }, + { "value": 50, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Cache by Model & Effort", + "collapsed": false, + "gridPos": { "x": 0, "y": 15, "w": 24, "h": 1 } + }, + { + "id": 6, + "title": "Context from Cache by Model", + "type": "barchart", + "gridPos": { "x": 0, "y": 16, "w": 12, "h": 8 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range])) / (sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range])) + sum 
by (model) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__range])) + sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__range]))) * 100", + "legendFormat": "{{model}}", + "queryType": "instant" + } + ], + "options": { + "orientation": "horizontal", + "showValue": "always", + "barWidth": 0.8, + "groupWidth": 0.7, + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single" } + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "decimals": 1, + "min": 0, + "max": 100, + "noValue": "0", + "color": { "mode": "palette-classic" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#F2495C" }, + { "value": 30, "color": "#FF9830" }, + { "value": 50, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 7, + "title": "Context from Cache by Effort", + "type": "barchart", + "gridPos": { "x": 12, "y": 16, "w": 12, "h": 8 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum by (effort) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range])) / (sum by (effort) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range])) + sum by (effort) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | 
model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__range])) + sum by (effort) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__range]))) * 100", + "legendFormat": "{{effort}}", + "queryType": "instant" + } + ], + "options": { + "orientation": "horizontal", + "showValue": "always", + "barWidth": 0.8, + "groupWidth": 0.7, + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single" } + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "decimals": 1, + "min": 0, + "max": 100, + "noValue": "0", + "color": { "mode": "palette-classic" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#F2495C" }, + { "value": 30, "color": "#FF9830" }, + { "value": 50, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Context Pressure", + "collapsed": false, + "gridPos": { "x": 0, "y": 24, "w": 24, "h": 1 } + }, + { + "id": 8, + "title": "Compactions", + "type": "stat", + "gridPos": { "x": 0, "y": 25, "w": 8, "h": 7 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json hook_type, model, project, effort | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | hook_type=\"pre-compact\" [$__interval]))", + "legendFormat": "Compactions", + "queryType": "range" + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "textMode": "value", + "justifyMode": "center", + "orientation": "auto", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "values": false, + "calcs": ["sum"], + "fields": "" + } + }, + "fieldConfig": { + "defaults": { 
+ "noValue": "0", + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" }, + { "value": 5, "color": "#F2495C" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 9, + "title": "Compaction Rate", + "type": "timeseries", + "gridPos": { "x": 8, "y": 25, "w": 8, "h": 7 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json hook_type, model, project, effort | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | hook_type=\"pre-compact\" [$__interval]))", + "legendFormat": "Compactions", + "queryType": "range" + } + ], + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single" } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "decimals": 0, + "color": { "mode": "fixed", "fixedColor": "#FF9830" }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "gradientMode": "opacity", + "fillOpacity": 25, + "lineWidth": 2, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": 3600000, + "axisLabel": "Compactions" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" }, + { "value": 5, "color": "#F2495C" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 10, + "title": "Avg Turns Before Compaction", + "type": "stat", + "gridPos": { "x": 16, "y": 25, "w": 8, "h": 7 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json project | project=~\"$project\" [$__range]))", + "legendFormat": "Total Turns", + "queryType": "range", + "hide": true + }, + { + "refId": "B", + "datasource": { 
"type": "loki", "uid": "loki_local" }, + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json hook_type, project | project=~\"$project\" | hook_type=\"pre-compact\" [$__range]))", + "legendFormat": "Total Compactions", + "queryType": "range", + "hide": true + }, + { + "refId": "C", + "datasource": { "type": "__expr__", "uid": "__expr__" }, + "type": "math", + "expression": "$A / $B", + "hide": false + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "textMode": "value", + "justifyMode": "center", + "orientation": "auto", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "decimals": 1, + "color": { "mode": "fixed", "fixedColor": "#8AB8FF" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "value": null, "color": "#8AB8FF" }] + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Per-Turn Token Burn", + "collapsed": false, + "gridPos": { "x": 0, "y": 32, "w": 24, "h": 1 } + }, + { + "id": 11, + "title": "Per-Turn Token Flow (excl. Cache Read)", + "description": "Tokens spent per turn: input, output, and cache creation. 
Cache Read is excluded — it dominates the scale and is shown separately in Cache Trend above.", + "type": "timeseries", + "gridPos": { "x": 0, "y": 33, "w": 12, "h": 8 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json model, project, effort, turn_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap turn_input_tokens [$__interval]))", + "legendFormat": "Input Tokens", + "queryType": "range" + }, + { + "refId": "B", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json model, project, effort, turn_output_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap turn_output_tokens [$__interval]))", + "legendFormat": "Output Tokens", + "queryType": "range" + }, + { + "refId": "D", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json model, project, effort, turn_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap turn_cache_creation_tokens [$__interval]))", + "legendFormat": "Cache Creation Tokens", + "queryType": "range" + } + ], + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "unit": "short", + "decimals": 1, + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "gradientMode": "opacity", + "lineWidth": 2, + "fillOpacity": 18, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": 3600000, + "stacking": { "mode": "none" }, + "axisLabel": "Tokens", + "scaleDistribution": { "type": "log", "log": 2 } + } + }, 
+ "overrides": [ + { + "matcher": { "id": "byName", "options": "Cache Creation Tokens" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "#FF9830" } } + ] + } + ] + } + }, + { + "id": 12, + "title": "Turn-by-Turn Output Burst", + "type": "timeseries", + "gridPos": { "x": 12, "y": 33, "w": 12, "h": 8 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json model, project, effort, turn_output_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap turn_output_tokens [$__interval]))", + "legendFormat": "Output per Turn", + "queryType": "range" + } + ], + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single" } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "decimals": 0, + "color": { "mode": "fixed", "fixedColor": "#FF6D00" }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "gradientMode": "opacity", + "lineWidth": 2, + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": 3600000, + "axisLabel": "Output Tokens" + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Token Budget", + "collapsed": false, + "gridPos": { "x": 0, "y": 41, "w": 24, "h": 1 } + }, + { + "id": 13, + "title": "Token Type Distribution", + "type": "piechart", + "gridPos": { "x": 0, "y": 42, "w": 12, "h": 8 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__interval]))", + "legendFormat": "Input", + "queryType": "range" + }, + { + 
"refId": "B", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_output_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__interval]))", + "legendFormat": "Output", + "queryType": "range" + }, + { + "refId": "C", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))", + "legendFormat": "Cache Read", + "queryType": "range" + }, + { + "refId": "D", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))", + "legendFormat": "Cache Creation", + "queryType": "range" + } + ], + "options": { + "pieType": "donut", + "displayLabels": ["name", "percent"], + "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }, + "tooltip": { "mode": "single" }, + "reduceOptions": { + "values": false, + "calcs": ["sum"], + "fields": "" + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "unit": "short", + "decimals": 1, + "color": { "mode": "palette-classic" } + }, + "overrides": [] + } + }, + { + "id": 14, + "title": "Output Efficiency", + "type": "gauge", + "description": "Output tokens per total context token consumed. 
Higher = more output generated per unit of input.", + "gridPos": { "x": 12, "y": 42, "w": 12, "h": 7 }, + "datasource": { "type": "loki", "uid": "loki_local" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_output_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__interval]))", + "legendFormat": "Output", + "queryType": "range", + "hide": true + }, + { + "refId": "B", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__interval]))", + "legendFormat": "Input", + "queryType": "range", + "hide": true + }, + { + "refId": "D", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))", + "legendFormat": "Cache Creation", + "queryType": "range", + "hide": true + }, + { + "refId": "E", + "datasource": { "type": "loki", "uid": "loki_local" }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))", + "legendFormat": "Cache Read", + "queryType": "range", + "hide": true + }, + { + "refId": "C", + "datasource": { "type": "__expr__", "uid": "__expr__" }, + "type": "math", + "expression": "$A / ($B + $D + $E)", + "hide": false + } + ], + "options": { + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true, + "reduceOptions": { + "values": 
false, + "calcs": ["lastNotNull"], + "fields": "" + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "unit": "percentunit", + "min": 0, + "max": 0.05, + "decimals": 2, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#F2495C" }, + { "value": 0.002, "color": "#FF9830" }, + { "value": 0.01, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + } + ] +} diff --git a/observability/local/grafana/provisioning/dashboards/claude-code-overview.json b/observability/local/grafana/provisioning/dashboards/claude-code-overview.json index de6c222..a75af81 100644 --- a/observability/local/grafana/provisioning/dashboards/claude-code-overview.json +++ b/observability/local/grafana/provisioning/dashboards/claude-code-overview.json @@ -73,6 +73,69 @@ "h": 1 } }, + { + "id": 20, + "title": "Session Cost", + "type": "stat", + "transparent": true, + "gridPos": { + "x": 20, + "y": 1, + "w": 4, + "h": 4 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json session_id | session_id=~\"$session_id\" | unwrap cost_usd [$__interval]))", + "legendFormat": "Cost", + "queryType": "range" + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "textMode": "value", + "justifyMode": "center", + "orientation": "auto", + "text": { + "titleSize": 12, + "valueSize": 36 + }, + "reduceOptions": { + "values": false, + "calcs": ["sum"], + "fields": "" + } + }, + "fieldConfig": { + "defaults": { + "noValue": "$0.00", + "unit": "currencyUSD", + "decimals": 4, + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" }, + { "value": 5, "color": "#FF9830" }, + { "value": 20, "color": "#F2495C" } + ] + } + }, + "overrides": [] + } + }, { "id": 1, 
"title": "Tool Calls", @@ -81,7 +144,7 @@ "gridPos": { "x": 0, "y": 1, - "w": 5, + "w": 4, "h": 4 }, "datasource": { @@ -95,14 +158,14 @@ "type": "loki", "uid": "loki_local" }, - "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range]))", + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__interval]))", "legendFormat": "Tool Calls", "queryType": "range" } ], "options": { "colorMode": "background-gradient", - "graphMode": "none", + "graphMode": "area", "textMode": "value", "justifyMode": "center", "orientation": "auto", @@ -144,9 +207,9 @@ "type": "stat", "transparent": true, "gridPos": { - "x": 5, + "x": 4, "y": 1, - "w": 5, + "w": 4, "h": 4 }, "datasource": { @@ -160,14 +223,14 @@ "type": "loki", "uid": "loki_local" }, - "expr": "sum(count_over_time({app=\"claude-dev-logging\", level=\"ERROR\"} | json session_id | session_id=~\"$session_id\" [$__range]))", + "expr": "sum(count_over_time({app=\"claude-dev-logging\", level=\"ERROR\"} | json session_id | session_id=~\"$session_id\" [$__interval]))", "legendFormat": "Errors", "queryType": "range" } ], "options": { "colorMode": "background-gradient", - "graphMode": "none", + "graphMode": "area", "textMode": "value", "justifyMode": "center", "orientation": "auto", @@ -216,9 +279,9 @@ "type": "stat", "transparent": true, "gridPos": { - "x": 10, + "x": 8, "y": 1, - "w": 5, + "w": 4, "h": 4 }, "datasource": { @@ -232,14 +295,14 @@ "type": "loki", "uid": "loki_local" }, - "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"agent\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"subagent-start\" [$__range]))", + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"agent\"} | json hook_type, session_id | 
session_id=~\"$session_id\" | hook_type=\"subagent-start\" [$__interval]))", "legendFormat": "Agents", "queryType": "range" } ], "options": { "colorMode": "background-gradient", - "graphMode": "none", + "graphMode": "area", "textMode": "value", "justifyMode": "center", "orientation": "auto", @@ -281,9 +344,9 @@ "type": "stat", "transparent": true, "gridPos": { - "x": 15, + "x": 12, "y": 1, - "w": 5, + "w": 4, "h": 4 }, "datasource": { @@ -297,14 +360,14 @@ "type": "loki", "uid": "loki_local" }, - "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"user\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"user-prompt-submit\" [$__range]))", + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"user\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"user-prompt-submit\" [$__interval]))", "legendFormat": "Prompts", "queryType": "range" } ], "options": { "colorMode": "background-gradient", - "graphMode": "none", + "graphMode": "area", "textMode": "value", "justifyMode": "center", "orientation": "auto", @@ -346,7 +409,7 @@ "type": "stat", "transparent": true, "gridPos": { - "x": 20, + "x": 16, "y": 1, "w": 4, "h": 4 @@ -362,14 +425,14 @@ "type": "loki", "uid": "loki_local" }, - "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"user\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"permission-request\" [$__range]))", + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"user\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"permission-request\" [$__interval]))", "legendFormat": "Permissions", "queryType": "range" } ], "options": { "colorMode": "background-gradient", - "graphMode": "none", + "graphMode": "area", "textMode": "value", "justifyMode": "center", "orientation": "auto", @@ -515,7 +578,7 @@ "type": "loki", "uid": "loki_local" }, - "expr": "sum by (component) 
(count_over_time({app=\"claude-dev-logging\"} | json session_id | session_id=~\"$session_id\" [$__range]))", + "expr": "sum by (component) (count_over_time({app=\"claude-dev-logging\"} | json session_id | session_id=~\"$session_id\" [$__interval]))", "legendFormat": "{{component}}", "queryType": "range" } @@ -597,6 +660,21 @@ } ] }, + { + "matcher": { + "id": "byName", + "options": "tokens" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FF6D00", + "mode": "fixed" + } + } + ] + }, { "matcher": { "id": "byName", @@ -757,7 +835,7 @@ { "id": 9, "title": "Top Tools Used", - "description": "% share of each tool across all post-tool-use events.", + "description": "Tool call counts across all post-tool-use events. mcp__ prefix stripped for readability.", "type": "table", "transparent": true, "gridPos": { @@ -777,8 +855,8 @@ "type": "loki", "uid": "loki_local" }, - "expr": "sum by (tool_name) (count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, tool_name, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name != \"\" [$__range])) / ignoring(tool_name) group_left() sum(count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, tool_name, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name != \"\" [$__range])) * 100", - "legendFormat": "{{tool_name}}", + "expr": "sum by (short_name) (count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, tool_name, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name != \"\" | line_format \"{{.tool_name}}\" | regexp \"(?:mcp__)?(?P.*)\" [$__range]))", + "legendFormat": "{{short_name}}", "queryType": "range" } ], @@ -824,8 +902,8 @@ }, "fieldConfig": { "defaults": { - "unit": "percent", - "decimals": 1, + "unit": "short", + "decimals": 0, "custom": { "inspect": false, "width": 0 @@ -843,22 +921,12 @@ "properties": [ { "id": 
"custom.width", - "value": 200 - }, - { - "id": "max", - "value": 100 - }, - { - "id": "min", - "value": 0 + "value": 90 }, { "id": "custom.cellOptions", "value": { - "type": "gauge", - "mode": "basic", - "valueDisplayMode": "color" + "type": "auto" } } ] @@ -871,7 +939,7 @@ "properties": [ { "id": "custom.width", - "value": 170 + "value": 200 } ] } diff --git a/observability/local/grafana/provisioning/dashboards/claude-token-cost.json b/observability/local/grafana/provisioning/dashboards/claude-token-cost.json new file mode 100644 index 0000000..d03f5fd --- /dev/null +++ b/observability/local/grafana/provisioning/dashboards/claude-token-cost.json @@ -0,0 +1,1540 @@ +{ + "id": null, + "uid": "claude-token-cost", + "title": "Claude Code — Token & Cost Intelligence", + "description": "Token usage, cost tracking, cache economics, and model efficiency for Claude Code sessions.", + "tags": [ + "claude-code", + "tokens", + "cost", + "observability" + ], + "timezone": "browser", + "editable": true, + "graphTooltip": 1, + "time": { + "from": "now-7d", + "to": "now" + }, + "refresh": "30s", + "schemaVersion": 39, + "fiscalYearStartMonth": 0, + "liveNow": false, + "style": "dark", + "templating": { + "list": [ + { + "name": "model", + "label": "Model", + "type": "query", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "query": "{app=\"claude-token-metrics\"} | json", + "regex": "\"model\":\"([^\"]+)\"", + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "sort": 1 + }, + { + "name": "project", + "label": "Project", + "type": "query", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "query": "{app=\"claude-token-metrics\"} | json", + "regex": "\"project\":\"([^\"]+)\"", + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "sort": 1 + }, + { + "name": "effort", + "label": "Effort", + 
"type": "query", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "query": "{app=\"claude-token-metrics\"} | json", + "regex": "\"effort\":\"([^\"]+)\"", + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "sort": 1 + } + ] + }, + "panels": [ + { + "type": "row", + "title": "Spend Summary", + "collapsed": false, + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 1 + } + }, + { + "id": 1, + "title": "Total Spend", + "type": "stat", + "gridPos": { + "x": 0, + "y": 1, + "w": 6, + "h": 5 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__interval]))", + "legendFormat": "Total Spend", + "queryType": "range" + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + } + }, + "fieldConfig": { + "defaults": { + "noValue": "$0.00", + "unit": "currencyUSD", + "decimals": 4, + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" }, + { "value": 5, "color": "#FF9830" }, + { "value": 20, "color": "#F2495C" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 2, + "title": "Sessions", + "type": "stat", + "gridPos": { + "x": 6, + "y": 1, + "w": 6, + "h": 5 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json | 
hook_type=\"session-end\" | project=~\"$project\" [$__interval]))", + "legendFormat": "Sessions", + "queryType": "range" + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "decimals": 0, + "color": { + "mode": "fixed", + "fixedColor": "#5794F2" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#5794F2" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 3, + "title": "Avg Cost / Session", + "type": "stat", + "gridPos": { + "x": 12, + "y": 1, + "w": 6, + "h": 5 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__range]))", + "legendFormat": "Total Spend", + "queryType": "range", + "hide": true + }, + { + "refId": "B", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json | hook_type=\"session-end\" | project=~\"$project\" [$__range]))", + "legendFormat": "Sessions", + "queryType": "range", + "hide": true + }, + { + "refId": "C", + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "type": "math", + "expression": "$A / $B", + "hide": false + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "fieldConfig": { + 
"defaults": { + "noValue": "$0.00", + "unit": "currencyUSD", + "decimals": 4, + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" }, + { "value": 5, "color": "#FF9830" }, + { "value": 20, "color": "#F2495C" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 4, + "title": "Projected Monthly", + "type": "stat", + "gridPos": { + "x": 18, + "y": 1, + "w": 6, + "h": 5 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(rate({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__range])) * 2592000", + "legendFormat": "Projected Monthly", + "queryType": "range" + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "fieldConfig": { + "defaults": { + "noValue": "$0.00", + "unit": "currencyUSD", + "decimals": 2, + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" }, + { "value": 100, "color": "#FF9830" }, + { "value": 300, "color": "#F2495C" } + ] + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Spend Trend", + "collapsed": false, + "gridPos": { + "x": 0, + "y": 6, + "w": 24, + "h": 1 + } + }, + { + "id": 5, + "title": "Cost Over Time", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 7, + "w": 24, + "h": 8 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json | 
model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__interval]))", + "legendFormat": "{{model}}", + "queryType": "range" + } + ], + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "currencyUSD", + "decimals": 4, + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 18, + "gradientMode": "opacity", + "lineWidth": 2, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": 3600000, + "stacking": { + "mode": "normal", + "group": "A" + }, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "USD", + "scaleDistribution": { + "type": "linear" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Where Money Goes", + "collapsed": false, + "gridPos": { + "x": 0, + "y": 15, + "w": 24, + "h": 1 + } + }, + { + "id": 6, + "title": "Cost by Model", + "type": "piechart", + "gridPos": { + "x": 0, + "y": 16, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__interval]))", + "legendFormat": "{{model}}", + "queryType": "range" + } + ], + "options": { + "pieType": "donut", + "displayLabels": ["name", "percent"], + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + }, + "legend": { + "displayMode": "list", + "placement": "right", + "values": ["value", "percent"] + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": 
"currencyUSD", + "decimals": 4, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 7, + "title": "Cost by Project", + "type": "piechart", + "gridPos": { + "x": 8, + "y": 16, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum by (project) (sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__interval]))", + "legendFormat": "{{project}}", + "queryType": "range" + } + ], + "options": { + "pieType": "donut", + "displayLabels": ["name", "percent"], + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + }, + "legend": { + "displayMode": "list", + "placement": "right", + "values": ["value", "percent"] + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "currencyUSD", + "decimals": 4, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 8, + "title": "Sessions by Effort", + "type": "piechart", + "gridPos": { + "x": 16, + "y": 16, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum by (effort) (count_over_time({app=\"claude-token-metrics\"} | json model, project, effort, cost_usd | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | cost_usd != \"\" [$__interval]))", + "legendFormat": "{{effort}}", + "queryType": "range" + } + ], + "options": { + "pieType": "donut", + "displayLabels": ["name", "percent"], + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + }, + "legend": { + "displayMode": "list", + "placement": "right", + "values": ["value", "percent"] + }, + "tooltip": { + "mode": "single", + "sort": 
"none" + } + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Cache Economics", + "collapsed": false, + "gridPos": { + "x": 0, + "y": 24, + "w": 24, + "h": 1 + } + }, + { + "id": 9, + "title": "Context from Cache", + "description": "% of all context tokens served from cache. High values are expected — Claude Code reuses a large context window. Every turn hits the cache; this measures token efficiency, not request hit rate.", + "type": "gauge", + "gridPos": { + "x": 0, + "y": 25, + "w": 8, + "h": 7 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range]))", + "legendFormat": "Cache Read", + "queryType": "range", + "hide": true + }, + { + "refId": "B", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__range]))", + "legendFormat": "Fresh Input", + "queryType": "range", + "hide": true + }, + { + "refId": "D", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__range]))", + "legendFormat": "Cache Creation", + "queryType": "range", + "hide": true + }, + { + "refId": "C", + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "type": "math", + "expression": "$A / ($A + $B + $D) * 100", + "hide": false + } + ], + "options": { + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true, + 
"reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#F2495C" }, + { "value": 30, "color": "#FF9830" }, + { "value": 50, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 10, + "title": "Total Cache Read Tokens", + "type": "stat", + "gridPos": { + "x": 8, + "y": 25, + "w": 8, + "h": 7 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))", + "legendFormat": "Cache Read Tokens", + "queryType": "range" + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "unit": "short", + "decimals": 1, + "color": { + "mode": "fixed", + "fixedColor": "#73BF69" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 11, + "title": "Cache Trend", + "description": "Cache Read vs Cache Creation tokens per interval. 
Cache Creation (right axis) is orders of magnitude smaller — dual axes show both trends clearly.", + "type": "timeseries", + "gridPos": { + "x": 16, + "y": 25, + "w": 8, + "h": 7 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))", + "legendFormat": "Cache Read", + "queryType": "range" + }, + { + "refId": "B", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))", + "legendFormat": "Cache Creation", + "queryType": "range" + } + ], + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 20, + "lineWidth": 2, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "mode": "none", + "group": "A" + }, + "axisCenteredZero": false, + "axisColorMode": "text", + "scaleDistribution": { + "type": "linear" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Cache Creation" }, + "properties": [ + { "id": "custom.axisPlacement", "value": "right" }, + { "id": "color", "value": { "mode": "fixed", "fixedColor": "#FF9830" } } + ] + } + ] + } + }, + { + "type": "row", + "title": "Token Flow", + "collapsed": false, + "gridPos": { + "x": 0, + "y": 32, + "w": 24, + "h": 1 + } + }, + 
{ + "id": 12, + "title": "Token Type Distribution", + "type": "piechart", + "gridPos": { + "x": 0, + "y": 33, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__interval]))", + "legendFormat": "Input", + "queryType": "range" + }, + { + "refId": "B", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__interval]))", + "legendFormat": "Output", + "queryType": "range" + }, + { + "refId": "C", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))", + "legendFormat": "Cache Read", + "queryType": "range" + }, + { + "refId": "D", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))", + "legendFormat": "Cache Creation", + "queryType": "range" + } + ], + "options": { + "pieType": "donut", + "displayLabels": ["name", "percent"], + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + }, + "legend": { + "displayMode": "list", + "placement": "right", + "values": ["value", "percent"] + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "decimals": 1, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 13, + 
"title": "Total Output Tokens", + "description": "Total output tokens generated across all sessions in the selected time range.", + "type": "stat", + "gridPos": { + "x": 8, + "y": 33, + "w": 5, + "h": 7 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__interval]))", + "legendFormat": "Output Tokens", + "queryType": "range" + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "unit": "short", + "decimals": 1, + "color": { + "mode": "fixed", + "fixedColor": "#B877D9" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#B877D9" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 14, + "title": "Output Tokens / Dollar", + "description": "Output tokens generated per dollar spent. 
Higher = more efficient.", + "type": "stat", + "gridPos": { + "x": 13, + "y": 33, + "w": 5, + "h": 7 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__range]))", + "legendFormat": "Output Tokens", + "queryType": "range", + "hide": true + }, + { + "refId": "B", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__range]))", + "legendFormat": "Cost", + "queryType": "range", + "hide": true + }, + { + "refId": "C", + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "type": "math", + "expression": "$A / $B", + "hide": false + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "decimals": 0, + "unit": "locale", + "color": { + "mode": "fixed", + "fixedColor": "#73BF69" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 15, + "title": "Avg Turns / Session", + "type": "stat", + "gridPos": { + "x": 18, + "y": 33, + "w": 6, + "h": 7 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | 
unwrap assistant_turns [$__range]))", + "legendFormat": "Total Turns", + "queryType": "range", + "hide": true + }, + { + "refId": "B", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json | hook_type=\"session-end\" | project=~\"$project\" [$__range]))", + "legendFormat": "Sessions", + "queryType": "range", + "hide": true + }, + { + "refId": "C", + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "type": "math", + "expression": "$A / $B", + "hide": false + } + ], + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "decimals": 1, + "color": { + "mode": "fixed", + "fixedColor": "#5794F2" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#5794F2" } + ] + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Model Economics", + "collapsed": false, + "gridPos": { + "x": 0, + "y": 40, + "w": 24, + "h": 1 + } + }, + { + "id": 16, + "title": "Avg Cost per Session / Model", + "type": "barchart", + "gridPos": { + "x": 0, + "y": 41, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "avg by (model) (avg_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__range]))", + "legendFormat": "{{model}}", + "queryType": "instant" + } + ], + "options": { + "orientation": "horizontal", + "barWidth": 0.8, + "groupWidth": 0.7, + "showValue": "always", + "stacking": "none", + "legend": { + "displayMode": "list", 
+ "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0 + }, + "fieldConfig": { + "defaults": { + "unit": "currencyUSD", + "decimals": 2, + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "id": 17, + "title": "Avg Output Tokens per Model", + "type": "barchart", + "gridPos": { + "x": 12, + "y": 41, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "avg by (model) (avg_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__range]))", + "legendFormat": "{{model}}", + "queryType": "instant" + } + ], + "options": { + "orientation": "horizontal", + "barWidth": 0.8, + "groupWidth": 0.7, + "showValue": "always", + "stacking": "none", + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0 + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "decimals": 1, + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" } + ] + } + }, + "overrides": [] + } + }, + { + "type": "row", + "title": "Session Economics", + "collapsed": false, + "gridPos": { + "x": 0, + "y": 49, + "w": 24, + "h": 1 + } + }, + { + "id": 18, + "title": "Session Breakdown", + "type": "table", + "gridPos": { + "x": 0, + "y": 50, + "w": 24, + "h": 10 + }, + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "loki", + "uid": "loki_local" + }, + "expr": "{app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | 
effort=~\"$effort\"", + "legendFormat": "", + "queryType": "range" + } + ], + "transformations": [ + { + "id": "extractFields", + "options": { + "source": "Line", + "format": "json", + "replace": false, + "keepTime": true + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "session_id": { + "aggregations": [], + "operation": "groupby" + }, + "model": { + "aggregations": ["lastNotNull"], + "operation": "aggregate" + }, + "effort": { + "aggregations": ["lastNotNull"], + "operation": "aggregate" + }, + "cost_usd": { + "aggregations": ["sum"], + "operation": "aggregate" + }, + "total_input_tokens": { + "aggregations": ["sum"], + "operation": "aggregate" + }, + "total_output_tokens": { + "aggregations": ["sum"], + "operation": "aggregate" + }, + "total_cache_read_tokens": { + "aggregations": ["sum"], + "operation": "aggregate" + }, + "assistant_turns": { + "aggregations": ["sum"], + "operation": "aggregate" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Line": true, + "id": true, + "tsNs": true, + "labels": true, + "Time": true + }, + "indexByName": { + "session_id": 0, + "model (lastNotNull)": 1, + "effort (lastNotNull)": 2, + "cost_usd (sum)": 3, + "total_input_tokens (sum)": 4, + "total_output_tokens (sum)": 5, + "total_cache_read_tokens (sum)": 6, + "assistant_turns (sum)": 7 + }, + "renameByName": { + "session_id": "Session", + "model (lastNotNull)": "Model", + "effort (lastNotNull)": "Effort", + "cost_usd (sum)": "Cost (USD)", + "total_input_tokens (sum)": "Input Tokens", + "total_output_tokens (sum)": "Output Tokens", + "total_cache_read_tokens (sum)": "Cache Read", + "assistant_turns (sum)": "Turns" + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Cost (USD)", + "desc": true + } + ] + } + } + ], + "options": { + "showHeader": true, + "cellHeight": "sm", + "footer": { + "show": true, + "reducer": ["sum"], + "countRows": false, + "fields": ["Cost (USD)", "Input Tokens", 
"Output Tokens", "Cache Read"] + }, + "sortBy": [ + { + "displayName": "Cost (USD)", + "desc": true + } + ] + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Cost (USD)" + }, + "properties": [ + { + "id": "unit", + "value": "currencyUSD" + }, + { + "id": "decimals", + "value": 4 + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-background", + "mode": "gradient" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "value": null, "color": "#73BF69" }, + { "value": 1, "color": "#FF9830" }, + { "value": 5, "color": "#F2495C" } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Input Tokens" + }, + "properties": [ + { + "id": "unit", + "value": "locale" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Output Tokens" + }, + "properties": [ + { + "id": "unit", + "value": "locale" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache Read" + }, + "properties": [ + { + "id": "unit", + "value": "locale" + } + ] + } + ] + } + } + ] +} diff --git a/observability/local/grafana/provisioning/dashboards/claude-token-usage.json b/observability/local/grafana/provisioning/dashboards/claude-token-usage.json deleted file mode 100644 index fe8f1ad..0000000 --- a/observability/local/grafana/provisioning/dashboards/claude-token-usage.json +++ /dev/null @@ -1,893 +0,0 @@ -{ - "id": null, - "uid": "claude-token-usage", - "title": "Claude Code — Token Usage", - "description": "Token consumption, estimated cost, cache efficiency, and session trends across Claude Code sessions.", - "tags": ["claude-code", "tokens", "cost", "observability"], - "timezone": "browser", - "editable": true, - "graphTooltip": 1, - "time": { "from": "now-7d", "to": "now" }, - "refresh": "1m", - "schemaVersion": 
39, - "fiscalYearStartMonth": 0, - "liveNow": false, - "style": "dark", - "templating": { - "list": [ - { - "name": "model", - "label": "Model", - "type": "query", - "datasource": { "type": "loki", "uid": "loki_local" }, - "query": "{app=\"claude-token-metrics\"} | json", - "regex": "\"model\":\"([^\"]+)\"", - "refresh": 2, - "includeAll": true, - "multi": true, - "allValue": ".*", - "current": { "text": "All", "value": "$__all" }, - "sort": 1 - }, - { - "name": "project", - "label": "Project", - "type": "query", - "datasource": { "type": "loki", "uid": "loki_local" }, - "query": "{app=\"claude-token-metrics\"} | json", - "regex": "\"project\":\"([^\"]+)\"", - "refresh": 2, - "includeAll": true, - "multi": true, - "allValue": ".*", - "current": { "text": "All", "value": "$__all" }, - "sort": 1 - }, - { - "name": "effort", - "label": "Effort", - "type": "query", - "datasource": { "type": "loki", "uid": "loki_local" }, - "query": "{app=\"claude-token-metrics\"} | json", - "regex": "\"effort\":\"([^\"]+)\"", - "refresh": 2, - "includeAll": true, - "multi": true, - "allValue": ".*", - "current": { "text": "All", "value": "$__all" }, - "sort": 1 - }, - { - "name": "session_id", - "label": "Session", - "type": "query", - "datasource": { "type": "loki", "uid": "loki_local" }, - "query": "{app=\"claude-token-metrics\"} | json", - "regex": "\"session_id\":\"([^\"]+)\"", - "refresh": 2, - "includeAll": true, - "multi": false, - "allValue": ".*", - "current": { "text": "All", "value": "$__all" }, - "sort": 0 - } - ] - }, - "panels": [ - { - "type": "row", - "title": "Cost Summary", - "collapsed": false, - "gridPos": { "x": 0, "y": 0, "w": 24, "h": 1 } - }, - { - "id": 1, - "title": "Output Tokens", - "description": "Total output (generated) tokens in the selected time range.", - "type": "stat", - "transparent": true, - "gridPos": { "x": 0, "y": 1, "w": 5, "h": 5 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { 
"type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_output_tokens [$__range]))", - "queryType": "range" - } - ], - "options": { - "colorMode": "background-gradient", - "graphMode": "area", - "textMode": "auto", - "wideLayout": true, - "justifyMode": "auto", - "orientation": "auto", - "text": { "titleSize": 12, "valueSize": 32 }, - "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "fixed", "fixedColor": "#5794F2" }, - "unit": "short", - "decimals": 0, - "thresholds": { "mode": "absolute", "steps": [{ "value": null, "color": "#5794F2" }] } - }, - "overrides": [] - } - }, - { - "id": 2, - "title": "Est. Cost (USD)", - "description": "Estimated total spend based on Anthropic public pricing. Cache reads are priced at 10% of input rate.", - "type": "stat", - "transparent": true, - "gridPos": { "x": 5, "y": 1, "w": 5, "h": 5 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [$__range]))", - "queryType": "range" - } - ], - "options": { - "colorMode": "background-gradient", - "graphMode": "area", - "textMode": "auto", - "wideLayout": true, - "justifyMode": "auto", - "orientation": "auto", - "text": { "titleSize": 12, "valueSize": 32 }, - "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "unit": "currencyUSD", - "decimals": 2, - "thresholds": { - "mode": "absolute", - "steps": [ - { "value": null, "color": "#73BF69" }, - { "value": 5, "color": "#FADE2A" }, - { "value": 20, 
"color": "#FF9830" }, - { "value": 50, "color": "#F2495C" } - ] - } - }, - "overrides": [] - } - }, - { - "id": 3, - "title": "Cache Hit Rate", - "description": "Fraction of read tokens served from cache (cache_read / (input + cache_creation + cache_read)). Higher is better — reduces cost and latency.", - "type": "stat", - "transparent": true, - "gridPos": { "x": 10, "y": 1, "w": 5, "h": 5 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__range]))", - "queryType": "range", - "hide": true - }, - { - "refId": "B", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_input_tokens [$__range]))", - "queryType": "range", - "hide": true - }, - { - "refId": "C", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__range]))", - "queryType": "range", - "hide": true - }, - { - "refId": "D", - "datasource": { "type": "__expr__", "uid": "__expr__" }, - "type": "math", - "expression": "($A / ($A + $B + $C)) * 100", - "hide": false - } - ], - "options": { - "colorMode": "background-gradient", - "graphMode": "none", - "textMode": "auto", - "wideLayout": true, - "justifyMode": "auto", - "orientation": "auto", - "text": { "titleSize": 12, "valueSize": 32 }, - "reduceOptions": { "values": false, "calcs": ["lastNotNull"], "fields": "" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "unit": 
"percent", - "decimals": 1, - "min": 0, - "max": 100, - "thresholds": { - "mode": "absolute", - "steps": [ - { "value": null, "color": "#F2495C" }, - { "value": 40, "color": "#FF9830" }, - { "value": 70, "color": "#73BF69" } - ] - } - }, - "overrides": [] - } - }, - { - "id": 4, - "title": "Sessions", - "description": "Number of completed Claude Code sessions in the selected time range.", - "type": "stat", - "transparent": true, - "gridPos": { "x": 15, "y": 1, "w": 4, "h": 5 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(count_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" [$__range]))", - "queryType": "range" - } - ], - "options": { - "colorMode": "none", - "graphMode": "area", - "textMode": "auto", - "wideLayout": true, - "justifyMode": "auto", - "orientation": "auto", - "text": { "titleSize": 12, "valueSize": 32 }, - "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "fixed", "fixedColor": "#A0A0A0" }, - "unit": "short", - "decimals": 0, - "thresholds": { "mode": "absolute", "steps": [{ "value": null, "color": "#A0A0A0" }] } - }, - "overrides": [] - } - }, - { - "id": 5, - "title": "Avg Cost / Session", - "description": "Average estimated cost per completed session.", - "type": "stat", - "transparent": true, - "gridPos": { "x": 19, "y": 1, "w": 5, "h": 5 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "avg_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [$__range])", - "queryType": "range" - } - ], - "options": { - "colorMode": "background-gradient", - 
"graphMode": "none", - "textMode": "auto", - "wideLayout": true, - "justifyMode": "auto", - "orientation": "auto", - "text": { "titleSize": 12, "valueSize": 32 }, - "reduceOptions": { "values": false, "calcs": ["mean"], "fields": "" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "unit": "currencyUSD", - "decimals": 3, - "thresholds": { - "mode": "absolute", - "steps": [ - { "value": null, "color": "#73BF69" }, - { "value": 1, "color": "#FADE2A" }, - { "value": 5, "color": "#FF9830" }, - { "value": 15, "color": "#F2495C" } - ] - } - }, - "overrides": [] - } - }, - - { - "type": "row", - "title": "Token Burn — All Types", - "collapsed": false, - "gridPos": { "x": 0, "y": 6, "w": 24, "h": 1 } - }, - { - "id": 6, - "title": "Token Consumption Over Time", - "description": "Stacked view of all four token categories per session window. Cache reads typically dominate — that's a good sign.", - "type": "timeseries", - "transparent": true, - "gridPos": { "x": 0, "y": 7, "w": 24, "h": 10 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "Cache Read", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval]))", - "legendFormat": "Cache Read", - "queryType": "range" - }, - { - "refId": "Output", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_output_tokens [$__interval]))", - "legendFormat": "Output", - "queryType": "range" - }, - { - "refId": "Cache Create", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": 
"sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__interval]))", - "legendFormat": "Cache Create", - "queryType": "range" - }, - { - "refId": "Input", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_input_tokens [$__interval]))", - "legendFormat": "Input", - "queryType": "range" - } - ], - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": ["sum", "mean", "max"] - }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "unit": "short", - "custom": { - "drawStyle": "line", - "lineInterpolation": "smooth", - "lineWidth": 2, - "fillOpacity": 20, - "gradientMode": "opacity", - "showPoints": "never", - "spanNulls": false, - "axisBorderShow": false, - "stacking": { "mode": "normal", "group": "A" } - } - }, - "overrides": [ - { - "matcher": { "id": "byName", "options": "Cache Read" }, - "properties": [ - { "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }, - { "id": "custom.fillOpacity", "value": 25 } - ] - }, - { - "matcher": { "id": "byName", "options": "Output" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byName", "options": "Cache Create" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byName", "options": "Input" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#6C7280", "mode": "fixed" } }] - } - ] - } - }, - - { - "type": "row", - "title": "Daily Usage by Model", - "collapsed": false, - "gridPos": { "x": 0, "y": 17, "w": 24, "h": 1 } - }, - { - "id": 
7, - "title": "Output Tokens per Day — by Model", - "description": "Stacked daily bars showing output token volume per model. Reveals model switching and high-burn days at a glance.", - "type": "timeseries", - "transparent": true, - "gridPos": { "x": 0, "y": 18, "w": 16, "h": 9 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_output_tokens [$__interval]))", - "legendFormat": "{{model}}", - "queryType": "range" - } - ], - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": ["sum", "max"] - }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "unit": "short", - "custom": { - "drawStyle": "bars", - "lineWidth": 1, - "fillOpacity": 80, - "gradientMode": "none", - "showPoints": "never", - "spanNulls": false, - "axisBorderShow": false, - "stacking": { "mode": "normal", "group": "A" }, - "barAlignment": 0 - } - }, - "overrides": [ - { - "matcher": { "id": "byRegexp", "options": ".*opus.*" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byRegexp", "options": ".*sonnet.*" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byRegexp", "options": ".*haiku.*" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }] - } - ] - } - }, - { - "id": 8, - "title": "Spend by Model", - "description": "Cumulative estimated cost share per model over the selected period.", - "type": "piechart", - "transparent": true, - "gridPos": { "x": 16, "y": 18, "w": 8, "h": 9 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - 
"targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [$__range]))", - "legendFormat": "{{model}}", - "queryType": "range" - } - ], - "options": { - "pieType": "donut", - "displayLabels": ["name", "percent"], - "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }, - "tooltip": { "mode": "multi" }, - "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "unit": "currencyUSD", - "decimals": 3 - }, - "overrides": [ - { - "matcher": { "id": "byRegexp", "options": ".*opus.*" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byRegexp", "options": ".*sonnet.*" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byRegexp", "options": ".*haiku.*" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }] - } - ] - } - }, - - { - "type": "row", - "title": "Effort & Cache Efficiency", - "collapsed": false, - "gridPos": { "x": 0, "y": 27, "w": 24, "h": 1 } - }, - { - "id": 9, - "title": "Cost by Effort Level", - "description": "Standard = default mode. Extended thinking = thinking blocks enabled. 
Fast = /fast mode.", - "type": "barchart", - "transparent": true, - "gridPos": { "x": 0, "y": 28, "w": 8, "h": 9 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum by (effort) (sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [$__range]))", - "legendFormat": "{{effort}}", - "queryType": "range" - } - ], - "transformations": [ - { "id": "reduce", "options": { "reducers": ["sum"] } }, - { "id": "sortBy", "options": { "fields": [{ "desc": true, "displayName": "Sum" }] } } - ], - "options": { - "orientation": "horizontal", - "barWidth": 0.7, - "groupWidth": 0.7, - "showValue": "always", - "stacking": "none", - "xTickLabelMaxLength": 24, - "legend": { "displayMode": "hidden" }, - "tooltip": { "mode": "multi" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "unit": "currencyUSD", - "decimals": 3, - "custom": { "fillOpacity": 80, "gradientMode": "none" } - }, - "overrides": [ - { - "matcher": { "id": "byName", "options": "standard" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byName", "options": "extended_thinking" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byName", "options": "fast" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }] - } - ] - } - }, - { - "id": 10, - "title": "Output Tokens by Effort Level", - "description": "Session count and output token volume per effort mode.", - "type": "barchart", - "transparent": true, - "gridPos": { "x": 8, "y": 28, "w": 8, "h": 9 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": 
"sum by (effort) (sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\"} | json | session_id=~\"$session_id\" | unwrap total_output_tokens [$__range]))", - "legendFormat": "{{effort}}", - "queryType": "range" - } - ], - "transformations": [ - { "id": "reduce", "options": { "reducers": ["sum"] } }, - { "id": "sortBy", "options": { "fields": [{ "desc": true, "displayName": "Sum" }] } } - ], - "options": { - "orientation": "horizontal", - "barWidth": 0.7, - "groupWidth": 0.7, - "showValue": "always", - "stacking": "none", - "xTickLabelMaxLength": 24, - "legend": { "displayMode": "hidden" }, - "tooltip": { "mode": "multi" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "unit": "short", - "decimals": 0, - "custom": { "fillOpacity": 80, "gradientMode": "none" } - }, - "overrides": [ - { - "matcher": { "id": "byName", "options": "standard" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byName", "options": "extended_thinking" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byName", "options": "fast" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }] - } - ] - } - }, - { - "id": 11, - "title": "Cache Efficiency Over Time", - "description": "Cache hit rate (%) per session window. 
Sustained high rates mean context is being efficiently reused across turns.", - "type": "timeseries", - "transparent": true, - "gridPos": { "x": 16, "y": 28, "w": 8, "h": 9 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval]))", - "queryType": "range", - "hide": true - }, - { - "refId": "B", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_input_tokens [$__interval]))", - "queryType": "range", - "hide": true - }, - { - "refId": "C", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__interval]))", - "queryType": "range", - "hide": true - }, - { - "refId": "CacheRate", - "datasource": { "type": "__expr__", "uid": "__expr__" }, - "type": "math", - "expression": "($A / ($A + $B + $C)) * 100", - "legendFormat": "Cache Hit %" - } - ], - "options": { - "legend": { "displayMode": "hidden" }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "unit": "percent", - "min": 0, - "max": 100, - "custom": { - "drawStyle": "line", - "lineInterpolation": "smooth", - "lineWidth": 2, - "fillOpacity": 20, - "gradientMode": "scheme", - "showPoints": "always", - "pointSize": 5, - "spanNulls": false, - "axisBorderShow": false, - "thresholdsStyle": { "mode": "area" } - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "value": 
null, "color": "#F2495C" }, - { "value": 40, "color": "#FF9830" }, - { "value": 70, "color": "#73BF69" } - ] - } - }, - "overrides": [] - } - }, - - { - "type": "row", - "title": "Session Leaderboard", - "collapsed": false, - "gridPos": { "x": 0, "y": 37, "w": 24, "h": 1 } - }, - { - "id": 12, - "title": "Top Sessions by Cost", - "description": "Most expensive sessions in the selected period. Bar length = estimated USD spend. Identify long/costly outlier sessions here.", - "type": "barchart", - "transparent": true, - "gridPos": { "x": 0, "y": 38, "w": 14, "h": 12 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "topk(15, sum by (session_id) (sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json session_id=\"session_id\" | session_id=~\"$session_id\" | unwrap cost_usd [$__range])))", - "legendFormat": "{{session_id}}", - "queryType": "range" - } - ], - "transformations": [ - { "id": "reduce", "options": { "reducers": ["sum"] } }, - { "id": "sortBy", "options": { "fields": [{ "desc": true, "displayName": "Sum" }] } } - ], - "options": { - "orientation": "horizontal", - "barWidth": 0.7, - "groupWidth": 0.7, - "showValue": "always", - "stacking": "none", - "xTickLabelMaxLength": 28, - "legend": { "displayMode": "hidden" }, - "tooltip": { "mode": "single" } - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-GrYlRd" - }, - "unit": "currencyUSD", - "decimals": 3, - "custom": { "fillOpacity": 85, "gradientMode": "none" } - }, - "overrides": [] - } - }, - { - "id": 13, - "title": "Recent Session Log", - "description": "Raw session records. Each line = one completed Claude Code session. 
Includes model, effort, cost, token counts, and turns.", - "type": "logs", - "transparent": true, - "gridPos": { "x": 14, "y": 38, "w": 10, "h": 12 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "{app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | line_format \"{{.model}} | {{.effort}} | ${{.cost_usd}} | out={{.total_output_tokens}} | turns={{.assistant_turns}} | cache={{.total_cache_read_tokens}} | {{.session_id}}\"", - "queryType": "range" - } - ], - "options": { - "dedupStrategy": "none", - "enableLogDetails": true, - "prettifyLogMessage": false, - "showCommonLabels": false, - "showLabels": false, - "showTime": true, - "sortOrder": "Descending", - "wrapLogMessage": false - } - }, - - { - "type": "row", - "title": "Cost Trend", - "collapsed": false, - "gridPos": { "x": 0, "y": 50, "w": 24, "h": 1 } - }, - { - "id": 14, - "title": "Daily Spend Trend", - "description": "Estimated USD cost per day. 
Spot cost spikes and track efficiency gains over time.", - "type": "timeseries", - "transparent": true, - "gridPos": { "x": 0, "y": 51, "w": 16, "h": 8 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [1d]))", - "legendFormat": "Daily Cost", - "queryType": "range" - } - ], - "options": { - "legend": { "displayMode": "hidden" }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "fixed", "fixedColor": "#FADE2A" }, - "unit": "currencyUSD", - "decimals": 2, - "custom": { - "drawStyle": "bars", - "lineWidth": 1, - "fillOpacity": 70, - "gradientMode": "opacity", - "showPoints": "never", - "spanNulls": false, - "axisBorderShow": false, - "barAlignment": 0 - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "value": null, "color": "#73BF69" }, - { "value": 5, "color": "#FF9830" }, - { "value": 15, "color": "#F2495C" } - ] - } - }, - "overrides": [] - } - }, - { - "id": 15, - "title": "Assistant Turns per Session", - "description": "Distribution of session depth (number of back-and-forth turns). 
Long sessions = more complex work or exploration.", - "type": "timeseries", - "transparent": true, - "gridPos": { "x": 16, "y": 51, "w": 8, "h": 8 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "avg_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap assistant_turns [$__interval])", - "legendFormat": "Avg Turns", - "queryType": "range" - }, - { - "refId": "B", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "max_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap assistant_turns [$__interval])", - "legendFormat": "Max Turns", - "queryType": "range" - } - ], - "options": { - "legend": { "displayMode": "list", "placement": "bottom" }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "unit": "short", - "decimals": 0, - "custom": { - "drawStyle": "line", - "lineInterpolation": "smooth", - "lineWidth": 2, - "fillOpacity": 10, - "gradientMode": "none", - "showPoints": "always", - "pointSize": 5, - "spanNulls": false, - "axisBorderShow": false - } - }, - "overrides": [ - { - "matcher": { "id": "byName", "options": "Max Turns" }, - "properties": [ - { "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }, - { "id": "custom.lineWidth", "value": 1 }, - { "id": "custom.lineStyle", "value": { "dash": [4, 4], "fill": "dash" } } - ] - }, - { - "matcher": { "id": "byName", "options": "Avg Turns" }, - "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] - } - ] - } - } - ] -} diff --git a/observability/local/grafana/provisioning/dashboards/contextstream-deep-dive.json 
b/observability/local/grafana/provisioning/dashboards/contextstream-deep-dive.json deleted file mode 100644 index 1a918b1..0000000 --- a/observability/local/grafana/provisioning/dashboards/contextstream-deep-dive.json +++ /dev/null @@ -1,432 +0,0 @@ -{ - "id": null, - "uid": "contextstream-deep-dive", - "title": "ContextStream — Deep Dive", - "description": "MCP call patterns, action distribution, object lifecycle, and performance for ContextStream integration.", - "tags": ["claude-code", "contextstream", "mcp"], - "timezone": "browser", - "editable": true, - "graphTooltip": 1, - "time": { "from": "now-6h", "to": "now" }, - "refresh": "30s", - "schemaVersion": 39, - "fiscalYearStartMonth": 0, - "liveNow": false, - "style": "dark", - "templating": { - "list": [ - { - "name": "session_id", - "label": "Session", - "type": "query", - "datasource": { "type": "loki", "uid": "loki_local" }, - "query": "{app=\"claude-dev-logging\", component=\"lifecycle\"} | json | hook_type=\"session-start\"", - "regex": "session_id\":\"([^\"]+)", - "refresh": 2, - "includeAll": true, - "allValue": ".*", - "current": { "text": "All", "value": "$__all" }, - "sort": 2 - }, - { - "name": "cs_action", - "label": "Action", - "type": "query", - "datasource": { "type": "loki", "uid": "loki_local" }, - "query": "{app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | hook_type=\"post-tool-use\"", - "regex": "hook_payload_tool_input_action\":\"([^\"]+)", - "refresh": 2, - "includeAll": true, - "allValue": ".*", - "current": { "text": "All", "value": "$__all" }, - "sort": 1 - } - ] - }, - "panels": [ - { - "type": "row", - "title": "MCP Call Summary", - "collapsed": false, - "gridPos": { "x": 0, "y": 0, "w": 24, "h": 1 } - }, - { - "id": 1, - "title": "Total CS Calls", - "type": "stat", - "transparent": true, - "gridPos": { "x": 0, "y": 1, "w": 6, "h": 5 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", 
"uid": "loki_local" }, - "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range])", - "queryType": "range" - } - ], - "options": { - "colorMode": "background-gradient", - "graphMode": "area", - "textMode": "auto", - "wideLayout": true, - "justifyMode": "auto", - "orientation": "auto", - "text": { "titleSize": 12, "valueSize": 32 }, - "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "fixed", "fixedColor": "#B877D9" }, - "thresholds": { "mode": "absolute", "steps": [{ "value": null, "color": "#B877D9" }] } - }, - "overrides": [] - } - }, - { - "id": 2, - "title": "Failure Rate", - "type": "gauge", - "transparent": true, - "gridPos": { "x": 6, "y": 1, "w": 6, "h": 5 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "errors", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\", level=\"ERROR\"} | json | session_id=~\"$session_id\" [$__range])", - "queryType": "range", - "hide": true - }, - { - "refId": "total", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range])", - "queryType": "range", - "hide": true - }, - { - "refId": "rate", - "datasource": { "type": "__expr__", "uid": "__expr__" }, - "type": "math", - "expression": "$errors / $total * 100" - } - ], - "options": { - "showThresholdLabels": false, - "showThresholdMarkers": true, - "reduceOptions": { "values": false, "calcs": ["lastNotNull"], "fields": "" } - }, - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ 
- { "value": null, "color": "#73BF69" }, - { "value": 5, "color": "#FF9830" }, - { "value": 20, "color": "#F2495C" } - ] - } - }, - "overrides": [] - } - }, - { - "id": 3, - "title": "Calls by Tool", - "type": "piechart", - "transparent": true, - "gridPos": { "x": 12, "y": 1, "w": 12, "h": 5 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum by (tool_name) (count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range]))", - "queryType": "range" - } - ], - "options": { - "pieType": "donut", - "displayLabels": ["percent"], - "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }, - "tooltip": { "mode": "multi" }, - "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" } - }, - "fieldConfig": { - "defaults": { "color": { "mode": "palette-classic" } }, - "overrides": [ - { "matcher": { "id": "byName", "options": "mcp__contextstream__memory" }, "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "mcp__contextstream__session" }, "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "mcp__contextstream__search" }, "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "mcp__contextstream__context" }, "properties": [{ "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "mcp__contextstream__init" }, "properties": [{ "id": "color", "value": { "fixedColor": "#FADE2A", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "mcp__contextstream__workspace" }, "properties": [{ "id": "color", "value": { 
"fixedColor": "#8AB8FF", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "mcp__contextstream__project" }, "properties": [{ "id": "color", "value": { "fixedColor": "#CA95E5", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "mcp__contextstream__help" }, "properties": [{ "id": "color", "value": { "fixedColor": "#96D98D", "mode": "fixed" } }] } - ] - } - }, - { - "type": "row", - "title": "Action Patterns", - "collapsed": false, - "gridPos": { "x": 0, "y": 7, "w": 24, "h": 1 } - }, - { - "id": 4, - "title": "Action Breakdown", - "description": "Which MCP actions are called most frequently.", - "type": "barchart", - "transparent": true, - "gridPos": { "x": 0, "y": 8, "w": 12, "h": 10 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum by (hook_payload_tool_input_action) (count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range]))", - "queryType": "range" - } - ], - "options": { - "orientation": "horizontal", - "barWidth": 0.7, - "groupWidth": 0.7, - "showValue": "auto", - "stacking": "none", - "legend": { "displayMode": "hidden" }, - "tooltip": { "mode": "multi" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "fixed", "fixedColor": "#B877D9" }, - "custom": { "fillOpacity": 80, "gradientMode": "hue" } - }, - "overrides": [] - } - }, - { - "id": 5, - "title": "Action Mix Over Time", - "description": "How action usage patterns shift during a session.", - "type": "timeseries", - "transparent": true, - "gridPos": { "x": 12, "y": 8, "w": 12, "h": 10 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum by (hook_payload_tool_input_action) (count_over_time({app=\"claude-dev-logging\", 
component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__interval]))", - "legendFormat": "{{hook_payload_tool_input_action}}", - "queryType": "range" - } - ], - "options": { - "legend": { "displayMode": "table", "placement": "right", "calcs": ["sum"] }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "drawStyle": "line", - "lineInterpolation": "smooth", - "lineWidth": 2, - "fillOpacity": 10, - "gradientMode": "opacity", - "showPoints": "never", - "spanNulls": false, - "axisBorderShow": false, - "stacking": { "mode": "none", "group": "A" } - } - }, - "overrides": [] - } - }, - { - "type": "row", - "title": "Object Lifecycle", - "collapsed": false, - "gridPos": { "x": 0, "y": 19, "w": 24, "h": 1 } - }, - { - "id": 6, - "title": "CRUD Operations", - "description": "Create, read, update, and query operation distribution.", - "type": "barchart", - "transparent": true, - "gridPos": { "x": 0, "y": 20, "w": 12, "h": 9 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "Create", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | hook_payload_tool_input_action=~\"create.*|capture.*|import.*|remember\" [$__range])", - "legendFormat": "Create", - "queryType": "range" - }, - { - "refId": "Read", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | hook_payload_tool_input_action=~\"get.*|list.*\" [$__range])", - "legendFormat": "Read", - "queryType": "range" - }, - { - "refId": "Update", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": 
"count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | hook_payload_tool_input_action=~\"update.*|supersede.*|complete.*|reorder.*\" [$__range])", - "legendFormat": "Update", - "queryType": "range" - }, - { - "refId": "Query", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | hook_payload_tool_input_action=~\"search|recall|decisions|summary|timeline|smart_search|decision_trace\" [$__range])", - "legendFormat": "Query", - "queryType": "range" - } - ], - "options": { - "orientation": "vertical", - "barWidth": 0.6, - "groupWidth": 0.7, - "showValue": "always", - "stacking": "none", - "legend": { "displayMode": "list", "placement": "bottom" }, - "tooltip": { "mode": "multi" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "fillOpacity": 80, "gradientMode": "hue" } - }, - "overrides": [ - { "matcher": { "id": "byName", "options": "Create" }, "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "Read" }, "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "Update" }, "properties": [{ "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "Query" }, "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }] } - ] - } - }, - { - "id": 7, - "title": "Context & Search Patterns", - "description": "Context refresh frequency and search mode usage over time.", - "type": "timeseries", - "transparent": true, - "gridPos": { "x": 12, "y": 20, "w": 12, "h": 9 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - 
"targets": [ - { - "refId": "context", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum by (tool_name) (count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name=~\".*init|.*context\" [$__interval]))", - "legendFormat": "{{tool_name}}", - "queryType": "range" - }, - { - "refId": "search", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "sum by (hook_payload_tool_input_mode) (count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name=~\".*search\" [$__interval]))", - "legendFormat": "search:{{hook_payload_tool_input_mode}}", - "queryType": "range" - } - ], - "options": { - "legend": { "displayMode": "table", "placement": "right", "calcs": ["sum"] }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "drawStyle": "line", - "lineInterpolation": "smooth", - "lineWidth": 2, - "fillOpacity": 15, - "gradientMode": "opacity", - "showPoints": "never", - "spanNulls": false, - "axisBorderShow": false, - "stacking": { "mode": "none", "group": "A" } - } - }, - "overrides": [] - } - }, - { - "type": "row", - "title": "Errors & Failures", - "collapsed": false, - "gridPos": { "x": 0, "y": 30, "w": 24, "h": 1 } - }, - { - "id": 8, - "title": "Error Log", - "description": "All ContextStream MCP errors.", - "type": "logs", - "transparent": true, - "gridPos": { "x": 0, "y": 31, "w": 24, "h": 8 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "{app=\"claude-dev-logging\", component=\"mcp-contextstream\", level=\"ERROR\"} | json | session_id=~\"$session_id\"", - "queryType": "range" - } - ], - "options": { - "showTime": true, - "showLabels": 
false, - "showCommonLabels": false, - "wrapLogMessage": true, - "prettifyLogMessage": false, - "enableLogDetails": true, - "sortOrder": "Descending", - "dedupStrategy": "none" - } - }, - { - "type": "row", - "title": "Full Log Stream", - "collapsed": true, - "gridPos": { "x": 0, "y": 40, "w": 24, "h": 1 }, - "panels": [ - { - "id": 9, - "title": "ContextStream Logs", - "type": "logs", - "transparent": true, - "gridPos": { "x": 0, "y": 41, "w": 24, "h": 14 }, - "datasource": { "type": "loki", "uid": "loki_local" }, - "targets": [ - { - "refId": "A", - "datasource": { "type": "loki", "uid": "loki_local" }, - "expr": "{app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | line_format \"{{.tool_name}} | action={{.hook_payload_tool_input_action}} | {{.hook_type}}\"", - "queryType": "range" - } - ], - "options": { - "showTime": true, - "showLabels": false, - "showCommonLabels": false, - "wrapLogMessage": true, - "prettifyLogMessage": false, - "enableLogDetails": true, - "sortOrder": "Descending", - "dedupStrategy": "none" - } - } - ] - } - ] -} diff --git a/observability/local/grafana/provisioning/dashboards/simsteward-deploy-health.json b/observability/local/grafana/provisioning/dashboards/simsteward-deploy-health.json index 7be1cdb..83539b8 100644 --- a/observability/local/grafana/provisioning/dashboards/simsteward-deploy-health.json +++ b/observability/local/grafana/provisioning/dashboards/simsteward-deploy-health.json @@ -1,5 +1,6 @@ { "annotations": { "list": [] }, + "description": "Deploy health — deploy markers, plugin bring-up, bridge start, and error volume. deploy.ps1 pushes event=deploy_marker when SIMSTEWARD_LOKI_URL is set. 
post_deploy_warn=true means post-deploy tests/*.ps1 failed.", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -8,19 +9,146 @@ "liveNow": false, "panels": [ { - "gridPos": { "h": 3, "w": 24, "x": 0, "y": 0 }, - "id": 1, + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#5794F2", "value": null }] } + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 8, "x": 0, "y": 0 }, + "id": 8, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\"} | json | event=\"deploy_marker\" [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Deploys This Period", + "type": "stat" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#F2495C", "value": 1 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 8, "x": 8, "y": 0 }, + "id": 9, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\"} | json | event=\"deploy_marker\" | 
post_deploy_warn=\"true\" [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Post-Deploy Warnings", + "type": "stat" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#F2495C", "value": 1 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 8, "x": 16, "y": 0 }, + "id": 10, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\"} | json | event=\"bridge_start_failed\" [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Bridge Start Failures", + "type": "stat" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "gradientMode": "opacity", + "fillOpacity": 20, + "lineWidth": 2, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": 3600000, + "axisLabel": "Deploys" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 5 }, + "id": 11, "options": { - "code": { "language": "markdown", "showLineNumbers": false, "showMiniMap": false }, - "content": "**Deploy health** — Correlates `deploy.ps1` with plugin bring-up in Loki.\n\n- **Deploy markers** — Lines pushed at end of `deploy.ps1` when `SIMSTEWARD_LOKI_URL` is set (`event=deploy_marker`). 
`post_deploy_warn=true` means post-deploy `tests/*.ps1` failed after retry.\n- **Plugin / bridge** — `plugin_ready` and `bridge_start_failed` show whether SimHub loaded the plugin and WebSocket started.\n- **Errors** — Structured ERROR lines; spike after a bad deploy often means SimHub/plugin mismatch or WS failure.\n\nOpen repo `deploy.ps1` console output for copy failures; this dashboard is **telemetry**, not a full deploy log.", - "mode": "markdown" + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } }, - "title": "About", - "type": "text" + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\"} | json | event=\"deploy_marker\" [$__interval]))", + "legendFormat": "Deploys", + "queryType": "range", + "refId": "A" + } + ], + "title": "Deploy Frequency", + "type": "timeseries" }, { "datasource": { "type": "loki", "uid": "loki_local" }, - "gridPos": { "h": 10, "w": 24, "x": 0, "y": 3 }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 11 }, "id": 2, "options": { "dedupStrategy": "none", @@ -41,12 +169,12 @@ "refId": "A" } ], - "title": "Deploy markers (deploy.ps1 → Loki)", + "title": "Deploy Markers (deploy.ps1 → Loki)", "type": "logs" }, { "datasource": { "type": "loki", "uid": "loki_local" }, - "gridPos": { "h": 9, "w": 12, "x": 0, "y": 13 }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 21 }, "id": 3, "options": { "dedupStrategy": "none", @@ -67,12 +195,12 @@ "refId": "A" } ], - "title": "Plugin / bridge lifecycle", + "title": "Plugin / Bridge Lifecycle", "type": "logs" }, { "datasource": { "type": "loki", "uid": "loki_local" }, - "gridPos": { "h": 9, "w": 12, "x": 12, "y": 13 }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 21 }, "id": 4, "options": { "dedupStrategy": "none", @@ -93,28 +221,30 @@ "refId": "A" } ], - "title": "WebSocket bridge failures", + "title": "WebSocket 
Bridge Failures", "type": "logs" }, { "datasource": { "type": "loki", "uid": "loki_local" }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, + "color": { "mode": "fixed", "fixedColor": "#F2495C" }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "drawStyle": "bars", - "fillOpacity": 40, - "lineWidth": 1, - "showPoints": "never" + "drawStyle": "line", + "lineInterpolation": "smooth", + "gradientMode": "opacity", + "fillOpacity": 25, + "lineWidth": 2, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": 3600000, + "axisLabel": "Errors" }, "unit": "short" }, "overrides": [] }, - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 30 }, "id": 5, "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, @@ -124,18 +254,18 @@ { "datasource": { "type": "loki", "uid": "loki_local" }, "editorMode": "code", - "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\", level=\"ERROR\"} [5m]))", - "legendFormat": "ERROR lines / 5m", + "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\", level=\"ERROR\"} [$__interval]))", + "legendFormat": "ERROR lines / interval", "queryType": "range", "refId": "A" } ], - "title": "ERROR log volume (5m buckets)", + "title": "ERROR Log Volume", "type": "timeseries" }, { "datasource": { "type": "loki", "uid": "loki_local" }, - "gridPos": { "h": 12, "w": 24, "x": 0, "y": 30 }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 38 }, "id": 6, "options": { "dedupStrategy": "none", @@ -156,12 +286,12 @@ "refId": "A" } ], - "title": "Recent ERROR lines (full)", + "title": "Recent ERROR Lines (Full)", "type": "logs" }, { "datasource": { "type": "loki", "uid": "loki_local" }, - "gridPos": { "h": 10, "w": 24, "x": 0, "y": 42 }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 50 }, "id": 7, "options": { "dedupStrategy": "none", @@ -182,7 +312,7 @@ "refId": "A" } ], - "title": "Failed 
actions (action_result success=false)", + "title": "Failed Actions (action_result success=false)", "type": "logs" } ], @@ -210,7 +340,7 @@ "time": { "from": "now-24h", "to": "now" }, "timepicker": {}, "timezone": "browser", - "title": "Sim Steward — Deploy health", + "title": "Sim Steward — Deploy Health", "uid": "simsteward-deploy-health", "version": 1, "weekStart": "" diff --git a/observability/local/grafana/provisioning/dashboards/simsteward-log-sentinel.json b/observability/local/grafana/provisioning/dashboards/simsteward-log-sentinel.json new file mode 100644 index 0000000..79bb00d --- /dev/null +++ b/observability/local/grafana/provisioning/dashboards/simsteward-log-sentinel.json @@ -0,0 +1,749 @@ +{ + "annotations": { "list": [] }, + "description": "Autonomous log-analysis pipeline — 16 detectors (app + ops), three-tier LLM (T0 detect → T1 deduplicate → T2 investigate). Cycle every 5 min. component=log-sentinel.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Sentinel Health", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }] } + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({component=\"log-sentinel\"} | json | 
event=\"sentinel_cycle\" [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Cycles Completed", + "type": "stat" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "drawStyle": "line", + "fillOpacity": 18, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": 3600000 + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 10, "x": 6, "y": 1 }, + "id": 3, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "avg(avg_over_time({component=\"log-sentinel\"} | json | event=\"sentinel_cycle\" | unwrap duration_ms [$__interval]))", + "legendFormat": "Avg Cycle Duration", + "queryType": "range", + "refId": "A" + } + ], + "title": "Cycle Duration", + "type": "timeseries" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#F2495C", "value": 1 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 8, "x": 16, "y": 1 }, + "id": 4, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": 
"sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_detector_run\"} | json | error != \"\" [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Detector Errors", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, + "id": 101, + "title": "Findings Overview", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 18, + "gradientMode": "opacity", + "lineWidth": 2, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": 3600000, + "axisLabel": "Findings", + "stacking": { "mode": "normal", "group": "A" } + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "App" }, + "properties": [{ "id": "color", "value": { "fixedColor": "#F2495C", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Ops" }, + "properties": [{ "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 8 }, + "id": 17, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"app\" [$__interval]))", + "legendFormat": "App", + "queryType": "range", + "refId": "A" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"ops\" [$__interval]))", + "legendFormat": "Ops", + "queryType": "range", + "refId": "B" + } + ], + "title": "Findings Over Time", + "type": "timeseries" + }, + { + 
"datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 8 }, + "id": 18, + "options": { + "pieType": "donut", + "displayLabels": ["name", "percent"], + "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }, + "tooltip": { "mode": "single" }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum by (severity) (count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json [$__range]))", + "legendFormat": "{{severity}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Findings by Severity", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 16 }, + "id": 102, + "title": "App Findings", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 17 }, + "id": 6, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "{component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"app\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "App Findings", + "type": "logs" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#FF9830", "value": 5 }, { "color": "#F2495C", "value": 15 }] } + }, + "overrides": [] + }, + 
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 17 }, + "id": 7, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"app\" [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "App Finding Count", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, + "id": 103, + "title": "Ops Findings", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 26 }, + "id": 8, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "{component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"ops\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "Ops Findings", + "type": "logs" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#FF9830", "value": 5 }, { "color": "#F2495C", "value": 15 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 26 }, + "id": 9, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": 
"value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"ops\" [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Ops Finding Count", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 34 }, + "id": 110, + "title": "Per-Detector Timing", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "gradientMode": "opacity", + "lineWidth": 2, + "pointSize": 4, + "showPoints": "auto", + "spanNulls": 3600000, + "axisLabel": "Duration (ms)" + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 35 }, + "id": 19, + "options": { + "legend": { "displayMode": "table", "placement": "right", "showLegend": true, "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "avg by (detector) (avg_over_time({component=\"log-sentinel\"} | json | event=\"sentinel_detector_run\" | unwrap duration_ms [$__interval]))", + "legendFormat": "{{detector}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Detector Duration by Name", + "type": "timeseries" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 43 }, + "id": 5, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + 
"wrapLogMessage": true + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "{component=\"log-sentinel\", event=\"sentinel_detector_run\"} | json", + "queryType": "range", + "refId": "A" + } + ], + "title": "Detector Runs", + "type": "logs" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 51 }, + "id": 104, + "title": "T2 LLM Activity", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }] } + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 52 }, + "id": 11, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_t2_run\"} [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "T2 Investigations", + "type": "stat" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#5794F2", "value": null }] } + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 52 }, + "id": 12, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { 
"type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_t2_run\"} | json | trigger=\"proactive\" [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Proactive Polls", + "type": "stat" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "unit": "ms", + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#8AB8FF", "value": null }, { "color": "#FF9830", "value": 10000 }, { "color": "#F2495C", "value": 30000 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 12, "y": 52 }, + "id": 20, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "avg(avg_over_time({component=\"log-sentinel\"} | json | event=\"sentinel_t2_run\" | unwrap duration_ms [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Avg T2 Duration", + "type": "stat" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#B877D9", "value": null }] } + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 18, "y": 52 }, + "id": 21, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": 
"loki_local" }, + "editorMode": "code", + "expr": "avg(avg_over_time({component=\"log-sentinel\"} | json | event=\"sentinel_t2_run\" | unwrap tokens_used [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Avg T2 Tokens", + "type": "stat" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 58 }, + "id": 10, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "{component=\"log-sentinel\", event=\"sentinel_t2_run\"} | json", + "queryType": "range", + "refId": "A" + } + ], + "title": "T2 Run Metrics", + "type": "logs" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 66 }, + "id": 105, + "title": "T2 Investigation Reports", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 67 }, + "id": 13, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "{component=\"log-sentinel\", event=\"sentinel_investigation\"} | json", + "queryType": "range", + "refId": "A" + } + ], + "title": "Investigation Reports", + "type": "logs" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 77 }, + "id": 106, + "title": "Sentry Issues", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "gridPos": { "h": 6, "w": 16, "x": 0, "y": 78 }, + "id": 14, + "options": { + "dedupStrategy": "none", + 
"enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "{component=\"log-sentinel\", event=\"sentinel_sentry_issue\"} | json", + "queryType": "range", + "refId": "A" + } + ], + "title": "Sentry Issues Created", + "type": "logs" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "#5794F2", "value": null }] } + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 8, "x": 16, "y": 78 }, + "id": 15, + "options": { + "colorMode": "background-gradient", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "textMode": "value", + "text": { "titleSize": 12, "valueSize": 36 }, + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_sentry_issue\"} [$__range]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Sentry Issues", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 84 }, + "id": 107, + "title": "Process Logs", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 85 }, + "id": 16, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki_local" }, + "editorMode": "code", + "expr": 
"{component=\"log-sentinel\", event=\"sentinel_log\"} | json", + "queryType": "range", + "refId": "A" + } + ], + "title": "Sentinel Process Logs", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["simsteward", "log-sentinel", "observability"], + "templating": { + "list": [ + { + "current": { "selected": true, "text": "local", "value": "local" }, + "hide": 0, + "includeAll": false, + "label": "env", + "name": "env", + "options": [ + { "selected": true, "text": "local", "value": "local" }, + { "selected": false, "text": "production", "value": "production" } + ], + "query": "local,production", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "SimSteward — Log Sentinel", + "uid": "simsteward-log-sentinel", + "version": 1, + "weekStart": "" +} diff --git a/observability/local/log-sentinel/Dockerfile b/observability/local/log-sentinel/Dockerfile new file mode 100644 index 0000000..f72d0cb --- /dev/null +++ b/observability/local/log-sentinel/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.12-slim +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY . . 
+EXPOSE 8081 +CMD ["python", "app.py"] diff --git a/observability/local/log-sentinel/__pycache__/circuit_breaker.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/circuit_breaker.cpython-313.pyc new file mode 100644 index 0000000..d1e105e Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/circuit_breaker.cpython-313.pyc differ diff --git a/observability/local/log-sentinel/__pycache__/config.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/config.cpython-313.pyc new file mode 100644 index 0000000..321aeef Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/config.cpython-313.pyc differ diff --git a/observability/local/log-sentinel/__pycache__/grafana_client.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/grafana_client.cpython-313.pyc new file mode 100644 index 0000000..0bb3368 Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/grafana_client.cpython-313.pyc differ diff --git a/observability/local/log-sentinel/__pycache__/loki_client.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/loki_client.cpython-313.pyc new file mode 100644 index 0000000..b2b45cf Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/loki_client.cpython-313.pyc differ diff --git a/observability/local/log-sentinel/__pycache__/loki_handler.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/loki_handler.cpython-313.pyc new file mode 100644 index 0000000..16bfaca Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/loki_handler.cpython-313.pyc differ diff --git a/observability/local/log-sentinel/__pycache__/models.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/models.cpython-313.pyc new file mode 100644 index 0000000..e8dd266 Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/models.cpython-313.pyc differ diff --git 
a/observability/local/log-sentinel/__pycache__/query_cache.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/query_cache.cpython-313.pyc new file mode 100644 index 0000000..894b25d Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/query_cache.cpython-313.pyc differ diff --git a/observability/local/log-sentinel/__pycache__/sentry_client.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/sentry_client.cpython-313.pyc new file mode 100644 index 0000000..27141cd Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/sentry_client.cpython-313.pyc differ diff --git a/observability/local/log-sentinel/analyst.py b/observability/local/log-sentinel/analyst.py new file mode 100644 index 0000000..ce4f94b --- /dev/null +++ b/observability/local/log-sentinel/analyst.py @@ -0,0 +1,376 @@ +"""LLM-driven analyst — T1 fast scan and T2 deep investigation.""" + +import json +import logging +import re +import time +from dataclasses import dataclass, field + +from circuit_breaker import CircuitBreaker +from config import Config +from loki_client import LokiClient +from ollama_client import OllamaClient +from prompts import ( + T1_SYSTEM, T1_SUMMARY_PROMPT, T1_ANOMALY_PROMPT, + T2_SYSTEM, T2_INVESTIGATION_PROMPT, + LOGQL_GEN_SYSTEM, LOGQL_GEN_PROMPT, + build_stream_guide, format_log_sample, format_logql_results, +) +from timeline import TimelineEvent + +logger = logging.getLogger("sentinel.analyst") + + +@dataclass +class T1Result: + summary: str + cycle_notes: str + anomalies: list[dict] + model: str + summary_duration_ms: int + anomaly_duration_ms: int + raw_summary_response: str + raw_anomaly_response: str + + @property + def needs_t2(self) -> bool: + return any(a.get("needs_t2") for a in self.anomalies) + + @property + def total_duration_ms(self) -> int: + return self.summary_duration_ms + self.anomaly_duration_ms + + +@dataclass +class T2Result: + root_cause: str + issue_type: str + confidence: str + correlation: 
str + impact: str + recommendation: str + logql_queries_used: list[str] + sentry_worthy: bool + model: str + inference_duration_ms: int + logql_gather_duration_ms: int + raw_response: str = field(repr=False) + + @property + def total_duration_ms(self) -> int: + return self.inference_duration_ms + self.logql_gather_duration_ms + + +class Analyst: + def __init__( + self, + ollama: OllamaClient, + loki: LokiClient, + breaker: CircuitBreaker, + config: Config, + ): + self.ollama = ollama + self.loki = loki + self.breaker = breaker + self.config = config + self._stream_guide = build_stream_guide() + + # ── T1 ────────────────────────────────────────────────────────────────── + + def run_t1( + self, + start_ns: int, + end_ns: int, + counts: dict[str, int], + sim_steward_sample: list[dict], + claude_dev_sample: list[dict], + claude_token_sample: list[dict], + ) -> T1Result: + window_minutes = max(1, int((end_ns - start_ns) / 1e9 / 60)) + counts_text = "\n".join(f" {k}: {v}" for k, v in counts.items()) + + samples = dict( + sim_steward_sample=format_log_sample(sim_steward_sample), + sim_steward_count=len(sim_steward_sample), + claude_dev_sample=format_log_sample(claude_dev_sample), + claude_dev_count=len(claude_dev_sample), + claude_token_sample=format_log_sample(claude_token_sample), + claude_token_count=len(claude_token_sample), + ) + + system = T1_SYSTEM.format(stream_guide=self._stream_guide) + + # Call A: summary (/no_think — fast) + summary_prompt = T1_SUMMARY_PROMPT.format( + window_minutes=window_minutes, + counts=counts_text, + **samples, + ) + summary_text = "" + cycle_notes = "" + summary_ms = 0 + raw_summary = "" + try: + raw_summary, summary_ms = self.ollama.generate( + self.config.ollama_model_fast, + system + "\n\n" + summary_prompt, + think=False, + ) + self.breaker.record_success() + parsed = _parse_json(raw_summary) + summary_text = parsed.get("summary", "") + cycle_notes = parsed.get("cycle_notes", "") + except Exception as e: + 
self.breaker.record_failure() + logger.error("T1 summary call failed: %s", e) + + # Call B: anomaly scan (/think — reasoning) + anomaly_prompt = T1_ANOMALY_PROMPT.format( + summary=summary_text or "(summary unavailable)", + counts=counts_text, + **samples, + ) + anomalies = [] + anomaly_ms = 0 + raw_anomaly = "" + try: + raw_anomaly, anomaly_ms = self.ollama.generate( + self.config.ollama_model_fast, + system + "\n\n" + anomaly_prompt, + think=True, + ) + self.breaker.record_success() + parsed = _parse_json(raw_anomaly) + anomalies = _normalize_anomalies(parsed.get("anomalies", [])) + except Exception as e: + self.breaker.record_failure() + logger.error("T1 anomaly call failed: %s", e) + + logger.info( + "T1 complete: %d anomalies (%d need T2), summary=%dms anomaly=%dms", + len(anomalies), + sum(1 for a in anomalies if a.get("needs_t2")), + summary_ms, + anomaly_ms, + ) + + return T1Result( + summary=summary_text, + cycle_notes=cycle_notes, + anomalies=anomalies, + model=self.config.ollama_model_fast, + summary_duration_ms=summary_ms, + anomaly_duration_ms=anomaly_ms, + raw_summary_response=raw_summary, + raw_anomaly_response=raw_anomaly, + ) + + # ── T2 ────────────────────────────────────────────────────────────────── + + def run_t2( + self, + t1_result: T1Result, + timeline: list[TimelineEvent], + start_ns: int, + end_ns: int, + ) -> T2Result: + window_minutes = max(1, int((end_ns - start_ns) / 1e9 / 60)) + t2_anomalies = [a for a in t1_result.anomalies if a.get("needs_t2")] + + # Step 1: generate LogQL queries + gather_start = time.time() + queries = self._generate_logql_queries(t2_anomalies, window_minutes) + + # Step 2: execute queries + logql_results = self._execute_logql_queries(queries, start_ns, end_ns) + gather_ms = int((time.time() - gather_start) * 1000) + + # Step 3: build T2 prompt + from timeline import TimelineBuilder + # Use a simple formatter — timeline already built, just need text + timeline_text = _format_timeline_for_prompt(timeline) + + 
anomaly_descriptions = "\n".join( + f"- [{a.get('severity','?').upper()}] {a.get('id','?')}: {a.get('description','')}" + for a in t2_anomalies + ) + + system = T2_SYSTEM.format(stream_guide=self._stream_guide) + prompt = T2_INVESTIGATION_PROMPT.format( + anomaly_descriptions=anomaly_descriptions, + window_minutes=window_minutes, + timeline_text=timeline_text, + logql_results=format_logql_results(logql_results), + logql_queries_list=json.dumps(queries), + ) + + # Step 4: T2 inference + raw = "" + infer_ms = 0 + try: + raw, infer_ms = self.ollama.generate( + self.config.ollama_model_deep, + system + "\n\n" + prompt, + think=True, + ) + self.breaker.record_success() + except Exception as e: + self.breaker.record_failure() + logger.error("T2 inference failed: %s", e) + + parsed = _parse_json(raw) + result = T2Result( + root_cause=parsed.get("root_cause", "Unable to determine root cause."), + issue_type=_normalize_issue_type(parsed.get("issue_type", "unknown")), + confidence=_normalize_confidence(parsed.get("confidence", "low")), + correlation=parsed.get("correlation", "No correlations identified."), + impact=parsed.get("impact", "Impact unknown."), + recommendation=parsed.get("recommendation", "Investigate manually."), + logql_queries_used=queries, + sentry_worthy=bool(parsed.get("sentry_worthy", False)), + model=self.config.ollama_model_deep, + inference_duration_ms=infer_ms, + logql_gather_duration_ms=gather_ms, + raw_response=raw, + ) + + logger.info( + "T2 complete: confidence=%s sentry=%s gather=%dms infer=%dms queries=%d", + result.confidence, result.sentry_worthy, + gather_ms, infer_ms, len(queries), + ) + return result + + # ── LogQL helpers ──────────────────────────────────────────────────────── + + def _generate_logql_queries( + self, + anomalies: list[dict], + window_minutes: int, + ) -> list[str]: + if not anomalies: + return [] + + # Seed with any suggested_logql from T1 + seeded = [a.get("suggested_logql", "") for a in anomalies if 
a.get("suggested_logql")] + + anomaly_descriptions = "\n".join( + f"- {a.get('id','?')}: {a.get('description','')}" for a in anomalies[:5] + ) + prompt = LOGQL_GEN_SYSTEM + "\n\n" + LOGQL_GEN_PROMPT.format( + anomaly_descriptions=anomaly_descriptions, + window_minutes=window_minutes, + ) + try: + raw, _ = self.ollama.generate( + self.config.ollama_model_fast, + prompt, + think=False, + temperature=0.0, + ) + generated = json.loads(raw) if raw.strip().startswith("[") else [] + if isinstance(generated, list): + # Combine seeded + generated, validate all + combined = seeded + [q for q in generated if isinstance(q, str)] + valid = [q.strip() for q in combined if _valid_logql(q)] + return valid[:5] + except Exception as e: + logger.warning("LogQL gen failed: %s", e) + + # Fall back to seeded only + return [q for q in seeded if _valid_logql(q)][:5] + + def _execute_logql_queries( + self, + queries: list[str], + start_ns: int, + end_ns: int, + ) -> dict[str, list[dict]]: + results = {} + for query in queries: + try: + lines = self.loki.query_lines(query, start_ns, end_ns, limit=50) + results[query] = lines + except Exception as e: + logger.warning("LogQL execute failed (%s): %s", query[:60], e) + results[query] = [] + return results + + +# ── Helpers ────────────────────────────────────────────────────────────────── + +def _parse_json(text: str) -> dict: + """Extract and parse the first JSON object or array from text.""" + if not text: + return {} + # Try direct parse first + text = text.strip() + try: + return json.loads(text) + except json.JSONDecodeError: + pass + # Find first {...} or [...] 
block + for start_char, end_char in [('{', '}'), ('[', ']')]: + start = text.find(start_char) + end = text.rfind(end_char) + if start != -1 and end > start: + try: + return json.loads(text[start:end + 1]) + except json.JSONDecodeError: + pass + return {} + + +def _normalize_anomalies(raw: list) -> list[dict]: + if not isinstance(raw, list): + return [] + valid = [] + for a in raw: + if not isinstance(a, dict): + continue + valid.append({ + "id": str(a.get("id", "unknown"))[:64], + "stream": a.get("stream", "unknown"), + "description": str(a.get("description", ""))[:500], + "severity": a.get("severity", "info") if a.get("severity") in ("info", "warn", "critical") else "info", + "needs_t2": bool(a.get("needs_t2", False)), + "suggested_logql": str(a.get("suggested_logql", ""))[:300], + }) + return valid + + +def _normalize_confidence(v: str) -> str: + return v if v in ("high", "medium", "low") else "low" + + +def _normalize_issue_type(v: str) -> str: + valid = ("error_spike", "config", "regression", "user_behavior", "infra", "unknown") + return v if v in valid else "unknown" + + +def _valid_logql(q: str) -> bool: + q = q.strip() + return bool(q) and q.startswith("{") and "|" in q + + +def _format_timeline_for_prompt(events: list[TimelineEvent], max_events: int = 60) -> str: + """Minimal timeline formatter used by analyst (avoids circular import with TimelineBuilder).""" + if not events: + return "(no timeline events)" + + truncated = len(events) > max_events + shown = events[-max_events:] if truncated else events + + lines = [] + for i, ev in enumerate(shown, 1): + try: + t = ev.ts_iso[11:19] + except (IndexError, TypeError): + t = "??:??:??" + sid = f" session={ev.session_id[:8]}" if ev.session_id else "" + lines.append(f" [{i:03d}] {t} {ev.stream:<25} {ev.event_type}{sid}") + + if truncated: + lines.append(f" [... 
{len(events) - max_events} earlier events not shown]") + + return "\n".join(lines) diff --git a/observability/local/log-sentinel/app.py b/observability/local/log-sentinel/app.py new file mode 100644 index 0000000..c8af441 --- /dev/null +++ b/observability/local/log-sentinel/app.py @@ -0,0 +1,151 @@ +"""Log Sentinel v3 — Flask health/status/trigger + background sentinel loop.""" + +import logging +import threading +import time + +from flask import Flask, jsonify, request + +from config import Config +from loki_handler import LokiHandler +from sentinel import Sentinel + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)-20s %(levelname)-5s %(message)s", +) + +config = Config.from_env() + +# Push process logs to Loki +loki_handler = LokiHandler(config.loki_url, env=config.env_label) +loki_handler.setLevel(logging.INFO) +logging.getLogger().addHandler(loki_handler) + +app = Flask(__name__) +sentinel = Sentinel(config) + + +@app.route("/health", methods=["GET"]) +def health(): + return jsonify({"status": "ok", "service": "log-sentinel", "version": "3.0"}) + + +@app.route("/run", methods=["POST"]) +def manual_run(): + result = sentinel.run_cycle() + return jsonify({ + "status": "ok", + "cycle_id": result.cycle_id, + "cycle_num": result.cycle_num, + "window_minutes": result.window_minutes, + "timeline_event_count": result.timeline_event_count, + "anomaly_count": result.anomaly_count, + "duration_ms": result.duration_ms, + "summary": result.t1.summary if result.t1 else None, + "anomalies": result.t1.anomalies if result.t1 else [], + "evidence_packet_count": len(result.t1.evidence_packets) if result.t1 else 0, + "error": result.error, + }) + + +@app.route("/run_t2", methods=["POST"]) +def manual_run_t2(): + t = threading.Thread(target=sentinel.run_t2_cycle, daemon=True) + t.start() + return jsonify({"status": "accepted", "message": "T2 cycle started in background"}) + + +@app.route("/run_t3", methods=["POST"]) +def manual_run_t3(): + t = 
threading.Thread(target=sentinel.run_t3_cycle, daemon=True) + t.start() + return jsonify({"status": "accepted", "message": "T3 cycle started in background"}) + + +@app.route("/trigger", methods=["POST"]) +def grafana_trigger(): + """Receive Grafana alert webhook. Dedup, parse, and dispatch trigger_cycle().""" + payload = request.get_json(silent=True) or {} + alerts = payload.get("alerts", []) + if not alerts: + return jsonify({"status": "ignored", "reason": "no alerts"}), 200 + + fired_names = [] + now = time.time() + trigger_tier = "t1" + alert_lines = [] + + for alert in alerts: + labels = alert.get("labels", {}) + annotations = alert.get("annotations", {}) + alertname = labels.get("alertname", "unknown") + tier = labels.get("trigger_tier", "t1") + severity = labels.get("severity", "warn") + starts_at = alert.get("startsAt", "") + + # Dedup: skip if same alertname fired within dedup window + last_ts = sentinel._trigger_dedup.get(alertname, 0) + if now - last_ts < config.dedup_window_sec: + continue + + sentinel._trigger_dedup[alertname] = now + fired_names.append(alertname) + if tier == "t2": + trigger_tier = "t2" + + description = annotations.get("description", annotations.get("summary", "")) + alert_lines.append( + f" Alert: {alertname} ({severity})\n" + f" Fired: {starts_at}\n" + f" {description}" + ) + + if not fired_names: + return jsonify({"status": "deduped"}), 200 + + alert_context = "\n".join(alert_lines) + sentinel.loki.push_trigger( + { + "alertname": ",".join(fired_names), + "trigger_tier": trigger_tier, + "alert_count": len(fired_names), + }, + env=config.env_label, + ) + + # Run in background — webhook must return fast + t = threading.Thread( + target=sentinel.trigger_cycle, + args=(alert_context, trigger_tier, fired_names), + daemon=True, + ) + t.start() + + return jsonify({"status": "accepted", "alerts": fired_names, "tier": trigger_tier}), 202 + + +@app.route("/status", methods=["GET"]) +def status(): + return jsonify({ + "version": "3.0", + 
"sentinel_mode": config.sentinel_mode, + "t1_interval_sec": config.t1_interval_sec, + "t2_interval_sec": config.t2_interval_sec, + "t3_interval_sec": config.t3_interval_sec, + "lookback_sec": config.lookback_sec, + "t2_enabled": config.t2_enabled, + "models": {"fast": config.ollama_model_fast, "deep": config.ollama_model_deep}, + "sentry_enabled": sentinel.sentry.enabled, + "stats": sentinel._stats, + "circuit_breakers": { + "loki": sentinel.loki_breaker.state, + "ollama": sentinel.ollama_breaker.state, + }, + }) + + +if __name__ == "__main__": + t = threading.Thread(target=sentinel.start, daemon=True) + t.start() + app.run(host="0.0.0.0", port=8081, debug=False) diff --git a/observability/local/log-sentinel/baseline.py b/observability/local/log-sentinel/baseline.py new file mode 100644 index 0000000..c50b1e0 --- /dev/null +++ b/observability/local/log-sentinel/baseline.py @@ -0,0 +1,229 @@ +"""Baseline manager — rolling stats from Loki → baselines.json. + +T3 calls compute_and_save() to recompute baselines from the Loki window. +T1 calls load() + get_prompt_context() to inject baseline values into its prompt. +T3 calls get_threshold_recommendations() to surface T0 alert calibration suggestions. + +No ML, no LLM — simple rolling math (mean, count rates, p95 where sample size allows). 
+""" + +import json +import logging +import os +import statistics +from datetime import datetime, timezone + +from loki_client import LokiClient + +logger = logging.getLogger("sentinel.baseline") + +DEFAULT_PATH = "/data/baselines.json" + +# Metric definitions: key, logql, how to compute the value +_METRICS = [ + { + "key": "sim_steward.error_rate.per_min", + "logql": '{app="sim-steward"} | json | level="ERROR"', + "compute": "rate_per_min", + "description": "ERROR log rate (per minute)", + }, + { + "key": "sim_steward.action_count.per_session", + "logql": '{app="sim-steward"} | json | event="action_dispatched"', + "compute": "count_per_session", + "description": "Actions dispatched per iRacing session", + }, + { + "key": "sim_steward.websocket_disconnect.per_hour", + "logql": '{app="sim-steward"} | json | event="websocket_disconnect"', + "compute": "rate_per_hour", + "description": "WebSocket disconnects per hour", + }, + { + "key": "claude.cost_per_session.mean_usd", + "logql": '{app="claude-token-metrics"} | json', + "compute": "field_mean", + "field": "cost_usd", + "description": "Mean Claude session cost (USD)", + }, + { + "key": "claude.tool_calls.per_session", + "logql": '{app="claude-dev-logging"} | json | event="tool_use"', + "compute": "count_per_session", + "description": "Tool calls per Claude session", + }, + { + "key": "claude.error_rate.per_min", + "logql": '{app="claude-dev-logging"} | json | level="ERROR"', + "compute": "rate_per_min", + "description": "Claude session ERROR rate (per minute)", + }, +] + +# Known T0 alert thresholds for recommendation comparison +# Format: alert_name → (baseline_key, window_minutes, current_threshold) +_ALERT_MAPPINGS = [ + ("error-spike-general", "sim_steward.error_rate.per_min", 10, 10), + ("claude-error-spike", "claude.error_rate.per_min", 5, 5), + ("websocket-disconnect-spike", "sim_steward.websocket_disconnect.per_hour", 5, 3), +] + + +class BaselineManager: + def __init__(self, loki: LokiClient, baseline_path: 
str = DEFAULT_PATH): + self.loki = loki + self.path = baseline_path + self._cache: dict = {} + + def load(self) -> dict: + """Load baselines.json from disk. Returns empty dict if not found.""" + try: + if os.path.exists(self.path): + with open(self.path) as f: + self._cache = json.load(f) + logger.info("Loaded baselines from %s (%d metrics)", self.path, len(self._cache)) + else: + logger.info("No baselines.json at %s — starting fresh", self.path) + self._cache = {} + except Exception as e: + logger.warning("Failed to load baselines: %s", e) + self._cache = {} + return self._cache + + def compute_and_save(self, lookback_sec: int = 86400) -> dict: + """ + Query Loki over the lookback window, compute rolling metrics, write baselines.json. + Preserves existing values for metrics where no new data is found. + """ + end_ns = self.loki.now_ns() + start_ns = end_ns - lookback_sec * 1_000_000_000 + updated = dict(self._cache) + computed_count = 0 + + for metric in _METRICS: + try: + value = self._compute_metric(metric, start_ns, end_ns, lookback_sec) + if value is not None: + updated[metric["key"]] = round(value, 4) + computed_count += 1 + logger.debug("Baseline %s = %.4f", metric["key"], value) + except Exception as e: + logger.warning("Baseline compute failed for %s: %s", metric["key"], e) + + # Persist + try: + dirpath = os.path.dirname(os.path.abspath(self.path)) + os.makedirs(dirpath, exist_ok=True) + with open(self.path, "w") as f: + json.dump(updated, f, indent=2) + self._cache = updated + logger.info( + "Baselines saved to %s (%d computed, %d total)", + self.path, computed_count, len(updated), + ) + except Exception as e: + logger.warning("Failed to save baselines: %s", e) + + return updated + + def get_prompt_context(self) -> str: + """Format baseline values for injection into T1 LLM prompt.""" + if not self._cache: + return "(no baseline data available yet — first run or no historical data)" + + lines = ["Historical baseline for this system (use these to judge 
what is anomalous):"] + for key, value in sorted(self._cache.items()): + metric = next((m for m in _METRICS if m["key"] == key), None) + description = metric["description"] if metric else key.replace(".", " | ").replace("_", " ") + lines.append(f" {description}: {value}") + lines.append( + "Flag metrics that exceed baselines by 3x or more as anomalous. " + "Use these values to calibrate 'high', 'normal', and 'low' thresholds." + ) + return "\n".join(lines) + + def get_threshold_recommendations(self) -> list[dict]: + """ + Compare computed baselines against known T0 alert thresholds. + Returns recommendation dicts for alerts that appear mis-calibrated. + Emitted by T3 as sentinel_threshold_recommendation events. + """ + if not self._cache: + return [] + + recommendations = [] + for alert_name, baseline_key, window_minutes, current_threshold in _ALERT_MAPPINGS: + baseline_val = self._cache.get(baseline_key) + if baseline_val is None: + continue + + # Suggested threshold: 5x the baseline rate scaled to the alert window + suggested = round(baseline_val * window_minutes * 5, 1) + if suggested <= 0: + continue + + delta_pct = abs(suggested - current_threshold) / max(current_threshold, 0.001) + if delta_pct < 0.25: + continue # Less than 25% difference — not worth recommending + + recommendations.append({ + "alert": alert_name, + "current_threshold": current_threshold, + "suggested_threshold": suggested, + "basis": ( + f"{baseline_key}={baseline_val:.3f}/min × {window_minutes}min window × 5x safety margin" + ), + "confidence": min(0.9, 0.5 + delta_pct * 0.2), + "direction": "lower" if suggested < current_threshold else "higher", + }) + + return recommendations + + # ── Private ─────────────────────────────────────────────────────────── + + def _compute_metric( + self, metric: dict, start_ns: int, end_ns: int, lookback_sec: int + ) -> float | None: + lines = self.loki.query_lines(metric["logql"], start_ns, end_ns, limit=1000) + if not lines: + return None + + compute = 
metric.get("compute", "count") + + if compute == "rate_per_min": + minutes = lookback_sec / 60 + return len(lines) / minutes if minutes > 0 else None + + elif compute == "rate_per_hour": + hours = lookback_sec / 3600 + return len(lines) / hours if hours > 0 else None + + elif compute == "count_per_session": + # Group by session_id, compute mean count per session + sessions: dict[str, int] = {} + no_session = 0 + for line in lines: + sid = line.get("session_id") + if sid: + sessions[sid] = sessions.get(sid, 0) + 1 + else: + no_session += 1 + if sessions: + return statistics.mean(sessions.values()) + # Fallback: total / estimated sessions (assume 1 session per hour) + estimated_sessions = max(1, lookback_sec / 3600) + return len(lines) / estimated_sessions + + elif compute == "field_mean": + field = metric.get("field", "") + values = [] + for line in lines: + v = line.get(field) + try: + values.append(float(v)) + except (TypeError, ValueError): + pass + return statistics.mean(values) if values else None + + else: + return float(len(lines)) diff --git a/observability/local/log-sentinel/circuit_breaker.py b/observability/local/log-sentinel/circuit_breaker.py new file mode 100644 index 0000000..cedec1d --- /dev/null +++ b/observability/local/log-sentinel/circuit_breaker.py @@ -0,0 +1,51 @@ +"""Circuit breaker for dependency health (Loki, Ollama).""" + +import logging +import time + +logger = logging.getLogger("sentinel.circuit") + + +class CircuitBreaker: + """Track consecutive failures and skip calls during backoff.""" + + CLOSED = "closed" + OPEN = "open" + HALF_OPEN = "half_open" + + def __init__(self, name: str, failure_threshold: int = 3, backoff_sec: int = 60): + self.name = name + self.failure_threshold = failure_threshold + self.backoff_sec = backoff_sec + self.state = self.CLOSED + self.consecutive_failures = 0 + self.last_failure_time = 0.0 + + def allow_request(self) -> bool: + if self.state == self.CLOSED: + return True + if self.state == self.OPEN: + if 
time.time() - self.last_failure_time >= self.backoff_sec: + self.state = self.HALF_OPEN + logger.info("Circuit %s half-open, trying one request", self.name) + return True + return False + # HALF_OPEN — allow one probe + return True + + def record_success(self): + if self.state != self.CLOSED: + logger.info("Circuit %s closed (recovered)", self.name) + self.state = self.CLOSED + self.consecutive_failures = 0 + + def record_failure(self): + self.consecutive_failures += 1 + self.last_failure_time = time.time() + if self.consecutive_failures >= self.failure_threshold: + if self.state != self.OPEN: + logger.warning( + "Circuit %s OPEN after %d failures, backing off %ds", + self.name, self.consecutive_failures, self.backoff_sec, + ) + self.state = self.OPEN diff --git a/observability/local/log-sentinel/config.py b/observability/local/log-sentinel/config.py new file mode 100644 index 0000000..2ea06a1 --- /dev/null +++ b/observability/local/log-sentinel/config.py @@ -0,0 +1,35 @@ +"""Configuration from environment variables.""" + +import os + + +class Config: + def __init__(self): + self.loki_url = os.environ.get("LOKI_URL", "http://loki:3100") + self.grafana_url = os.environ.get("GRAFANA_URL", "http://grafana:3000") + self.grafana_user = os.environ.get("GRAFANA_USER", "admin") + self.grafana_password = os.environ.get("GRAFANA_PASSWORD", "admin") + self.ollama_url = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434") + self.ollama_model_fast = os.environ.get("OLLAMA_MODEL_FAST", "qwen3:8b") + self.ollama_model_deep = os.environ.get("OLLAMA_MODEL_DEEP", "qwen3:32b") + self.poll_interval_sec = int(os.environ.get("SENTINEL_POLL_INTERVAL_SEC", "60")) + self.lookback_sec = int(os.environ.get("SENTINEL_LOOKBACK_SEC", "300")) + self.t2_enabled = os.environ.get("SENTINEL_T2_ENABLED", "true").lower() == "true" + self.t2_proactive_interval_sec = int(os.environ.get("SENTINEL_T2_PROACTIVE_INTERVAL_SEC", "300")) + self.dedup_window_sec = 
int(os.environ.get("SENTINEL_DEDUP_WINDOW_SEC", "300")) + self.env_label = os.environ.get("SIMSTEWARD_LOG_ENV", "local") + self.sentry_dsn = os.environ.get("SENTINEL_SENTRY_DSN", "") + # v3 additions + self.sentinel_mode = os.environ.get("SENTINEL_MODE", "dev") # "dev" | "prod" + self.t1_interval_sec = int(os.environ.get("SENTINEL_T1_INTERVAL_SEC", "300")) # 5 min + self.t2_interval_sec = int(os.environ.get("SENTINEL_T2_INTERVAL_SEC", "900")) # 15 min + self.t3_interval_sec = int(os.environ.get("SENTINEL_T3_INTERVAL_SEC", "7200")) # 2h (dev default) + self.merge_window_sec = int(os.environ.get("SENTINEL_MERGE_WINDOW_SEC", "10")) # T0 batch window + self.sentry_auth_token = os.environ.get("SENTRY_AUTH_TOKEN", "") + self.sentry_org = os.environ.get("SENTRY_ORG", "") + self.sentry_project = os.environ.get("SENTRY_PROJECT", "") + self.baseline_path = os.environ.get("SENTINEL_BASELINE_PATH", "/data/baselines.json") + + @classmethod + def from_env(cls): + return cls() diff --git a/observability/local/log-sentinel/evidence.py b/observability/local/log-sentinel/evidence.py new file mode 100644 index 0000000..1a9199a --- /dev/null +++ b/observability/local/log-sentinel/evidence.py @@ -0,0 +1,235 @@ +"""Evidence packet model — pre-assembles log context for T2 consumption. + +T1 identifies an anomaly, then EvidenceBuilder: + 1. Finds which feature invocations contain the anomalous signal + 2. Builds a targeted LogQL query + 3. Pre-fetches up to 50 related log lines from Loki + 4. Packages everything into an EvidencePacket ready for T2 + +T2 receives EvidencePackets — it reasons over pre-assembled evidence, +not raw Loki queries. This dramatically improves T2 output quality. 
+""" + +import logging +import time +import uuid +from dataclasses import dataclass, field + +from loki_client import LokiClient +from trace import FeatureInvocation + +logger = logging.getLogger("sentinel.evidence") + +_MAX_LOG_LINES = 50 + + +@dataclass +class EvidencePacket: + anomaly_id: str + anomaly_description: str + severity: str # "info" | "warn" | "critical" + detector_stream: str # which stream flagged it + invocations: list[FeatureInvocation] # invocations containing the anomaly + related_log_lines: list[dict] # pre-fetched raw log lines (capped at 50) + suggested_logql: str # T1's suggested query for T2 to refine + t1_hypothesis: str # T1's one-sentence best-guess root cause + t1_confidence: float # 0.0 to 1.0 + assembled_at_ns: int + logql_used: str # the actual query used to fetch related_log_lines + + def to_loki_dict(self) -> dict: + """Serializable dict for push to Loki as sentinel_evidence_packet event.""" + return { + "event": "sentinel_evidence_packet", + "component": "log-sentinel", + "domain": "system", + "level": "WARN" if self.severity in ("warn", "critical") else "INFO", + "message": f"[{self.severity.upper()}] {self.anomaly_description[:120]}", + "anomaly_id": self.anomaly_id, + "anomaly_description": self.anomaly_description, + "severity": self.severity, + "detector_stream": self.detector_stream, + "t1_hypothesis": self.t1_hypothesis, + "t1_confidence": self.t1_confidence, + "suggested_logql": self.suggested_logql, + "logql_used": self.logql_used, + "related_lines_count": len(self.related_log_lines), + "invocation_count": len(self.invocations), + "invocation_ids": [inv.invocation_id for inv in self.invocations], + "action_types": list({inv.action_type for inv in self.invocations}), + "assembled_at_ns": self.assembled_at_ns, + } + + def to_prompt_text(self) -> str: + """Format evidence packet as text block for LLM (T2) consumption.""" + lines = [ + f"=== EVIDENCE PACKET {self.anomaly_id} ===", + f"Severity: {self.severity.upper()}", + 
f"Stream: {self.detector_stream}", + f"Anomaly: {self.anomaly_description}", + f"T1 hypothesis: {self.t1_hypothesis or '(none)'}", + f"T1 confidence: {self.t1_confidence:.0%}", + "", + ] + + if self.invocations: + lines.append(f"Affected invocations ({len(self.invocations)}):") + for inv in self.invocations[:5]: + status = "FAILED" if inv.success is False else ("OK" if inv.success else "?") + lines.append( + f" [{status}] {inv.action_type} via {inv.correlation_method} " + f"({inv.duration_ms}ms, {len(inv.events)} events)" + ) + if inv.error: + lines.append(f" error: {inv.error}") + lines.append("") + + if self.related_log_lines: + lines.append(f"Related log lines ({len(self.related_log_lines)}, capped at {_MAX_LOG_LINES}):") + for log in self.related_log_lines[:_MAX_LOG_LINES]: + ts = log.get("timestamp", "")[:19] + evt = log.get("event", log.get("message", ""))[:60] + lvl = log.get("level", "") + err = log.get("error", "") + suffix = f" error={err[:60]}" if err else "" + lines.append(f" {ts} [{lvl}] {evt}{suffix}") + lines.append("") + + lines.append(f"Suggested LogQL for deeper investigation: {self.suggested_logql}") + return "\n".join(lines) + + +class EvidenceBuilder: + """Assembles EvidencePackets from T1 anomaly signals + feature invocations.""" + + def __init__(self, loki: LokiClient): + self.loki = loki + + def build( + self, + anomaly: dict, + invocations: list[FeatureInvocation], + start_ns: int, + end_ns: int, + ) -> EvidencePacket: + """ + Build an EvidencePacket for a single T1 anomaly. 
+ + anomaly dict shape (from T1 LLM output): + id, description, severity, stream, event_type, + hypothesis, confidence, suggested_logql, trace_id + """ + anomaly_id = anomaly.get("id") or str(uuid.uuid4())[:8] + stream = anomaly.get("stream", "sim-steward") + event_type = anomaly.get("event_type", "") + + relevant = self._find_relevant_invocations(anomaly, invocations) + logql = self._build_logql(anomaly, relevant, stream, event_type) + + try: + lines = self.loki.query_lines(logql, start_ns, end_ns, limit=_MAX_LOG_LINES) + except Exception as e: + logger.warning("EvidenceBuilder Loki query failed: %s", e) + lines = [] + + suggested = anomaly.get("suggested_logql") or logql + + return EvidencePacket( + anomaly_id=anomaly_id, + anomaly_description=anomaly.get("description", anomaly.get("title", "")), + severity=anomaly.get("severity", "warn"), + detector_stream=stream, + invocations=relevant, + related_log_lines=lines, + suggested_logql=suggested, + t1_hypothesis=anomaly.get("hypothesis", ""), + t1_confidence=float(anomaly.get("confidence", 0.5)), + assembled_at_ns=int(time.time() * 1e9), + logql_used=logql, + ) + + def build_many( + self, + anomalies: list[dict], + invocations: list[FeatureInvocation], + start_ns: int, + end_ns: int, + ) -> list[EvidencePacket]: + """Build evidence packets for all anomalies. 
Skips on error.""" + packets = [] + for anomaly in anomalies: + try: + packet = self.build(anomaly, invocations, start_ns, end_ns) + packets.append(packet) + except Exception as e: + logger.warning("Failed to build evidence for anomaly %s: %s", anomaly.get("id", "?"), e) + return packets + + # ── Private ─────────────────────────────────────────────────────────── + + def _find_relevant_invocations( + self, anomaly: dict, invocations: list[FeatureInvocation] + ) -> list[FeatureInvocation]: + """Find invocations that contain signals matching this anomaly.""" + # Tier 1: exact trace_id match + trace_id = anomaly.get("trace_id") + if trace_id: + matches = [inv for inv in invocations if inv.invocation_id == trace_id] + if matches: + return matches + + # Tier 2: invocations containing an event of the matching type/stream + anomaly_event = anomaly.get("event_type", "") + anomaly_stream = anomaly.get("stream", "") + anomaly_severity = anomaly.get("severity", "") + + relevant = [] + for inv in invocations: + for ev in inv.events: + stream_match = anomaly_stream and ev.stream == anomaly_stream + event_match = anomaly_event and ev.event_type == anomaly_event + error_match = anomaly_severity == "critical" and ( + ev.raw.get("level", "").upper() == "ERROR" or ev.raw.get("error") + ) + if stream_match or event_match or error_match: + relevant.append(inv) + break + + if relevant: + return relevant + + # Tier 3: failed invocations (best-effort for error anomalies) + failed = [inv for inv in invocations if inv.success is False] + if failed: + return failed[:3] + + # Fallback: first 3 invocations + return invocations[:3] + + def _build_logql( + self, + anomaly: dict, + invocations: list[FeatureInvocation], + stream: str, + event_type: str, + ) -> str: + """Build a targeted LogQL query for fetching related log lines.""" + # Prefer trace_id query if available + trace_ids = [ + inv.invocation_id + for inv in invocations + if inv.correlation_method == "trace_id" + ] + if len(trace_ids) 
== 1: + return f'{{app="{stream}"}} | json | trace_id="{trace_ids[0]}"' + + # Event-type query + if event_type: + return f'{{app="{stream}"}} | json | event="{event_type}"' + + # Severity-based fallback + severity = anomaly.get("severity", "warn") + if severity == "critical": + return f'{{app="{stream}"}} | json | level="ERROR"' + + return f'{{app="{stream}"}} | json' diff --git a/observability/local/log-sentinel/grafana_client.py b/observability/local/log-sentinel/grafana_client.py new file mode 100644 index 0000000..cc065c5 --- /dev/null +++ b/observability/local/log-sentinel/grafana_client.py @@ -0,0 +1,64 @@ +"""Grafana HTTP API client for annotations.""" + +import logging +import time + +import requests + +logger = logging.getLogger("sentinel.grafana") + + +class GrafanaClient: + def __init__(self, base_url: str, user: str = "admin", password: str = "admin"): + self.base_url = base_url.rstrip("/") + self.auth = (user, password) + + def annotate(self, finding): + try: + requests.post( + f"{self.base_url}/api/annotations", + auth=self.auth, + json={ + "time": int(time.time() * 1000), + "tags": ["log-sentinel", finding.detector, finding.severity, finding.category], + "text": f"[{finding.severity.upper()}] {finding.title}
{finding.summary}", + }, + timeout=5, + ) + except Exception as e: + logger.debug("Grafana annotation error: %s", e) + + def annotate_investigation(self, investigation): + try: + requests.post( + f"{self.base_url}/api/annotations", + auth=self.auth, + json={ + "time": int(time.time() * 1000), + "tags": ["log-sentinel", "investigation", investigation.finding.detector, investigation.confidence, investigation.trigger], + "text": ( + f"Investigation: {investigation.finding.title}
" + f"Root cause: {investigation.root_cause}
" + f"Recommendation: {investigation.recommendation}
" + f"Confidence: {investigation.confidence} | Model: {investigation.model} | Type: {investigation.issue_type}" + ), + }, + timeout=5, + ) + except Exception as e: + logger.debug("Grafana investigation annotation error: %s", e) + + def annotate_raw(self, title: str, text: str, tags: list[str]): + try: + requests.post( + f"{self.base_url}/api/annotations", + auth=self.auth, + json={ + "time": int(time.time() * 1000), + "tags": ["log-sentinel"] + [t for t in tags if t], + "text": f"{title}
{text}", + }, + timeout=5, + ) + except Exception as e: + logger.debug("Grafana annotate_raw error: %s", e) diff --git a/observability/local/log-sentinel/loki_client.py b/observability/local/log-sentinel/loki_client.py new file mode 100644 index 0000000..7de2c9e --- /dev/null +++ b/observability/local/log-sentinel/loki_client.py @@ -0,0 +1,330 @@ +"""Loki HTTP API client — query + push, with structured sentinel event helpers.""" + +import json +import logging +import time +from datetime import datetime, timezone + +import requests + +logger = logging.getLogger("sentinel.loki") + + +class LokiClient: + def __init__(self, base_url: str, timeout: int = 5): + self.base_url = base_url.rstrip("/") + self.timeout = timeout + + # ── Time helpers ── + + @staticmethod + def now_ns() -> int: + return int(datetime.now(timezone.utc).timestamp() * 1e9) + + @staticmethod + def now_minus_ms(offset_ms: int) -> int: + return int((datetime.now(timezone.utc).timestamp() * 1000 - offset_ms) * 1e6) + + # ── Query API ── + + def count(self, logql: str, start_ns: int, end_ns: int) -> int: + try: + resp = requests.get( + f"{self.base_url}/loki/api/v1/query_range", + params={"query": logql, "start": str(start_ns), "end": str(end_ns), "limit": 1000, "direction": "forward"}, + timeout=self.timeout, + ) + if resp.status_code != 200: + return -1 + total = 0 + for stream in resp.json().get("data", {}).get("result", []): + total += len(stream.get("values", [])) + return total + except Exception as e: + logger.warning("Loki count error: %s", e) + return -1 + + def query_lines(self, logql: str, start_ns: int, end_ns: int, limit: int = 1000) -> list[dict]: + try: + resp = requests.get( + f"{self.base_url}/loki/api/v1/query_range", + params={"query": logql, "start": str(start_ns), "end": str(end_ns), "limit": limit, "direction": "forward"}, + timeout=self.timeout, + ) + if resp.status_code != 200: + return [] + lines = [] + for stream in resp.json().get("data", {}).get("result", []): + for pair in 
stream.get("values", []): + if len(pair) >= 2: + try: + lines.append(json.loads(pair[1])) + except (json.JSONDecodeError, TypeError): + pass + return lines + except Exception: + return [] + + # ── Push API ── + + def push(self, entry: dict, env: str = "local"): + """Push a single log entry to Loki. Fire-and-forget.""" + try: + ts_ns = str(int(time.time() * 1e9)) + stream_labels = {"app": "sim-steward", "env": env, "level": entry.get("level", "INFO")} + for key in ("component", "event", "domain"): + val = entry.get(key) + if val: + stream_labels[key] = val + payload = {"streams": [{"stream": stream_labels, "values": [[ts_ns, json.dumps(entry)]]}]} + requests.post(f"{self.base_url}/loki/api/v1/push", json=payload, timeout=3) + except Exception as e: + logger.debug("Loki push error: %s", e) + + # ── Sentinel event helpers ── + + def push_finding(self, finding, env: str = "local"): + entry = { + "level": "WARN" if finding.severity in ("warn", "critical") else "INFO", + "message": finding.title, + "timestamp": finding.timestamp, + "component": "log-sentinel", + "event": "sentinel_finding", + "domain": "system", + "finding_id": finding.finding_id, + "detector": finding.detector, + "category": finding.category, + "severity": finding.severity, + "title": finding.title, + "summary": finding.summary, + "fingerprint": finding.fingerprint, + "escalated_to_t2": finding.escalate_to_t2, + "logql_query": finding.logql_query, + "flow_context": finding.flow_context, + **finding.evidence, + } + self.push(entry, env) + + def push_investigation(self, investigation, env: str = "local"): + entry = { + "level": "INFO", + "message": f"Investigation: {investigation.root_cause[:120]}", + "timestamp": investigation.timestamp, + "component": "log-sentinel", + "event": "sentinel_investigation", + "domain": "system", + "investigation_id": investigation.investigation_id, + "finding_id": investigation.finding.finding_id, + "detector": investigation.finding.detector, + "category": 
investigation.finding.category, + "trigger": investigation.trigger, + "model": investigation.model, + "confidence": investigation.confidence, + "issue_type": investigation.issue_type, + "root_cause": investigation.root_cause, + "correlation": investigation.correlation, + "impact": investigation.impact, + "recommendation": investigation.recommendation, + "inference_duration_ms": investigation.inference_duration_ms, + "gather_duration_ms": investigation.gather_duration_ms, + "context_lines_gathered": investigation.context_lines_gathered, + } + self.push(entry, env) + + def push_cycle(self, cycle_data: dict, env: str = "local"): + anomaly_count = cycle_data.get("anomaly_count", cycle_data.get("finding_count", 0)) + entry = { + "level": "INFO", + "message": f"Cycle #{cycle_data['cycle_num']}: {anomaly_count} anomalies", + "component": "log-sentinel", + "event": "sentinel_cycle", + "domain": "system", + **cycle_data, + } + self.push(entry, env) + + def push_detector_run(self, run_data: dict, env: str = "local"): + entry = { + "level": "ERROR" if run_data.get("error") else "INFO", + "message": f"Detector {run_data['detector']}: {run_data['finding_count']} findings in {run_data['duration_ms']}ms", + "component": "log-sentinel", + "event": "sentinel_detector_run", + "domain": "system", + **run_data, + } + self.push(entry, env) + + def push_t2_run(self, t2_data: dict, env: str = "local"): + entry = { + "level": "INFO", + "message": f"T2 {t2_data['tier']}: {t2_data['model']} confidence={t2_data.get('confidence', '?')} in {t2_data.get('total_duration_ms', '?')}ms", + "component": "log-sentinel", + "event": "sentinel_t2_run", + "domain": "system", + **t2_data, + } + self.push(entry, env) + + def push_analyst_run(self, run_data: dict, env: str = "local"): + tier = run_data.get("tier", "t1") + entry = { + "level": "INFO", + "message": f"Analyst {tier}: model={run_data.get('model','?')} anomalies={run_data.get('anomaly_count', run_data.get('logql_queries_generated', '?'))} 
duration={run_data.get('duration_ms','?')}ms", + "component": "log-sentinel", + "event": "sentinel_analyst_run", + "domain": "system", + **run_data, + } + self.push(entry, env) + + def push_timeline(self, timeline_data: dict, env: str = "local"): + entry = { + "level": "INFO", + "message": f"Timeline: {timeline_data.get('event_count', 0)} events, {timeline_data.get('session_count', 0)} sessions", + "component": "log-sentinel", + "event": "sentinel_timeline_built", + "domain": "system", + **timeline_data, + } + self.push(entry, env) + + def push_investigation_v2(self, t2_result, anomalies: list, env: str = "local"): + from analyst import T2Result + entry = { + "level": "INFO", + "message": f"Investigation [{t2_result.confidence}]: {t2_result.root_cause[:120]}", + "component": "log-sentinel", + "event": "sentinel_investigation", + "domain": "system", + "anomaly_ids": [a.get("id", "") for a in anomalies if a.get("needs_t2")], + "root_cause": t2_result.root_cause, + "issue_type": t2_result.issue_type, + "confidence": t2_result.confidence, + "correlation": t2_result.correlation, + "impact": t2_result.impact, + "recommendation": t2_result.recommendation, + "logql_queries_used": t2_result.logql_queries_used, + "logql_gather_duration_ms": t2_result.logql_gather_duration_ms, + "inference_duration_ms": t2_result.inference_duration_ms, + "sentry_worthy": t2_result.sentry_worthy, + "model": t2_result.model, + } + self.push(entry, env) + + def annotate_raw(self, *args, **kwargs): + """Stub — annotate_raw is called on grafana_client, not loki_client.""" + pass + + def push_sentry_event(self, sentry_data: dict, env: str = "local"): + entry = { + "level": "INFO", + "message": f"Sentry issue: {sentry_data.get('title', '?')[:100]}", + "component": "log-sentinel", + "event": "sentinel_sentry_issue", + "domain": "system", + **sentry_data, + } + self.push(entry, env) + + # ── v3 push helpers ────────────────────────────────────────────────────── + + def push_evidence_packet(self, 
packet, env: str = "local"): + """Push sentinel_evidence_packet — T1's pre-assembled anomaly context.""" + entry = packet.to_loki_dict() + self.push(entry, env) + + def push_t2_investigation(self, t2_result, packet_dicts: list, env: str = "local"): + """Push sentinel_t2_investigation — T2's investigation result.""" + entry = { + "level": "INFO", + "message": f"T2 investigation [{t2_result.confidence}]: {t2_result.root_cause[:120]}", + "component": "log-sentinel", + "event": "sentinel_t2_investigation", + "domain": "system", + "root_cause": t2_result.root_cause, + "issue_type": t2_result.issue_type, + "confidence": t2_result.confidence, + "correlation": t2_result.correlation, + "impact": t2_result.impact, + "recommendation": t2_result.recommendation, + "sentry_worthy": t2_result.sentry_worthy, + "sentry_fingerprint": t2_result.sentry_fingerprint, + "sentry_event_id": t2_result.sentry_event_id or "", + "evidence_packet_count": t2_result.evidence_packet_count, + "anomaly_ids": [p.get("anomaly_id", "") for p in packet_dicts], + "logql_queries_used": t2_result.logql_queries_used, + "logql_gather_duration_ms": t2_result.logql_gather_duration_ms, + "inference_duration_ms": t2_result.inference_duration_ms, + "model": t2_result.model, + } + self.push(entry, env) + + def push_synthesis(self, t3_result, trigger: str = "scheduled", env: str = "local"): + """Push sentinel_synthesis — T3's period synthesis summary.""" + entry = { + "level": "INFO", + "message": f"T3 synthesis [{trigger}]: {t3_result.sessions_analyzed} sessions, " + f"{len(t3_result.recurring_patterns)} patterns", + "component": "log-sentinel", + "event": "sentinel_synthesis", + "domain": "system", + "trigger": trigger, + "period_summary": t3_result.period_summary[:500], + "sessions_analyzed": t3_result.sessions_analyzed, + "features_worked": t3_result.features_worked, + "features_failed": t3_result.features_failed, + "recurring_pattern_count": len(t3_result.recurring_patterns), + "regression_detected": 
t3_result.regression_detected, + "regression_detail": t3_result.regression_detail[:200], + "action_items": t3_result.action_items[:5], + "baselines_updated": t3_result.baselines_updated, + "threshold_recommendation_count": len(t3_result.threshold_recommendations), + "model": t3_result.model, + "inference_duration_ms": t3_result.inference_duration_ms, + } + self.push(entry, env) + + def push_narrative(self, narrative_dict: dict, env: str = "local"): + """Push sentinel_narrative — T3's per-session story.""" + entry = { + "level": "INFO", + "message": f"Session narrative: {narrative_dict.get('session_id', '?')[:12]}", + "component": "log-sentinel", + "event": "sentinel_narrative", + "domain": "system", + "session_id": narrative_dict.get("session_id", ""), + "narrative_text": narrative_dict.get("narrative_text", "")[:1000], + "features_worked": narrative_dict.get("features_worked", []), + "features_failed": narrative_dict.get("features_failed", []), + "invocation_count": narrative_dict.get("invocation_count", 0), + } + self.push(entry, env) + + def push_threshold_recommendation(self, rec: dict, env: str = "local"): + """Push sentinel_threshold_recommendation — T3's threshold calibration advice.""" + entry = { + "level": "INFO", + "message": ( + f"Threshold recommendation: {rec.get('alert', '?')} " + f"current={rec.get('current_threshold')} → suggested={rec.get('suggested_threshold')} " + f"({rec.get('direction', '?')})" + ), + "component": "log-sentinel", + "event": "sentinel_threshold_recommendation", + "domain": "system", + **rec, + } + self.push(entry, env) + + def push_trigger(self, alert_data: dict, env: str = "local"): + """Push sentinel_trigger — per T0 webhook alert received.""" + entry = { + "level": "INFO", + "message": f"Trigger: {alert_data.get('alertname', '?')} [{alert_data.get('trigger_tier', '?')}]", + "component": "log-sentinel", + "event": "sentinel_trigger", + "domain": "system", + "trigger_source": "grafana_alert", + **alert_data, + } + 
self.push(entry, env) diff --git a/observability/local/log-sentinel/loki_handler.py b/observability/local/log-sentinel/loki_handler.py new file mode 100644 index 0000000..e8a9dcd --- /dev/null +++ b/observability/local/log-sentinel/loki_handler.py @@ -0,0 +1,66 @@ +"""Python logging handler that pushes log records to Loki.""" + +import json +import logging +import time +import threading + +import requests + + +class LokiHandler(logging.Handler): + def __init__(self, loki_url: str, env: str = "local", flush_interval: float = 2.0): + super().__init__() + self.loki_url = loki_url.rstrip("/") + self.env = env + self.flush_interval = flush_interval + self._buffer = [] + self._lock = threading.Lock() + self._start_flush_timer() + + def _start_flush_timer(self): + self._timer = threading.Timer(self.flush_interval, self._flush_loop) + self._timer.daemon = True + self._timer.start() + + def _flush_loop(self): + self._flush() + self._start_flush_timer() + + def emit(self, record: logging.LogRecord): + try: + entry = { + "level": record.levelname, + "message": self.format(record), + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime(record.created)), + "component": "log-sentinel", + "event": "sentinel_log", + "domain": "system", + "logger": record.name, + "func": record.funcName, + } + with self._lock: + self._buffer.append(entry) + except Exception: + self.handleError(record) + + def _flush(self): + with self._lock: + if not self._buffer: + return + entries = self._buffer[:] + self._buffer.clear() + by_level = {} + for e in entries: + by_level.setdefault(e["level"], []).append(e) + streams = [] + for level, group in by_level.items(): + values = [[str(int(time.time() * 1e9)), json.dumps(e)] for e in group] + streams.append({ + "stream": {"app": "sim-steward", "env": self.env, "level": level, "component": "log-sentinel", "event": "sentinel_log", "domain": "system"}, + "values": values, + }) + try: + requests.post(f"{self.loki_url}/loki/api/v1/push", 
json={"streams": streams}, timeout=3) + except Exception: + pass diff --git a/observability/local/log-sentinel/narrative.py b/observability/local/log-sentinel/narrative.py new file mode 100644 index 0000000..72e3a9b --- /dev/null +++ b/observability/local/log-sentinel/narrative.py @@ -0,0 +1,214 @@ +"""Session narrative builder — used by T3 synthesis. + +Turns a set of FeatureInvocations + T1/T2 findings into a human-readable +per-session story that answers: "What was the user trying to do, did it work?" + +Output shape (returned as text block): + NARRATIVE: [] + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + + <2-3 sentence prose of what happened> + + WORKED: · + FAILED: (error) + PATTERNS: + ACTION: +""" + +import logging +from datetime import datetime, timezone + +from trace import FeatureInvocation + +logger = logging.getLogger("sentinel.narrative") + + +class NarrativeBuilder: + def build( + self, + session_id: str, + invocations: list[FeatureInvocation], + anomaly_dicts: list[dict], + t2_investigation_dicts: list[dict], + pattern_matches: list[str] | None = None, + ) -> str: + """Build a narrative text block for a single session.""" + if not invocations: + return f"NARRATIVE: session={session_id}\n (no feature invocations recorded)" + + # Time range + start_ns = min(inv.start_ts_ns for inv in invocations) + end_ns = max(inv.end_ts_ns for inv in invocations) + start_dt = datetime.fromtimestamp(start_ns / 1e9, tz=timezone.utc) + end_dt = datetime.fromtimestamp(end_ns / 1e9, tz=timezone.utc) + date_str = start_dt.strftime("%Y-%m-%d") + time_range = f"{start_dt.strftime('%H:%M')}–{end_dt.strftime('%H:%M')}" + + # Classify worked vs failed + worked = [] + failed = [] + for inv in invocations: + if inv.success is False: + failed.append(inv) + elif inv.success is True: + worked.append(inv) + # success=None (unknown) counted as neither + + # Build prose summary + prose = self._build_prose(invocations, worked, failed, anomaly_dicts) + + # Recommendation from T2 
investigations or anomalies + action = self._extract_action(t2_investigation_dicts, anomaly_dicts) + + # Pattern summary + patterns_text = "" + if pattern_matches: + patterns_text = " · ".join(pattern_matches[:3]) + elif _has_recurring_issue(anomaly_dicts): + patterns_text = f"{sum(1 for a in anomaly_dicts if a.get('severity') in ('warn', 'critical'))} anomalies flagged" + + # Assemble + sep = "━" * 48 + lines = [ + f"NARRATIVE: {date_str} {time_range} [{session_id[:12]}]", + sep, + "", + prose, + "", + ] + + if worked: + worked_str = " · ".join(_action_label(inv) for inv in _dedupe_by_type(worked)) + lines.append(f"WORKED: {worked_str}") + if failed: + failed_str = " · ".join( + f"{_action_label(inv)} ({(inv.error or 'error')[:40]})" + for inv in _dedupe_by_type(failed) + ) + lines.append(f"FAILED: {failed_str}") + if patterns_text: + lines.append(f"PATTERNS: {patterns_text}") + if action: + lines.append(f"ACTION: {action[:200]}") + + return "\n".join(lines) + + def build_all( + self, + invocations: list[FeatureInvocation], + anomaly_dicts: list[dict], + t2_investigation_dicts: list[dict], + ) -> list[dict]: + """Group invocations by session_id, build a narrative per session. + + Returns list of dicts with keys: session_id, narrative_text, features_worked, + features_failed, invocation_count. 
+ """ + # Group invocations by session_id + sessions: dict[str, list[FeatureInvocation]] = {} + for inv in invocations: + sid = (inv.trigger_event.session_id if inv.trigger_event else None) or "no_session" + sessions.setdefault(sid, []).append(inv) + + results = [] + for sid, session_invocations in sessions.items(): + # Filter anomalies + investigations for this session + session_anomalies = _filter_for_session(anomaly_dicts, sid) + session_t2 = _filter_for_session(t2_investigation_dicts, sid) + + text = self.build( + session_id=sid, + invocations=session_invocations, + anomaly_dicts=session_anomalies, + t2_investigation_dicts=session_t2, + ) + worked = [inv.action_type for inv in session_invocations if inv.success is True] + failed = [inv.action_type for inv in session_invocations if inv.success is False] + + results.append({ + "session_id": sid, + "narrative_text": text, + "features_worked": list(dict.fromkeys(worked)), # dedupe, order-preserving + "features_failed": list(dict.fromkeys(failed)), + "invocation_count": len(session_invocations), + }) + + return results + + # ── Private ─────────────────────────────────────────────────────────────── + + def _build_prose( + self, + all_invocations: list[FeatureInvocation], + worked: list[FeatureInvocation], + failed: list[FeatureInvocation], + anomaly_dicts: list[dict], + ) -> str: + total = len(all_invocations) + worked_count = len(worked) + failed_count = len(failed) + + # Action type distribution + type_counts: dict[str, int] = {} + for inv in all_invocations: + type_counts[inv.action_type] = type_counts.get(inv.action_type, 0) + 1 + + top_types = sorted(type_counts.items(), key=lambda x: x[1], reverse=True)[:3] + type_str = ", ".join(f"{name} (×{n})" for name, n in top_types) + + health_str = ( + "All recorded actions completed successfully." + if failed_count == 0 + else f"{failed_count} of {total} action(s) failed." 
+ ) + + anomaly_count = sum(1 for a in anomaly_dicts if a.get("severity") in ("warn", "critical")) + anomaly_str = f" {anomaly_count} anomaly flags were raised." if anomaly_count else "" + + return ( + f"{total} feature invocation(s) recorded: {type_str}. " + f"{health_str}{anomaly_str}" + ) + + def _extract_action( + self, + t2_dicts: list[dict], + anomaly_dicts: list[dict], + ) -> str: + # Prefer T2 recommendation if available + for t2 in t2_dicts: + rec = t2.get("recommendation", "") + if rec and rec not in ("Investigate manually.", ""): + return rec[:200] + # Fall back to critical anomaly hypothesis + for a in anomaly_dicts: + if a.get("severity") == "critical" and a.get("hypothesis"): + return a["hypothesis"][:200] + return "" + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +def _action_label(inv: FeatureInvocation) -> str: + return inv.action_type.replace("_", " ").replace("-", " ").lower() + + +def _dedupe_by_type(invocations: list[FeatureInvocation]) -> list[FeatureInvocation]: + seen: dict[str, FeatureInvocation] = {} + for inv in invocations: + seen.setdefault(inv.action_type, inv) + return list(seen.values()) + + +def _has_recurring_issue(anomaly_dicts: list[dict]) -> bool: + return any(a.get("severity") in ("warn", "critical") for a in anomaly_dicts) + + +def _filter_for_session(items: list[dict], session_id: str) -> list[dict]: + """Return items that mention this session_id, or all items if session is no_session.""" + if session_id == "no_session": + return items + return [ + item for item in items + if not item.get("session_id") or item.get("session_id") == session_id + ] diff --git a/observability/local/log-sentinel/ollama_client.py b/observability/local/log-sentinel/ollama_client.py new file mode 100644 index 0000000..4448a44 --- /dev/null +++ b/observability/local/log-sentinel/ollama_client.py @@ -0,0 +1,64 @@ +"""Ollama HTTP client with qwen3 /think and /no_think mode support.""" + +import re +import time 
+import logging + +import requests + +logger = logging.getLogger("sentinel.ollama") + +_THINK_STRIP = re.compile(r".*?", re.DOTALL) + + +class OllamaClient: + def __init__(self, base_url: str, timeout: int = 300): + self.base_url = base_url.rstrip("/") + self.timeout = timeout + + def generate( + self, + model: str, + prompt: str, + think: bool = False, + temperature: float = 0.1, + ) -> tuple[str, int]: + """ + Call Ollama /api/generate. Returns (response_text, duration_ms). + Prepends /think or /no_think for qwen3 models. + Strips ... blocks from output before returning. + Raises on failure so callers can handle via circuit breaker. + """ + mode_prefix = "/think\n" if think else "/no_think\n" + full_prompt = mode_prefix + prompt + + start = time.time() + resp = requests.post( + f"{self.base_url}/api/generate", + json={ + "model": model, + "prompt": full_prompt, + "stream": False, + "options": { + "temperature": temperature, + "num_predict": 2048, + }, + }, + timeout=self.timeout, + ) + duration_ms = int((time.time() - start) * 1000) + + if resp.status_code != 200: + raise RuntimeError(f"Ollama {resp.status_code}: {resp.text[:200]}") + + raw = resp.json().get("response", "") + cleaned = _THINK_STRIP.sub("", raw).strip() + return cleaned, duration_ms + + def is_available(self) -> bool: + """Quick availability check — HEAD /api/tags.""" + try: + resp = requests.get(f"{self.base_url}/api/tags", timeout=5) + return resp.status_code == 200 + except Exception: + return False diff --git a/observability/local/log-sentinel/prompts.py b/observability/local/log-sentinel/prompts.py new file mode 100644 index 0000000..08bfdb6 --- /dev/null +++ b/observability/local/log-sentinel/prompts.py @@ -0,0 +1,396 @@ +"""Prompt templates and structured output schemas for Log Sentinel v2/v3.""" + +# ── Stream descriptions injected into every prompt ────────────────────────── + +STREAM_DESCRIPTIONS = { + "sim-steward": ( + "SimHub plugin logs: iRacing session events, user actions (button 
clicks, " + "replay controls), WebSocket messages, incident detection, plugin lifecycle. " + "Key fields: event, domain, component, session_id, subsession_id." + ), + "claude-dev-logging": ( + "Claude Code AI agent logs: tool calls (Read, Write, Bash, etc.), " + "session lifecycle, subagent activity, MCP service calls, token snapshots. " + "Key fields: event, hook_type, tool_name, service, session_id, duration_ms." + ), + "claude-token-metrics": ( + "Claude Code session summaries: one entry per completed AI session. " + "Fields: total_input_tokens, total_output_tokens, cost_usd, model, effort, " + "assistant_turns, tool_use_count, session_id." + ), +} + +# ── T1 prompts ─────────────────────────────────────────────────────────────── + +T1_SYSTEM = """\ +You are a log analyst for a SimHub iRacing plugin system that integrates with an AI coding assistant. +You analyze structured JSON logs from three streams to identify what happened and what looks wrong. + +Stream guide: +{stream_guide} + +Always respond with valid JSON only. No markdown, no explanation outside the JSON object.\ +""" + +T1_SUMMARY_PROMPT = """\ +Analyze the following log activity from the past {window_minutes} minutes. + +LOG COUNTS (total lines per stream): +{counts} + +RECENT LOGS — sim-steward ({sim_steward_count} lines shown): +{sim_steward_sample} + +RECENT LOGS — claude-dev-logging ({claude_dev_count} lines shown): +{claude_dev_sample} + +RECENT LOGS — claude-token-metrics ({claude_token_count} lines shown): +{claude_token_sample} + +Respond with this JSON schema exactly: +{{ + "summary": "<2-3 sentence narrative of what happened this window>", + "cycle_notes": "" +}} +""" + +T1_ANOMALY_PROMPT = """\ +You have already summarized this window: +{summary} + +Now analyze the same logs for anomalies. Look for: +- Error spikes or unexpected ERROR/WARN levels +- Gaps in expected activity (e.g. 
session started but no actions followed) +- Unusual token costs or AI session patterns +- WebSocket disconnects, action failures, plugin crashes +- Anything that deviates from normal healthy operation + +LOG COUNTS: +{counts} + +RECENT LOGS — sim-steward: +{sim_steward_sample} + +RECENT LOGS — claude-dev-logging: +{claude_dev_sample} + +RECENT LOGS — claude-token-metrics: +{claude_token_sample} + +Respond with this JSON schema exactly: +{{ + "anomalies": [ + {{ + "id": "", + "stream": "", + "description": "", + "severity": "", + "needs_t2": , + "suggested_logql": "" + }} + ] +}} + +Return an empty anomalies array if nothing looks wrong. Do not invent anomalies. +""" + +# ── T2 prompts ─────────────────────────────────────────────────────────────── + +T2_SYSTEM = """\ +You are a senior site reliability engineer investigating anomalies in a SimHub iRacing plugin system. +You have been given anomaly flags, a chronological event timeline, and raw log evidence from targeted queries. +Your job: determine root cause, identify cross-stream correlations, and provide concrete actionable recommendations. + +Stream guide: +{stream_guide} + +Always respond with valid JSON only. No markdown, no explanation outside the JSON object.\ +""" + +T2_INVESTIGATION_PROMPT = """\ +ANOMALIES TO INVESTIGATE: +{anomaly_descriptions} + +EVENT TIMELINE (past {window_minutes} minutes, chronological across all streams): +{timeline_text} + +TARGETED LOG QUERIES AND RESULTS: +{logql_results} + +Based on all of the above, respond with this JSON schema exactly: +{{ + "root_cause": "", + "issue_type": "", + "confidence": "", + "correlation": "", + "impact": "", + "recommendation": "", + "logql_queries_used": {logql_queries_list}, + "sentry_worthy": +}} +""" + +# ── LogQL generation prompt ────────────────────────────────────────────────── + +LOGQL_GEN_SYSTEM = """\ +You are a Loki LogQL expert. Generate precise LogQL queries to investigate anomalies. 
+Always respond with a valid JSON array of strings only. No explanation.\
+"""
+
+LOGQL_GEN_PROMPT = """\
+Generate up to 5 LogQL queries to investigate these anomalies:
+{anomaly_descriptions}
+
+Available streams (use exact app label values):
+- {{app="sim-steward"}} — plugin actions, iRacing events
+- {{app="claude-dev-logging"}} — AI agent tool calls, lifecycle
+- {{app="claude-token-metrics"}} — AI session token summaries
+
+Time window: past {window_minutes} minutes.
+
+Rules:
+- Every query must start with {{ and contain at least one |
+- Use | json to parse JSON log lines
+- Use | level = "ERROR" or | event = "..." to filter
+- Keep queries focused and specific to the anomalies
+
+Respond with a JSON array of strings:
+["<logql query 1>", "<logql query 2>", ...]
+"""
+
+
+# ── Helper: build formatted stream guide ────────────────────────────────────
+
+def build_stream_guide() -> str:
+    return "\n".join(
+        f"  {app}: {desc}" for app, desc in STREAM_DESCRIPTIONS.items()
+    )
+
+
+# ── Helper: format log sample for prompt ────────────────────────────────────
+
+def format_log_sample(lines: list[dict], max_lines: int = 30) -> str:
+    import json
+    if not lines:
+        return "  (no logs in this window)"
+    shown = lines[-max_lines:]  # most recent
+    return "\n".join(f"  {json.dumps(line, default=str)}" for line in shown)
+
+
+# ── Helper: format LogQL results for T2 prompt ──────────────────────────────
+
+def format_logql_results(results: dict[str, list[dict]]) -> str:
+    import json
+    if not results:
+        return "  (no additional queries executed)"
+    sections = []
+    for query, lines in results.items():
+        if not lines:
+            sections.append(f"=== {query} ===\n  (0 results)")
+        else:
+            formatted = "\n".join(
+                f"  {json.dumps(line, default=str)}" for line in lines[:50]
+            )
+            sections.append(f"=== {query} ===\n{formatted}")
+    return "\n\n".join(sections)
+
+
+# ── v3: Feature invocation formatter ────────────────────────────────────────
+
+def format_invocations(invocations, max_invocations: int = 15) -> str:
+    """Format FeatureInvocation list for injection into T1 prompt."""
+    if not invocations:
+        return "  (no feature invocations detected this window)"
+
+    shown = invocations[:max_invocations]
+    lines = []
+    for inv in shown:
+        status = "FAILED" if inv.success is False else ("OK" if inv.success else "?")
+        err = f" error={inv.error[:60]}" if inv.error else ""
+        lines.append(
+            f"  [{status}] {inv.action_type} via {inv.correlation_method} "
+            f"({inv.duration_ms}ms, {len(inv.events)} events){err}"
+        )
+    if len(invocations) > max_invocations:
+        lines.append(f"  [... {len(invocations) - max_invocations} more invocations not shown]")
+    return "\n".join(lines)
+
+
+def format_evidence_packets_for_t2(packet_dicts: list[dict]) -> str:
+    """Format Loki-serialized evidence packet metadata for T2 prompt."""
+    if not packet_dicts:
+        return "  (no evidence packets available)"
+    lines = []
+    for p in packet_dicts:
+        lines.append(
+            f"  [{p.get('severity', '?').upper()}] anomaly_id={p.get('anomaly_id', '?')} "
+            f"stream={p.get('detector_stream', '?')}"
+        )
+        lines.append(f"    {p.get('anomaly_description', '')[:120]}")
+        if p.get("t1_hypothesis"):
+            lines.append(f"    T1 hypothesis: {p['t1_hypothesis'][:120]}")
+        lines.append(
+            f"    confidence={p.get('t1_confidence', 0):.0%} "
+            f"related_lines={p.get('related_lines_count', 0)} "
+            f"invocations={p.get('invocation_count', 0)}"
+        )
+        if p.get("suggested_logql"):
+            lines.append(f"    suggested_logql: {p['suggested_logql'][:120]}")
+        lines.append("")
+    return "\n".join(lines)
+
+
+# ── v3: T1 anomaly prompt with invocations + baseline context ────────────────
+
+T1_ANOMALY_PROMPT_V3 = """\
+You have already summarized this window:
+{summary}
+
+FEATURE INVOCATIONS (user actions traced end-to-end this window):
+{invocations_text}
+
+BASELINE CONTEXT (historical normal values — use to judge what is anomalous):
+{baseline_context}
+
+Now analyze the logs for anomalies.
Look for:
+- Error spikes or unexpected ERROR/WARN levels
+- Failed feature invocations (action_type FAILED)
+- Gaps in expected activity (e.g. session started but no actions followed)
+- Unusual token costs or AI session patterns
+- WebSocket disconnects, action failures, plugin crashes
+- Metrics exceeding baselines by 3x or more
+- Anything deviating from historical normal operation
+
+LOG COUNTS:
+{counts}
+
+RECENT LOGS — sim-steward:
+{sim_steward_sample}
+
+RECENT LOGS — claude-dev-logging:
+{claude_dev_sample}
+
+RECENT LOGS — claude-token-metrics:
+{claude_token_sample}
+
+Respond with this JSON schema exactly:
+{{
+  "anomalies": [
+    {{
+      "id": "<short_snake_case_id>",
+      "stream": "<sim-steward|claude-dev-logging|claude-token-metrics>",
+      "event_type": "<log event name this anomaly concerns, or empty string>",
+      "description": "<one sentence: what looks wrong>",
+      "severity": "<info|warn|critical>",
+      "needs_t2": <true|false>,
+      "hypothesis": "<most likely explanation>",
+      "confidence": <0.0 to 1.0>,
+      "trace_id": "<trace id if applicable, or empty string>",
+      "suggested_logql": "<LogQL query for deeper investigation, or empty string>"
+    }}
+  ]
+}}
+
+Return an empty anomalies array if nothing looks wrong. Do not invent anomalies.
+"""
+
+
+# ── v3: T2 evidence-packet prompts ──────────────────────────────────────────
+
+T2_EVIDENCE_SYSTEM = """\
+You are a senior site reliability engineer investigating anomalies in a SimHub iRacing plugin system.
+You have been given pre-assembled evidence packets from T1 fast triage, plus relevant Sentry history.
+Your job: validate T1 hypotheses, determine root cause, identify cross-stream correlations, and provide
+concrete actionable recommendations.
+
+Stream guide:
+{stream_guide}
+
+Always respond with valid JSON only. No markdown, no explanation outside the JSON object.\
+"""
+
+T2_EVIDENCE_PROMPT = """\
+EVIDENCE PACKETS FROM T1 TRIAGE:
+{evidence_text}
+
+SENTRY HISTORY (existing issues matching these anomaly signatures):
+{sentry_context}
+
+ADDITIONAL LOG EVIDENCE (from targeted LogQL queries):
+{logql_results}
+
+Based on all of the above, respond with this JSON schema exactly:
+{{
+  "root_cause": "<most likely root cause>",
+  "issue_type": "<short category, e.g. bug|config|infra|behavioral|cost>",
+  "confidence": "<low|medium|high>",
+  "correlation": "<cross-stream correlation observed, or none>",
+  "impact": "<user or system impact>",
+  "recommendation": "<concrete actionable next step>",
+  "sentry_worthy": <true|false>,
+  "sentry_fingerprint": "<stable fingerprint string for grouping, or empty string>",
+  "logql_queries_used": []
+}}
+"""
+
+
+# ── v3: T3 synthesis prompts ────────────────────────────────────────────────
+
+T3_SYSTEM = """\
+You are a systems analyst synthesizing log data, anomaly findings, and Sentry history
+for a SimHub iRacing plugin with an integrated AI coding assistant.
+
+Your goal: answer "What was the user trying to do, and did it work?"
+Produce a human-readable synthesis covering sessions, patterns, costs, regressions, and health.
+
+Stream guide:
+{stream_guide}
+
+Always respond with valid JSON only.
No markdown, no explanation outside the JSON object.\ +""" + +T3_SYNTHESIS_PROMPT = """\ +SYNTHESIS WINDOW: {window_description} +MODE: {mode} + +T1 EVIDENCE PACKETS (anomalies found this period): +{evidence_summary} + +T2 INVESTIGATIONS (deep findings this period): +{investigation_summary} + +OPEN SENTRY ISSUES: +{sentry_issues} + +RECENT RELEASES: +{recent_releases} + +SESSION NARRATIVES: +{session_narratives} + +Respond with this JSON schema exactly: +{{ + "period_summary": "<2-3 sentence overview of the period>", + "sessions_analyzed": , + "features_worked": ["", ...], + "features_failed": ["", ...], + "recurring_patterns": [ + {{ + "pattern": "", + "occurrences": , + "first_seen": "", + "recommendation": "" + }} + ], + "cost_summary": {{ + "sessions": , + "total_usd": , + "mean_per_session_usd": , + "trend": "" + }}, + "regression_detected": , + "regression_detail": "", + "action_items": ["", ""], + "baselines_need_update": +}} +""" diff --git a/observability/local/log-sentinel/requirements.txt b/observability/local/log-sentinel/requirements.txt new file mode 100644 index 0000000..e8f6ece --- /dev/null +++ b/observability/local/log-sentinel/requirements.txt @@ -0,0 +1,5 @@ +flask>=3.0.0 +requests>=2.31.0 +schedule>=1.2.0 +sentry-sdk>=2.0.0 +pytest>=8.0.0 diff --git a/observability/local/log-sentinel/sentinel.py b/observability/local/log-sentinel/sentinel.py new file mode 100644 index 0000000..55df187 --- /dev/null +++ b/observability/local/log-sentinel/sentinel.py @@ -0,0 +1,326 @@ +"""Log Sentinel v3 — main cycle orchestrator.""" + +import logging +import time +import uuid +from dataclasses import dataclass + +import schedule + +from baseline import BaselineManager +from circuit_breaker import CircuitBreaker +from config import Config +from evidence import EvidenceBuilder +from grafana_client import GrafanaClient +from loki_client import LokiClient +from ollama_client import OllamaClient +from sentry_client import SentryClient +from t1_agent import T1Agent, 
T1Result +from t2_agent import T2Agent, T2Result +from t3_agent import T3Agent +from timeline import TimelineBuilder +from trace import InvocationBuilder + +logger = logging.getLogger("sentinel") + + +@dataclass +class CycleResult: + cycle_id: str + cycle_num: int + window_minutes: int + t1: T1Result | None + timeline_event_count: int + anomaly_count: int + duration_ms: int + error: str | None = None + + +class Sentinel: + def __init__(self, config: Config): + self.config = config + + self.loki = LokiClient(config.loki_url) + self.ollama = OllamaClient(config.ollama_url) + self.grafana = GrafanaClient(config.grafana_url, config.grafana_user, config.grafana_password) + self.sentry = SentryClient(config.sentry_dsn, config.env_label) + + self.loki_breaker = CircuitBreaker("loki", failure_threshold=3, backoff_sec=60) + self.ollama_breaker = CircuitBreaker("ollama", failure_threshold=3, backoff_sec=120) + + self.baseline = BaselineManager(self.loki, config.baseline_path) + self.evidence_builder = EvidenceBuilder(self.loki) + self.invocation_builder = InvocationBuilder() + self.timeline_builder = TimelineBuilder(self.loki, self.loki_breaker) + + self.t1_agent = T1Agent( + self.ollama, self.loki, self.ollama_breaker, config, + self.baseline, self.evidence_builder, + ) + self.t2_agent = T2Agent( + self.ollama, self.loki, self.grafana, self.sentry, + self.ollama_breaker, config, + ) + self.t3_agent = T3Agent( + self.ollama, self.loki, self.grafana, self.sentry, + self.baseline, self.ollama_breaker, config, + ) + + self._cycle_num = 0 + self._trigger_dedup: dict[str, float] = {} # alertname → last trigger time.time() + self._stats = { + "cycles_completed": 0, + "total_anomalies": 0, + "last_cycle_duration_ms": 0, + "last_t1_duration_ms": 0, + "last_t2_run_ts": 0, + "last_t3_run_ts": 0, + } + + # ── Public ─────────────────────────────────────────────────────────────── + + def start(self): + """Blocking schedule loop.""" + logger.info( + "Sentinel v3 started: mode=%s t1=%ds 
t2=%ds t3=%ds fast=%s deep=%s", + self.config.sentinel_mode, + self.config.t1_interval_sec, + self.config.t2_interval_sec, + self.config.t3_interval_sec, + self.config.ollama_model_fast, + self.config.ollama_model_deep, + ) + self.run_cycle() + schedule.every(self.config.t1_interval_sec).seconds.do(self.run_cycle) + schedule.every(self.config.t2_interval_sec).seconds.do(self.run_t2_cycle) + schedule.every(self.config.t3_interval_sec).seconds.do(self.run_t3_cycle) + while True: + schedule.run_pending() + time.sleep(1) + + def run_cycle(self) -> CycleResult: + """T1 analysis cycle. Always returns CycleResult.""" + self._cycle_num += 1 + cycle_id = str(uuid.uuid4())[:8] + cycle_start = time.time() + + end_ns = self.loki.now_ns() + start_ns = end_ns - int(self.config.lookback_sec * 1e9) + window_minutes = max(1, self.config.lookback_sec // 60) + + logger.info("Cycle #%d [%s] start: window=%dmin", self._cycle_num, cycle_id, window_minutes) + + t1 = None + timeline_events = [] + error = None + + try: + # 1. Gather + counts, samples = self._gather(start_ns, end_ns) + + # 2. Build timeline + invocations + timeline_events = self.timeline_builder.build(start_ns, end_ns) + tl_stats = self.timeline_builder.get_stats(timeline_events) + self.loki.push_timeline({ + **tl_stats, + "cycle_id": cycle_id, + "truncated": tl_stats["event_count"] > 60, + }, self.config.env_label) + + invocations = self.invocation_builder.build(timeline_events) + + # 3. 
T1 analysis + if not self.ollama_breaker.allow_request(): + logger.warning("T1 skipped: Ollama circuit open") + else: + t1 = self.t1_agent.run( + start_ns, end_ns, counts, + samples["sim-steward"], + samples["claude-dev-logging"], + samples["claude-token-metrics"], + invocations=invocations, + trigger_source="scheduled", + ) + self.loki.push_analyst_run({ + "cycle_id": cycle_id, + "tier": "t1", + "model": t1.model, + "think_mode": True, + "duration_ms": t1.total_duration_ms, + "summary_duration_ms": t1.summary_duration_ms, + "anomaly_duration_ms": t1.anomaly_duration_ms, + "anomaly_count": len(t1.anomalies), + "needs_t2_count": sum(1 for a in t1.anomalies if a.get("needs_t2")), + "evidence_packet_count": len(t1.evidence_packets), + "invocation_count": len(t1.invocations), + "window_minutes": window_minutes, + "trigger_source": t1.trigger_source, + }, self.config.env_label) + + except Exception as e: + error = str(e) + logger.error("Cycle #%d error: %s", self._cycle_num, e) + + duration_ms = int((time.time() - cycle_start) * 1000) + result = CycleResult( + cycle_id=cycle_id, + cycle_num=self._cycle_num, + window_minutes=window_minutes, + t1=t1, + timeline_event_count=len(timeline_events), + anomaly_count=len(t1.anomalies) if t1 else 0, + duration_ms=duration_ms, + error=error, + ) + + self.loki.push_cycle({ + "cycle_id": cycle_id, + "cycle_num": self._cycle_num, + "window_minutes": window_minutes, + "t1_duration_ms": t1.total_duration_ms if t1 else 0, + "anomaly_count": result.anomaly_count, + "evidence_packet_count": len(t1.evidence_packets) if t1 else 0, + "timeline_event_count": len(timeline_events), + "total_duration_ms": duration_ms, + "error": error, + }, self.config.env_label) + + self._stats["cycles_completed"] = self._cycle_num + self._stats["last_cycle_duration_ms"] = duration_ms + self._stats["last_t1_duration_ms"] = t1.total_duration_ms if t1 else 0 + if t1: + self._stats["total_anomalies"] += result.anomaly_count + + logger.info( + "Cycle #%d complete: 
%d anomalies %d evidence_packets %dms", + self._cycle_num, result.anomaly_count, + len(t1.evidence_packets) if t1 else 0, duration_ms, + ) + return result + + def run_t2_cycle(self) -> None: + """Independent T2 investigation cycle — pulls evidence packets from Loki.""" + if not self.ollama_breaker.allow_request(): + logger.warning("T2 cycle skipped: Ollama circuit open") + return + logger.info("T2 cycle starting") + try: + result = self.t2_agent.run() + self._stats["last_t2_run_ts"] = int(time.time()) + if result: + logger.info( + "T2 cycle complete: confidence=%s sentry=%s %dms", + result.confidence, result.sentry_worthy, result.total_duration_ms, + ) + except Exception as e: + logger.error("T2 cycle error: %s", e) + + def run_t3_cycle(self) -> None: + """Independent T3 synthesis cycle — runs on slow cadence.""" + if not self.ollama_breaker.allow_request(): + logger.warning("T3 cycle skipped: Ollama circuit open") + return + logger.info("T3 cycle starting (mode=%s)", self.config.sentinel_mode) + try: + result = self.t3_agent.run(trigger="scheduled") + self._stats["last_t3_run_ts"] = int(time.time()) + logger.info( + "T3 cycle complete: %d sessions, regression=%s, %dms", + result.sessions_analyzed, result.regression_detected, result.inference_duration_ms, + ) + except Exception as e: + logger.error("T3 cycle error: %s", e) + + def trigger_cycle( + self, + alert_context: str, + trigger_tier: str, + alert_names: list[str], + lookback_sec: int = 1800, + ) -> None: + """Alert-driven cycle — called from /trigger webhook, runs in background thread.""" + logger.info( + "Trigger cycle: tier=%s alerts=%s lookback=%ds", + trigger_tier, alert_names, lookback_sec, + ) + end_ns = self.loki.now_ns() + start_ns = end_ns - lookback_sec * 1_000_000_000 + + try: + counts, samples = self._gather(start_ns, end_ns) + timeline_events = self.timeline_builder.build(start_ns, end_ns) + invocations = self.invocation_builder.build(timeline_events) + except Exception as e: + 
logger.error("Trigger cycle gather failed: %s", e) + return + + if not self.ollama_breaker.allow_request(): + logger.warning("Trigger cycle T1 skipped: Ollama circuit open") + return + + t1 = None + try: + t1 = self.t1_agent.run( + start_ns, end_ns, counts, + samples["sim-steward"], + samples["claude-dev-logging"], + samples["claude-token-metrics"], + invocations=invocations, + alert_context=alert_context, + trigger_source="grafana_alert", + alert_names=alert_names, + ) + logger.info( + "Trigger T1 complete: %d anomalies, %d evidence_packets, %dms", + len(t1.anomalies), len(t1.evidence_packets), t1.total_duration_ms, + ) + except Exception as e: + logger.error("Trigger cycle T1 failed: %s", e) + + # For t2-tier alerts, skip needs_t2 gate — escalate immediately + if trigger_tier == "t2" and self.config.t2_enabled: + if not self.ollama_breaker.allow_request(): + logger.warning("Trigger cycle T2 skipped: Ollama circuit open") + return + try: + forced_ids = [ep.anomaly_id for ep in t1.evidence_packets] if t1 else None + result = self.t2_agent.run(forced_packet_ids=forced_ids) + self._stats["last_t2_run_ts"] = int(time.time()) + if result: + logger.info( + "Trigger T2 complete: confidence=%s sentry=%s %dms", + result.confidence, result.sentry_worthy, result.total_duration_ms, + ) + except Exception as e: + logger.error("Trigger cycle T2 failed: %s", e) + + # ── Private ────────────────────────────────────────────────────────────── + + def _gather(self, start_ns: int, end_ns: int) -> tuple[dict, dict]: + """Fetch counts and samples from all three Loki streams.""" + stream_queries = { + "sim-steward": '{app="sim-steward"} | json', + "claude-dev-logging": '{app="claude-dev-logging"} | json', + "claude-token-metrics": '{app="claude-token-metrics"} | json', + } + + counts = {} + samples = {} + + if not self.loki_breaker.allow_request(): + logger.warning("Gather skipped: Loki circuit open") + return {k: 0 for k in stream_queries}, {k: [] for k in stream_queries} + + try: + 
for name, logql in stream_queries.items(): + counts[name] = self.loki.count(logql, start_ns, end_ns) + samples[name] = self.loki.query_lines(logql, start_ns, end_ns, limit=100) + self.loki_breaker.record_success() + except Exception as e: + self.loki_breaker.record_failure() + logger.error("Gather failed: %s", e) + for name in stream_queries: + counts.setdefault(name, -1) + samples.setdefault(name, []) + + return counts, samples diff --git a/observability/local/log-sentinel/sentry_client.py b/observability/local/log-sentinel/sentry_client.py new file mode 100644 index 0000000..1fc98ed --- /dev/null +++ b/observability/local/log-sentinel/sentry_client.py @@ -0,0 +1,230 @@ +"""Sentry SDK wrapper — create issues, read history, and capture behavioral findings. + +v3 additions: + - traces_sample_rate bumped to 1.0 (enable transactions) + - search_issues() — REST API read for T2/T3 history queries + - find_releases() — REST API read for T3 regression detection + - capture_behavioral_finding() — T2 writes behavioral patterns not captured by SDK +""" + +import logging +import requests + +logger = logging.getLogger("sentinel.sentry") + +_sdk_available = False +try: + import sentry_sdk + _sdk_available = True +except ImportError: + logger.warning("sentry-sdk not installed, Sentry integration disabled") + + +class SentryClient: + def __init__( + self, + dsn: str, + env: str = "local", + auth_token: str = "", + org: str = "", + project: str = "", + ): + self.enabled = bool(dsn) and _sdk_available + self._auth_token = auth_token + self._org = org + self._project = project + self._api_enabled = bool(auth_token and org and project) + + if self.enabled: + sentry_sdk.init( + dsn=dsn, + environment=env, + traces_sample_rate=1.0, + send_default_pii=False, + ) + logger.info("Sentry initialized (env=%s)", env) + else: + if dsn and not _sdk_available: + logger.warning("Sentry DSN set but sentry-sdk not installed") + elif not dsn: + logger.info("Sentry disabled (no DSN)") + + def 
create_issue(self, finding) -> str | None: + """Create Sentry issue for a critical finding. Returns event_id or None.""" + if not self.enabled: + return None + try: + with sentry_sdk.new_scope() as scope: + scope.set_tag("detector", finding.detector) + scope.set_tag("category", finding.category) + scope.set_tag("severity", finding.severity) + scope.set_tag("issue_type", "unknown") + scope.set_context("finding", { + "finding_id": finding.finding_id, + "fingerprint": finding.fingerprint, + "summary": finding.summary, + "logql_query": finding.logql_query, + "evidence": finding.evidence, + }) + scope.fingerprint = [finding.detector, finding.fingerprint] + event_id = sentry_sdk.capture_message( + f"[CRITICAL] {finding.title}", + level="error", + scope=scope, + ) + logger.info("Sentry issue created for finding %s: %s", finding.finding_id[:8], event_id) + return event_id + except Exception as e: + logger.warning("Sentry create_issue failed: %s", e) + return None + + def capture_behavioral_finding( + self, + title: str, + issue_type: str, + recommendation: str, + confidence: str, + fingerprint: str, + context: dict, + ) -> str | None: + """Create Sentry issue for a T2 behavioral finding (not captured by SDK). + + Only call this for patterns that wouldn't surface as clean exceptions: + e.g. 'WebSocket always drops after 20min replay', 'incident detection stalls + after session_num > 3'. Do NOT use for things already covered by SDK capture. 
+ """ + if not self.enabled: + return None + try: + level = "error" if confidence == "high" else "warning" + with sentry_sdk.new_scope() as scope: + scope.set_tag("issue_type", issue_type) + scope.set_tag("confidence", confidence) + scope.set_tag("source", "t2_behavioral") + scope.set_context("finding", { + "recommendation": recommendation, + **context, + }) + scope.fingerprint = ["t2.behavioral", fingerprint] + event_id = sentry_sdk.capture_message( + f"[T2] {title}", + level=level, + scope=scope, + ) + logger.info("Sentry behavioral finding created: %s", event_id) + return event_id + except Exception as e: + logger.warning("Sentry capture_behavioral_finding failed: %s", e) + return None + + # ── REST API read methods ────────────────────────────────────────────────── + + def search_issues(self, query: str = "is:unresolved", limit: int = 10) -> list[dict]: + """Search Sentry issues via REST API. Returns list of issue dicts.""" + if not self._api_enabled: + return [] + try: + resp = requests.get( + f"https://sentry.io/api/0/projects/{self._org}/{self._project}/issues/", + headers={"Authorization": f"Bearer {self._auth_token}"}, + params={"query": query, "limit": limit}, + timeout=10, + ) + if resp.status_code == 200: + return resp.json() + logger.debug("Sentry search_issues HTTP %d: %s", resp.status_code, resp.text[:200]) + except Exception as e: + logger.debug("Sentry search_issues failed: %s", e) + return [] + + def get_issue(self, issue_id: str) -> dict: + """Fetch a single Sentry issue by ID.""" + if not self._api_enabled: + return {} + try: + resp = requests.get( + f"https://sentry.io/api/0/issues/{issue_id}/", + headers={"Authorization": f"Bearer {self._auth_token}"}, + timeout=10, + ) + if resp.status_code == 200: + return resp.json() + except Exception as e: + logger.debug("Sentry get_issue failed: %s", e) + return {} + + def find_releases(self, limit: int = 5) -> list[dict]: + """Fetch recent releases for regression detection in T3.""" + if not 
self._api_enabled: + return [] + try: + resp = requests.get( + f"https://sentry.io/api/0/projects/{self._org}/{self._project}/releases/", + headers={"Authorization": f"Bearer {self._auth_token}"}, + params={"limit": limit}, + timeout=10, + ) + if resp.status_code == 200: + return resp.json() + except Exception as e: + logger.debug("Sentry find_releases failed: %s", e) + return [] + + def create_release(self, version: str) -> dict: + """Create a Sentry release (called from deploy.ps1 via this client).""" + if not self._api_enabled: + return {} + try: + resp = requests.post( + f"https://sentry.io/api/0/organizations/{self._org}/releases/", + headers={"Authorization": f"Bearer {self._auth_token}"}, + json={"version": version, "projects": [self._project]}, + timeout=10, + ) + if resp.status_code in (200, 201): + return resp.json() + logger.debug("Sentry create_release HTTP %d: %s", resp.status_code, resp.text[:200]) + except Exception as e: + logger.warning("Sentry create_release failed: %s", e) + return {} + + def create_investigation_issue(self, investigation) -> str | None: + """Create Sentry issue for a T2 investigation report. 
        Returns the Sentry event_id, or None when Sentry is disabled or the
        capture fails."""
        if not self.enabled:
            return None
        try:
            finding = investigation.finding
            # Critical findings surface as Sentry errors; everything else as warnings.
            level = "error" if finding.severity == "critical" else "warning"
            with sentry_sdk.new_scope() as scope:
                scope.set_tag("detector", finding.detector)
                scope.set_tag("category", finding.category)
                scope.set_tag("severity", finding.severity)
                scope.set_tag("model", investigation.model)
                scope.set_tag("confidence", investigation.confidence)
                scope.set_tag("issue_type", investigation.issue_type)
                scope.set_tag("trigger", investigation.trigger)
                scope.set_context("investigation", {
                    "investigation_id": investigation.investigation_id,
                    "finding_id": finding.finding_id,
                    "root_cause": investigation.root_cause,
                    "correlation": investigation.correlation,
                    "impact": investigation.impact,
                    "recommendation": investigation.recommendation,
                    "inference_duration_ms": investigation.inference_duration_ms,
                })
                scope.set_context("finding", {
                    "title": finding.title,
                    "summary": finding.summary,
                    "evidence": finding.evidence,
                })
                # Group by detector + truncated root cause so repeat findings dedupe.
                scope.fingerprint = [finding.detector, investigation.root_cause[:50]]
                event_id = sentry_sdk.capture_message(
                    f"[T2] {investigation.root_cause[:120]}",
                    level=level,
                    scope=scope,
                )
            logger.info("Sentry investigation issue for %s: %s", investigation.investigation_id[:8], event_id)
            return event_id
        except Exception as e:
            logger.warning("Sentry create_investigation_issue failed: %s", e)
            return None
diff --git a/observability/local/log-sentinel/t1_agent.py b/observability/local/log-sentinel/t1_agent.py
new file mode 100644
index 0000000..f6119f8
--- /dev/null
+++ b/observability/local/log-sentinel/t1_agent.py
@@ -0,0 +1,220 @@
"""T1 — Fast triage agent.

Replaces the T1 half of analyst.py for v3.

Key changes over v2 Analyst.run_t1():
 - Accepts pre-built FeatureInvocations from InvocationBuilder
 - Injects BaselineManager context into anomaly prompt
 - Accepts optional T0 alert context for event-driven runs
 - Builds EvidencePackets for each anomaly via EvidenceBuilder
 - Pushes sentinel_evidence_packet events to Loki
 - T1Result carries invocations + evidence_packets + trigger metadata
"""

import logging
from dataclasses import dataclass, field

# NOTE(review): _normalize_anomalies appears unused in this module (v3 uses
# _normalize_anomalies_v3 below) — confirm before removing the import.
from analyst import _parse_json, _normalize_anomalies
from baseline import BaselineManager
from circuit_breaker import CircuitBreaker
from config import Config
from evidence import EvidenceBuilder, EvidencePacket
from loki_client import LokiClient
from ollama_client import OllamaClient
from prompts import (
    T1_SYSTEM, T1_SUMMARY_PROMPT, T1_ANOMALY_PROMPT_V3,
    build_stream_guide, format_log_sample, format_invocations,
)
from trace import FeatureInvocation

logger = logging.getLogger("sentinel.t1")


@dataclass
class T1Result:
    # Outputs of one T1 triage pass: the two LLM call results plus the inputs
    # (invocations) and artifacts (evidence packets) carried forward to T2.
    summary: str
    cycle_notes: str
    anomalies: list[dict]
    invocations: list[FeatureInvocation]
    evidence_packets: list[EvidencePacket]
    model: str
    summary_duration_ms: int
    anomaly_duration_ms: int
    trigger_source: str  # "scheduled" | "grafana_alert"
    alert_names: list[str]  # T0 alert names that triggered this run
    raw_summary_response: str = field(repr=False, default="")
    raw_anomaly_response: str = field(repr=False, default="")

    @property
    def needs_t2(self) -> bool:
        # True when any anomaly was flagged for deep (T2) investigation.
        return any(a.get("needs_t2") for a in self.anomalies)

    @property
    def total_duration_ms(self) -> int:
        # Combined wall time of both LLM calls.
        return self.summary_duration_ms + self.anomaly_duration_ms


class T1Agent:
    def __init__(
        self,
        ollama: OllamaClient,
        loki: LokiClient,
        breaker: CircuitBreaker,
        config: Config,
        baseline: BaselineManager,
        evidence_builder: EvidenceBuilder,
    ):
        self.ollama = ollama
        self.loki = loki
        self.breaker = breaker
        self.config = config
        self.baseline = baseline
        self.evidence_builder = evidence_builder
        self._stream_guide = build_stream_guide()

    def run(
        self,
        start_ns: int,
        end_ns: int,
        counts: dict[str, int],
        sim_steward_sample: list[dict],
        claude_dev_sample: list[dict],
        claude_token_sample: list[dict],
        invocations: list[FeatureInvocation],
        alert_context: str = "",
        trigger_source: str = "scheduled",
        alert_names: list[str] | None = None,
    ) -> T1Result:
        """Run one T1 triage pass over the [start_ns, end_ns) window.

        Makes two LLM calls — a fast summary (no thinking), then an anomaly
        scan (with thinking) that also sees invocations + baseline context —
        then builds an EvidencePacket per anomaly and pushes each to Loki.
        LLM failures are recorded on the circuit breaker and degrade to
        empty results rather than raising.

        alert_context: optional Grafana T0 alert text prepended to both prompts.
        trigger_source: "scheduled" | "grafana_alert".
        """
        window_minutes = max(1, int((end_ns - start_ns) / 1e9 / 60))
        counts_text = "\n".join(f" {k}: {v}" for k, v in counts.items())

        # Shared format kwargs for both prompts.
        samples = dict(
            sim_steward_sample=format_log_sample(sim_steward_sample),
            sim_steward_count=len(sim_steward_sample),
            claude_dev_sample=format_log_sample(claude_dev_sample),
            claude_dev_count=len(claude_dev_sample),
            claude_token_sample=format_log_sample(claude_token_sample),
            claude_token_count=len(claude_token_sample),
        )

        invocations_text = format_invocations(invocations)
        baseline_context = self.baseline.get_prompt_context()
        system = T1_SYSTEM.format(stream_guide=self._stream_guide)

        # Optional T0 alert context prefix — injected into both calls
        alert_prefix = ""
        if alert_context:
            alert_prefix = (
                f"ALERT CONTEXT (from Grafana):\n{alert_context}\n"
                "→ Focus investigation on this signal. Do not suppress even if recent history is quiet.\n\n"
            )

        # Call A: summary (/no_think — fast)
        summary_prompt = alert_prefix + T1_SUMMARY_PROMPT.format(
            window_minutes=window_minutes,
            counts=counts_text,
            **samples,
        )
        summary_text = ""
        cycle_notes = ""
        summary_ms = 0
        raw_summary = ""
        try:
            raw_summary, summary_ms = self.ollama.generate(
                self.config.ollama_model_fast,
                system + "\n\n" + summary_prompt,
                think=False,
            )
            self.breaker.record_success()
            parsed = _parse_json(raw_summary)
            summary_text = parsed.get("summary", "")
            cycle_notes = parsed.get("cycle_notes", "")
        except Exception as e:
            self.breaker.record_failure()
            logger.error("T1 summary call failed: %s", e)

        # Call B: anomaly scan (/think) — invocations + baseline context included
        anomaly_prompt = alert_prefix + T1_ANOMALY_PROMPT_V3.format(
            summary=summary_text or "(summary unavailable)",
            counts=counts_text,
            invocations_text=invocations_text,
            baseline_context=baseline_context,
            **samples,
        )
        anomalies: list[dict] = []
        anomaly_ms = 0
        raw_anomaly = ""
        try:
            raw_anomaly, anomaly_ms = self.ollama.generate(
                self.config.ollama_model_fast,
                system + "\n\n" + anomaly_prompt,
                think=True,
            )
            self.breaker.record_success()
            parsed = _parse_json(raw_anomaly)
            anomalies = _normalize_anomalies_v3(parsed.get("anomalies", []))
        except Exception as e:
            self.breaker.record_failure()
            logger.error("T1 anomaly call failed: %s", e)

        # Build evidence packets for each anomaly, push to Loki
        evidence_packets: list[EvidencePacket] = []
        if anomalies:
            evidence_packets = self.evidence_builder.build_many(
                anomalies, invocations, start_ns, end_ns
            )
            for packet in evidence_packets:
                try:
                    self.loki.push_evidence_packet(packet, env=self.config.env_label)
                except Exception as e:
                    # Best-effort: a failed push loses one packet, not the run.
                    logger.warning("Failed to push evidence packet %s: %s", packet.anomaly_id, e)

        logger.info(
            "T1 [%s]: %d invocations, %d anomalies (%d→T2), %d evidence packets, summary=%dms anomaly=%dms",
trigger_source, + len(invocations), + len(anomalies), + sum(1 for a in anomalies if a.get("needs_t2")), + len(evidence_packets), + summary_ms, + anomaly_ms, + ) + + return T1Result( + summary=summary_text, + cycle_notes=cycle_notes, + anomalies=anomalies, + invocations=invocations, + evidence_packets=evidence_packets, + model=self.config.ollama_model_fast, + summary_duration_ms=summary_ms, + anomaly_duration_ms=anomaly_ms, + trigger_source=trigger_source, + alert_names=alert_names or [], + raw_summary_response=raw_summary, + raw_anomaly_response=raw_anomaly, + ) + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +def _normalize_anomalies_v3(raw: list) -> list[dict]: + """Normalize v3 anomaly dicts from T1 LLM output (superset of v2 fields).""" + if not isinstance(raw, list): + return [] + valid = [] + for a in raw: + if not isinstance(a, dict): + continue + valid.append({ + "id": str(a.get("id", "unknown"))[:64], + "stream": a.get("stream", "unknown"), + "event_type": str(a.get("event_type", ""))[:64], + "description": str(a.get("description", ""))[:500], + "severity": a.get("severity", "info") if a.get("severity") in ("info", "warn", "critical") else "info", + "needs_t2": bool(a.get("needs_t2", False)), + "hypothesis": str(a.get("hypothesis", ""))[:300], + "confidence": float(a.get("confidence", 0.5)) if isinstance(a.get("confidence"), (int, float)) else 0.5, + "trace_id": str(a.get("trace_id", ""))[:64], + "suggested_logql": str(a.get("suggested_logql", ""))[:300], + }) + return valid diff --git a/observability/local/log-sentinel/t2_agent.py b/observability/local/log-sentinel/t2_agent.py new file mode 100644 index 0000000..a10ea91 --- /dev/null +++ b/observability/local/log-sentinel/t2_agent.py @@ -0,0 +1,318 @@ +"""T2 — Deep investigation agent. + +Replaces the T2 half of analyst.py for v3. 
+Key changes over v2 Analyst.run_t2(): + - Reads evidence packets from Loki (state store), not from T1Result directly + - Queries Sentry for existing issues before forming recommendations + - Produces sentinel_t2_investigation events to Loki + - Creates Grafana annotation per investigation + - Creates Sentry issue if sentry_worthy + high confidence + not already captured + +Input flow: + Loki {event="sentinel_evidence_packet"} (last 15 min) + → SentryClient.search_issues() for each anomaly signature + → qwen3:32b /think + → LokiClient.push_t2_investigation() + → GrafanaClient.annotate_raw() + → SentryClient.capture_message() if warranted +""" + +import json +import logging +import time +from dataclasses import dataclass, field + +from analyst import _parse_json, _normalize_confidence, _normalize_issue_type, _valid_logql +from circuit_breaker import CircuitBreaker +from config import Config +from grafana_client import GrafanaClient +from loki_client import LokiClient +from ollama_client import OllamaClient +from prompts import ( + T2_EVIDENCE_SYSTEM, T2_EVIDENCE_PROMPT, + build_stream_guide, format_evidence_packets_for_t2, format_logql_results, + LOGQL_GEN_SYSTEM, LOGQL_GEN_PROMPT, +) +from sentry_client import SentryClient + +logger = logging.getLogger("sentinel.t2") + +# How far back to pull evidence packets from Loki +_EVIDENCE_LOOKBACK_SEC = 900 # 15 minutes + + +@dataclass +class T2Result: + root_cause: str + issue_type: str + confidence: str + correlation: str + impact: str + recommendation: str + logql_queries_used: list[str] + sentry_worthy: bool + sentry_fingerprint: str + evidence_packet_count: int + sentry_event_id: str | None + model: str + inference_duration_ms: int + logql_gather_duration_ms: int + raw_response: str = field(repr=False, default="") + + @property + def total_duration_ms(self) -> int: + return self.inference_duration_ms + self.logql_gather_duration_ms + + +class T2Agent: + def __init__( + self, + ollama: OllamaClient, + loki: LokiClient, 
+ grafana: GrafanaClient, + sentry: SentryClient, + breaker: CircuitBreaker, + config: Config, + ): + self.ollama = ollama + self.loki = loki + self.grafana = grafana + self.sentry = sentry + self.breaker = breaker + self.config = config + self._stream_guide = build_stream_guide() + + def run( + self, + end_ns: int | None = None, + lookback_sec: int = _EVIDENCE_LOOKBACK_SEC, + forced_packet_ids: list[str] | None = None, + ) -> T2Result | None: + """ + Run T2 investigation over recent evidence packets. + + forced_packet_ids: if set, only process these specific anomaly_ids + (used when T1 immediately escalates critical anomalies) + """ + if end_ns is None: + end_ns = self.loki.now_ns() + start_ns = end_ns - lookback_sec * 1_000_000_000 + + # Step 1: load evidence packets from Loki + packet_dicts = self._load_evidence_packets(start_ns, end_ns, forced_packet_ids) + if not packet_dicts: + logger.info("T2: no evidence packets in window, skipping") + return None + + # Step 2: read Sentry history for context + sentry_context = self._fetch_sentry_context(packet_dicts) + + # Step 3: generate + execute targeted LogQL for additional evidence + gather_start = time.time() + queries = self._generate_logql_queries(packet_dicts, lookback_sec // 60) + logql_results = self._execute_logql_queries(queries, start_ns, end_ns) + gather_ms = int((time.time() - gather_start) * 1000) + + # Step 4: T2 inference + system = T2_EVIDENCE_SYSTEM.format(stream_guide=self._stream_guide) + prompt = T2_EVIDENCE_PROMPT.format( + evidence_text=format_evidence_packets_for_t2(packet_dicts), + sentry_context=sentry_context, + logql_results=format_logql_results(logql_results), + ) + + raw = "" + infer_ms = 0 + try: + raw, infer_ms = self.ollama.generate( + self.config.ollama_model_deep, + system + "\n\n" + prompt, + think=True, + ) + self.breaker.record_success() + except Exception as e: + self.breaker.record_failure() + logger.error("T2 inference failed: %s", e) + + parsed = _parse_json(raw) + all_queries 
= queries + list(parsed.get("logql_queries_used", [])) + + result = T2Result( + root_cause=parsed.get("root_cause", "Unable to determine root cause."), + issue_type=_normalize_issue_type(parsed.get("issue_type", "unknown")), + confidence=_normalize_confidence(parsed.get("confidence", "low")), + correlation=parsed.get("correlation", "No correlations identified."), + impact=parsed.get("impact", "Impact unknown."), + recommendation=parsed.get("recommendation", "Investigate manually."), + logql_queries_used=all_queries, + sentry_worthy=bool(parsed.get("sentry_worthy", False)), + sentry_fingerprint=str(parsed.get("sentry_fingerprint", ""))[:100], + evidence_packet_count=len(packet_dicts), + sentry_event_id=None, + model=self.config.ollama_model_deep, + inference_duration_ms=infer_ms, + logql_gather_duration_ms=gather_ms, + raw_response=raw, + ) + + # Step 5: push investigation to Loki + Grafana + self._push_investigation(result, packet_dicts, end_ns) + self._annotate_grafana(result) + + # Step 6: create Sentry issue if warranted + if result.sentry_worthy and result.confidence == "high": + event_id = self._create_sentry_issue(result, packet_dicts) + result.sentry_event_id = event_id + + logger.info( + "T2 complete: confidence=%s sentry=%s packets=%d gather=%dms infer=%dms queries=%d", + result.confidence, result.sentry_worthy, + len(packet_dicts), gather_ms, infer_ms, len(all_queries), + ) + return result + + # ── Private ─────────────────────────────────────────────────────────────── + + def _load_evidence_packets( + self, + start_ns: int, + end_ns: int, + forced_ids: list[str] | None, + ) -> list[dict]: + logql = '{app="sim-steward", event="sentinel_evidence_packet"}' + packets = self.loki.query_lines(logql, start_ns, end_ns, limit=100) + if forced_ids: + packets = [p for p in packets if p.get("anomaly_id") in forced_ids] + # Dedup by anomaly_id, keep most recent + seen: dict[str, dict] = {} + for p in packets: + aid = p.get("anomaly_id", "") + if aid not in seen or 
p.get("assembled_at_ns", 0) > seen[aid].get("assembled_at_ns", 0): + seen[aid] = p + return list(seen.values()) + + def _fetch_sentry_context(self, packet_dicts: list[dict]) -> str: + if not packet_dicts: + return "(no Sentry history available)" + # Build a query from the most severe anomaly descriptions + critical = [p for p in packet_dicts if p.get("severity") == "critical"] + sample = (critical or packet_dicts)[:3] + streams = list({p.get("detector_stream", "") for p in sample if p.get("detector_stream")}) + query = " ".join(streams) + " " + " ".join( + p.get("anomaly_description", "")[:40] for p in sample + ) + try: + issues = self.sentry.search_issues(query=query.strip()[:200], limit=5) + if not issues: + return "(no matching Sentry issues found)" + lines = [] + for issue in issues: + lines.append( + f" [{issue.get('level', '?').upper()}] {issue.get('title', '?')[:80]}" + f" (status={issue.get('status', '?')}, times_seen={issue.get('count', '?')})" + ) + if issue.get("lastSeen"): + lines.append(f" last_seen: {issue['lastSeen']}") + return "\n".join(lines) + except Exception as e: + logger.debug("Sentry context fetch failed: %s", e) + return "(Sentry unavailable)" + + def _generate_logql_queries( + self, + packet_dicts: list[dict], + window_minutes: int, + ) -> list[str]: + # Seed with suggested_logql from evidence packets + seeded = [ + p["suggested_logql"] for p in packet_dicts + if p.get("suggested_logql") and _valid_logql(p["suggested_logql"]) + ] + + if not packet_dicts: + return seeded[:5] + + anomaly_descriptions = "\n".join( + f"- {p.get('anomaly_id', '?')}: {p.get('anomaly_description', '')[:80]}" + for p in packet_dicts[:5] + ) + prompt = LOGQL_GEN_SYSTEM + "\n\n" + LOGQL_GEN_PROMPT.format( + anomaly_descriptions=anomaly_descriptions, + window_minutes=window_minutes, + ) + try: + raw, _ = self.ollama.generate( + self.config.ollama_model_fast, + prompt, + think=False, + temperature=0.0, + ) + generated = json.loads(raw) if raw.strip().startswith("[") 
else [] + if isinstance(generated, list): + combined = seeded + [q for q in generated if isinstance(q, str)] + return [q.strip() for q in combined if _valid_logql(q)][:5] + except Exception as e: + logger.debug("T2 LogQL gen failed: %s", e) + + return [q for q in seeded if _valid_logql(q)][:5] + + def _execute_logql_queries( + self, queries: list[str], start_ns: int, end_ns: int + ) -> dict[str, list[dict]]: + results = {} + for query in queries: + try: + lines = self.loki.query_lines(query, start_ns, end_ns, limit=50) + results[query] = lines + except Exception as e: + logger.debug("T2 LogQL execute failed (%s): %s", query[:60], e) + results[query] = [] + return results + + def _push_investigation( + self, result: T2Result, packet_dicts: list[dict], end_ns: int + ) -> None: + try: + self.loki.push_t2_investigation(result, packet_dicts, env=self.config.env_label) + except Exception as e: + logger.warning("Failed to push T2 investigation to Loki: %s", e) + + def _annotate_grafana(self, result: T2Result) -> None: + try: + severity_tag = "critical" if result.confidence == "high" and result.sentry_worthy else "investigation" + self.grafana.annotate_raw( + title=f"T2 Investigation [{result.confidence}]: {result.root_cause[:80]}", + text=( + f"Root cause: {result.root_cause}
" + f"Recommendation: {result.recommendation}
" + f"Type: {result.issue_type} | Packets: {result.evidence_packet_count} | " + f"Model: {result.model}" + ), + tags=["t2", result.issue_type, result.confidence, severity_tag], + ) + except Exception as e: + logger.debug("T2 Grafana annotation failed: %s", e) + + def _create_sentry_issue( + self, result: T2Result, packet_dicts: list[dict] + ) -> str | None: + try: + streams = list({p.get("detector_stream", "") for p in packet_dicts if p.get("detector_stream")}) + fingerprint = result.sentry_fingerprint or f"t2.{result.issue_type}.{streams[0] if streams else 'unknown'}" + return self.sentry.capture_behavioral_finding( + title=result.root_cause[:120], + issue_type=result.issue_type, + recommendation=result.recommendation, + confidence=result.confidence, + fingerprint=fingerprint, + context={ + "root_cause": result.root_cause, + "correlation": result.correlation, + "impact": result.impact, + "evidence_packet_count": result.evidence_packet_count, + "model": result.model, + }, + ) + except Exception as e: + logger.warning("T2 Sentry issue creation failed: %s", e) + return None diff --git a/observability/local/log-sentinel/t3_agent.py b/observability/local/log-sentinel/t3_agent.py new file mode 100644 index 0000000..3cfbf09 --- /dev/null +++ b/observability/local/log-sentinel/t3_agent.py @@ -0,0 +1,329 @@ +"""T3 — Synthesis agent. + +Runs on a mode-dependent schedule (dev: 2h, prod: 4h) or on T2 critical escalation. +Answers: "What was the user trying to do, and did it work?" + +What T3 does: + 1. Query Loki for T1 evidence packets + T2 investigations for the synthesis window + 2. Query Sentry for open issues + recent releases + 3. Build session narratives via NarrativeBuilder + 4. Run qwen3:32b /think for 7 synthesis passes (single LLM call) + 5. Update baselines.json via BaselineManager + 6. Emit sentinel_threshold_recommendation per drifted T0 threshold + 7. 
Push sentinel_synthesis + sentinel_narrative events to Loki + +Mode differences: + dev — 2h cadence, focus: Claude sessions, tool usage, code activity + prod — 4h cadence, focus: iRacing sessions, feature stability, user-facing errors +""" + +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone + +from analyst import _parse_json +from baseline import BaselineManager +from circuit_breaker import CircuitBreaker +from config import Config +from grafana_client import GrafanaClient +from loki_client import LokiClient +from narrative import NarrativeBuilder +from ollama_client import OllamaClient +from prompts import T3_SYSTEM, T3_SYNTHESIS_PROMPT, build_stream_guide +from sentry_client import SentryClient +from trace import FeatureInvocation + +logger = logging.getLogger("sentinel.t3") + +# Lookbacks per mode for pulling Loki evidence +_MODE_LOOKBACK = { + "dev": 2 * 3600, + "prod": 4 * 3600, +} + + +@dataclass +class T3Result: + period_summary: str + sessions_analyzed: int + features_worked: list[str] + features_failed: list[str] + recurring_patterns: list[dict] + cost_summary: dict + regression_detected: bool + regression_detail: str + action_items: list[str] + baselines_updated: bool + threshold_recommendations: list[dict] + session_narratives: list[dict] # list of {session_id, narrative_text, ...} + model: str + inference_duration_ms: int + raw_response: str = field(repr=False, default="") + + +class T3Agent: + def __init__( + self, + ollama: OllamaClient, + loki: LokiClient, + grafana: GrafanaClient, + sentry: SentryClient, + baseline: BaselineManager, + breaker: CircuitBreaker, + config: Config, + ): + self.ollama = ollama + self.loki = loki + self.grafana = grafana + self.sentry = sentry + self.baseline = baseline + self.breaker = breaker + self.config = config + self.narrative_builder = NarrativeBuilder() + self._stream_guide = build_stream_guide() + + def run( + self, + end_ns: int | None = None, + 
invocations: list[FeatureInvocation] | None = None, + lookback_sec: int | None = None, + trigger: str = "scheduled", + ) -> T3Result: + """ + Run T3 synthesis. + + invocations: if provided (e.g. from same-cycle T1 run), used for narratives. + Otherwise T3 uses only Loki-stored invocation summaries. + trigger: "scheduled" | "t2_escalation" + """ + if end_ns is None: + end_ns = self.loki.now_ns() + if lookback_sec is None: + lookback_sec = _MODE_LOOKBACK.get(self.config.sentinel_mode, 7200) + start_ns = end_ns - lookback_sec * 1_000_000_000 + + mode = self.config.sentinel_mode + window_description = _format_window(start_ns, end_ns, mode) + + # Step 1: load evidence from Loki + evidence_packets = self._load_evidence_packets(start_ns, end_ns) + investigations = self._load_investigations(start_ns, end_ns) + + # Step 2: Sentry context + sentry_issues_text, sentry_releases_text = self._fetch_sentry_context() + + # Step 3: build session narratives + session_narratives = [] + if invocations: + all_anomalies = [ep for ep in evidence_packets] + session_narratives = self.narrative_builder.build_all( + invocations=invocations, + anomaly_dicts=all_anomalies, + t2_investigation_dicts=investigations, + ) + + narratives_text = _format_narratives_for_prompt(session_narratives) + + # Step 4: T3 LLM synthesis + system = T3_SYSTEM.format(stream_guide=self._stream_guide) + prompt = T3_SYNTHESIS_PROMPT.format( + window_description=window_description, + mode=mode, + evidence_summary=_format_evidence_summary(evidence_packets), + investigation_summary=_format_investigation_summary(investigations), + sentry_issues=sentry_issues_text, + recent_releases=sentry_releases_text, + session_narratives=narratives_text, + ) + + raw = "" + infer_ms = 0 + try: + raw, infer_ms = self.ollama.generate( + self.config.ollama_model_deep, + system + "\n\n" + prompt, + think=True, + ) + self.breaker.record_success() + except Exception as e: + self.breaker.record_failure() + logger.error("T3 inference failed: 
%s", e) + + parsed = _parse_json(raw) + + # Step 5: update baselines + baselines_updated = False + threshold_recs = [] + try: + self.baseline.compute_and_save(lookback_sec=lookback_sec) + threshold_recs = self.baseline.get_threshold_recommendations() + baselines_updated = True + logger.info("T3: baselines updated, %d threshold recommendations", len(threshold_recs)) + except Exception as e: + logger.warning("T3 baseline update failed: %s", e) + + result = T3Result( + period_summary=parsed.get("period_summary", ""), + sessions_analyzed=int(parsed.get("sessions_analyzed", len(session_narratives))), + features_worked=parsed.get("features_worked", []), + features_failed=parsed.get("features_failed", []), + recurring_patterns=parsed.get("recurring_patterns", []), + cost_summary=parsed.get("cost_summary", {}), + regression_detected=bool(parsed.get("regression_detected", False)), + regression_detail=parsed.get("regression_detail", ""), + action_items=parsed.get("action_items", []), + baselines_updated=baselines_updated, + threshold_recommendations=threshold_recs, + session_narratives=session_narratives, + model=self.config.ollama_model_deep, + inference_duration_ms=infer_ms, + raw_response=raw, + ) + + # Step 6: push all outputs + self._push_outputs(result, end_ns, trigger) + self._annotate_grafana(result, trigger) + + logger.info( + "T3 [%s/%s]: %d sessions, %d patterns, regression=%s, baselines=%s, %dms", + mode, trigger, + result.sessions_analyzed, + len(result.recurring_patterns), + result.regression_detected, + result.baselines_updated, + infer_ms, + ) + return result + + # ── Private ─────────────────────────────────────────────────────────────── + + def _load_evidence_packets(self, start_ns: int, end_ns: int) -> list[dict]: + logql = '{app="sim-steward", event="sentinel_evidence_packet"}' + try: + return self.loki.query_lines(logql, start_ns, end_ns, limit=200) + except Exception as e: + logger.warning("T3 evidence packet load failed: %s", e) + return [] + + def 
_load_investigations(self, start_ns: int, end_ns: int) -> list[dict]: + logql = '{app="sim-steward", event="sentinel_t2_investigation"}' + try: + return self.loki.query_lines(logql, start_ns, end_ns, limit=50) + except Exception as e: + logger.warning("T3 investigation load failed: %s", e) + return [] + + def _fetch_sentry_context(self) -> tuple[str, str]: + issues_text = "(Sentry unavailable)" + releases_text = "(no release data)" + try: + issues = self.sentry.search_issues(query="is:unresolved", limit=20) + if issues: + lines = [ + f" [{i.get('level', '?').upper()}] {i.get('title', '?')[:80]}" + f" (times_seen={i.get('count', '?')}, last={i.get('lastSeen', '?')[:10]})" + for i in issues + ] + issues_text = "\n".join(lines) + else: + issues_text = "(no open Sentry issues)" + except Exception as e: + logger.debug("T3 Sentry issues fetch failed: %s", e) + + try: + releases = self.sentry.find_releases(limit=5) + if releases: + lines = [ + f" {r.get('version', '?')} released {r.get('dateCreated', '?')[:10]}" + for r in releases + ] + releases_text = "\n".join(lines) + else: + releases_text = "(no releases found)" + except Exception as e: + logger.debug("T3 Sentry releases fetch failed: %s", e) + + return issues_text, releases_text + + def _push_outputs(self, result: T3Result, end_ns: int, trigger: str) -> None: + # Push synthesis summary + try: + self.loki.push_synthesis(result, trigger=trigger, env=self.config.env_label) + except Exception as e: + logger.warning("T3: failed to push synthesis to Loki: %s", e) + + # Push per-session narratives + for narrative in result.session_narratives: + try: + self.loki.push_narrative(narrative, env=self.config.env_label) + except Exception as e: + logger.debug("T3: failed to push narrative for %s: %s", narrative.get("session_id"), e) + + # Push threshold recommendations + for rec in result.threshold_recommendations: + try: + self.loki.push_threshold_recommendation(rec, env=self.config.env_label) + except Exception as e: + 
logger.debug("T3: failed to push threshold rec: %s", e) + + def _annotate_grafana(self, result: T3Result, trigger: str) -> None: + try: + regression_note = f" ⚠️ Regression: {result.regression_detail[:60]}" if result.regression_detected else "" + self.grafana.annotate_raw( + title=f"T3 Synthesis [{self.config.sentinel_mode}]: {result.sessions_analyzed} sessions", + text=( + f"{result.period_summary[:200]}{regression_note}
" + f"Patterns: {len(result.recurring_patterns)} | " + f"Baselines updated: {result.baselines_updated} | " + f"Trigger: {trigger}" + ), + tags=["t3", "synthesis", self.config.sentinel_mode, trigger], + ) + except Exception as e: + logger.debug("T3 Grafana annotation failed: %s", e) + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +def _format_window(start_ns: int, end_ns: int, mode: str) -> str: + start_dt = datetime.fromtimestamp(start_ns / 1e9, tz=timezone.utc) + end_dt = datetime.fromtimestamp(end_ns / 1e9, tz=timezone.utc) + return ( + f"{start_dt.strftime('%Y-%m-%d %H:%M')} – {end_dt.strftime('%H:%M')} UTC " + f"({int((end_ns - start_ns) / 3.6e12):.0f}h window, mode={mode})" + ) + + +def _format_evidence_summary(packets: list[dict]) -> str: + if not packets: + return " (none)" + lines = [] + for p in packets[:20]: + lines.append( + f" [{p.get('severity', '?').upper()}] {p.get('anomaly_description', '')[:80]}" + ) + if len(packets) > 20: + lines.append(f" [... 
{len(packets) - 20} more]") + return "\n".join(lines) + + +def _format_investigation_summary(investigations: list[dict]) -> str: + if not investigations: + return " (none)" + lines = [] + for inv in investigations[:10]: + lines.append( + f" [{inv.get('confidence', '?')}] {inv.get('root_cause', '')[:80]}" + f" (type={inv.get('issue_type', '?')})" + ) + return "\n".join(lines) + + +def _format_narratives_for_prompt(session_narratives: list[dict]) -> str: + if not session_narratives: + return " (no session narratives available — no invocations this window)" + parts = [] + for n in session_narratives[:10]: + parts.append(n.get("narrative_text", "")[:600]) + return "\n\n".join(parts) diff --git a/observability/local/log-sentinel/tests/__init__.py b/observability/local/log-sentinel/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/observability/local/log-sentinel/timeline.py b/observability/local/log-sentinel/timeline.py new file mode 100644 index 0000000..649961e --- /dev/null +++ b/observability/local/log-sentinel/timeline.py @@ -0,0 +1,200 @@ +"""Cross-stream timeline builder — correlates events from all Loki streams.""" + +import logging +from dataclasses import dataclass, field +from datetime import datetime, timezone + +from loki_client import LokiClient +from circuit_breaker import CircuitBreaker + +logger = logging.getLogger("sentinel.timeline") + +# Streams to query and their display names +STREAMS = [ + ("sim-steward", '{app="sim-steward"} | json'), + ("claude-dev-logging", '{app="claude-dev-logging"} | json'), + ("claude-token-metrics", '{app="claude-token-metrics"} | json'), +] + +# Events to exclude from the timeline (too noisy) +_SKIP_EVENTS = {"sentinel_log", "sentinel_cycle", "sentinel_analyst_run", "sentinel_timeline_built"} + +# Temporal correlation window (nanoseconds) +_TEMPORAL_WINDOW_NS = 30 * 1_000_000_000 + + +@dataclass +class TimelineEvent: + ts_ns: int + ts_iso: str + stream: str + event_type: str + domain: str + 
component: str + message: str + session_id: str | None + subsession_id: str | None + raw: dict = field(repr=False) + + +class TimelineBuilder: + def __init__(self, loki: LokiClient, breaker: CircuitBreaker): + self.loki = loki + self.breaker = breaker + + def build( + self, + start_ns: int, + end_ns: int, + limit_per_stream: int = 200, + ) -> list[TimelineEvent]: + """Query all streams, merge and sort chronologically.""" + if not self.breaker.allow_request(): + logger.warning("Timeline build skipped: Loki circuit open") + return [] + + all_events: list[TimelineEvent] = [] + try: + for stream_name, logql in STREAMS: + lines = self.loki.query_lines(logql, start_ns, end_ns, limit=limit_per_stream) + self.breaker.record_success() + for line in lines: + ev = self._parse_event(stream_name, line) + if ev: + all_events.append(ev) + except Exception as e: + self.breaker.record_failure() + logger.error("Timeline build error: %s", e) + return all_events + + all_events.sort(key=lambda e: e.ts_ns) + return all_events + + def _parse_event(self, stream: str, line: dict) -> TimelineEvent | None: + event_type = line.get("event", "") + if event_type in _SKIP_EVENTS: + return None + + # Parse timestamp — prefer the log's own timestamp field, fallback to now + ts_ns = 0 + ts_iso = line.get("timestamp", "") + if ts_iso: + try: + dt = datetime.fromisoformat(ts_iso.replace("Z", "+00:00")) + ts_ns = int(dt.timestamp() * 1e9) + except (ValueError, TypeError): + pass + if not ts_ns: + ts_ns = self.loki.now_ns() + ts_iso = datetime.now(timezone.utc).isoformat() + + return TimelineEvent( + ts_ns=ts_ns, + ts_iso=ts_iso, + stream=stream, + event_type=event_type or "unknown", + domain=line.get("domain", ""), + component=line.get("component", ""), + message=line.get("message", ""), + session_id=line.get("session_id") or None, + subsession_id=line.get("subsession_id") or None, + raw=line, + ) + + def get_active_sessions(self, events: list[TimelineEvent]) -> list[str]: + """Return distinct 
session_ids seen in the event list.""" + seen = [] + for ev in events: + if ev.session_id and ev.session_id not in seen: + seen.append(ev.session_id) + return seen + + def to_prompt_text(self, events: list[TimelineEvent], max_events: int = 60) -> str: + """Format timeline as human-readable numbered lines for LLM consumption.""" + if not events: + return "(no events in this window)" + + truncated = len(events) > max_events + shown = events[-max_events:] if truncated else events + + # Group by session_id + sessions: dict[str, list[TimelineEvent]] = {} + no_session: list[TimelineEvent] = [] + + for ev in shown: + if ev.session_id: + sessions.setdefault(ev.session_id, []).append(ev) + else: + no_session.append(ev) + + lines = [] + counter = 1 + + for sid, evts in sessions.items(): + # Find subsession if present + sub = next((e.subsession_id for e in evts if e.subsession_id), None) + header = f"SESSION {sid[:8]}" + if sub: + header += f" [subsession {sub}]" + lines.append(header) + for ev in evts: + lines.append(_format_event_line(counter, ev)) + counter += 1 + lines.append("") + + if no_session: + lines.append("CO-OCCURRING (no session correlation)") + for ev in no_session: + lines.append(_format_event_line(counter, ev)) + counter += 1 + + if truncated: + lines.append( + f"\n[NOTE: {len(events) - max_events} earlier events not shown. " + f"Earliest: {events[0].ts_iso}, Latest: {events[-1].ts_iso}]" + ) + + return "\n".join(lines) + + def get_stats(self, events: list[TimelineEvent]) -> dict: + sessions = self.get_active_sessions(events) + streams = list({e.stream for e in events}) + return { + "event_count": len(events), + "session_count": len(sessions), + "streams_queried": streams, + } + + +def _format_event_line(idx: int, ev: TimelineEvent) -> str: + # Extract time portion only (HH:MM:SS) + try: + t = ev.ts_iso[11:19] + except (IndexError, TypeError): + t = "??:??:??" 
+ + # Pick the most informative extra field from raw + extra = _pick_extra(ev) + extra_str = f" {extra}" if extra else "" + + return ( + f" [{idx:03d}] {t} {ev.stream:<25} {ev.event_type:<30}{extra_str}" + ) + + +def _pick_extra(ev: TimelineEvent) -> str: + """Extract a short key=value summary from the raw event for the timeline.""" + raw = ev.raw + candidates = [ + ("action", raw.get("action")), + ("tool", raw.get("tool_name")), + ("event_type", raw.get("hook_type")), + ("track", raw.get("track_display_name")), + ("driver", raw.get("display_name")), + ("cost_usd", raw.get("cost_usd")), + ("tokens", raw.get("total_tokens")), + ("error", raw.get("error")), + ("duration_ms", raw.get("duration_ms")), + ] + parts = [f"{k}={v}" for k, v in candidates if v is not None and v != ""] + return " ".join(parts[:3]) diff --git a/observability/local/log-sentinel/trace.py b/observability/local/log-sentinel/trace.py new file mode 100644 index 0000000..5d336a4 --- /dev/null +++ b/observability/local/log-sentinel/trace.py @@ -0,0 +1,225 @@ +"""Feature invocation model — groups timeline events into traceable user actions. + +Three correlation strategies (applied in order): + 1. trace_id exact — events share a trace_id field (plugin + dashboard instrumented) + 2. temporal — events cluster within 150ms with expected sequence patterns + 3. 
inferred — fallback: group by session_id + 1-minute time bucket +""" + +import logging +import uuid +from dataclasses import dataclass, field + +from timeline import TimelineEvent + +logger = logging.getLogger("sentinel.trace") + +# Temporal grouping window (nanoseconds) +_TEMPORAL_WINDOW_NS = 150_000_000 # 150ms + +# Events that anchor the start of a new invocation in temporal mode +_ANCHOR_EVENTS = { + "dashboard_ui_event", + "action_dispatched", + "iracing_session_start", + "iracing_replay_seek", +} + +# Events that signal the end of an invocation +_TERMINAL_EVENTS = { + "action_result", + "iracing_session_end", +} + +# Inferred grouping bucket (nanoseconds) +_BUCKET_NS = 60 * 1_000_000_000 # 1 minute + + +@dataclass +class FeatureInvocation: + invocation_id: str # trace_id if available, else generated UUID + correlation_method: str # "trace_id" | "temporal" | "inferred" + start_ts_ns: int + end_ts_ns: int + action_type: str # "replay_seek" | "incident_review" | "session_start" | etc. + trigger_event: TimelineEvent # first event in this invocation + events: list[TimelineEvent] # all events belonging to this invocation + success: bool | None # did the feature complete? 
None = unknown + error: str | None # error message if failed + duration_ms: int + streams_involved: list[str] # which Loki streams contributed events + + def to_summary_dict(self) -> dict: + """Compact serializable summary for Loki push and LLM context.""" + return { + "invocation_id": self.invocation_id, + "correlation_method": self.correlation_method, + "action_type": self.action_type, + "success": self.success, + "error": self.error, + "duration_ms": self.duration_ms, + "event_count": len(self.events), + "streams": self.streams_involved, + "start_ts_ns": self.start_ts_ns, + "end_ts_ns": self.end_ts_ns, + } + + +class InvocationBuilder: + """Groups a flat list of TimelineEvents into FeatureInvocation objects.""" + + def build(self, events: list[TimelineEvent]) -> list[FeatureInvocation]: + """ + Returns invocations built from the event list. + Events are consumed across three passes; any event can only belong to one invocation. + """ + remaining = list(events) + invocations: list[FeatureInvocation] = [] + + # Pass 1 — exact trace_id grouping + trace_invocations, remaining = self._group_by_trace_id(remaining) + invocations.extend(trace_invocations) + + # Pass 2 — temporal window grouping + temporal_invocations, remaining = self._group_temporal(remaining) + invocations.extend(temporal_invocations) + + # Pass 3 — inferred (session + time bucket) + inferred_invocations = self._group_inferred(remaining) + invocations.extend(inferred_invocations) + + logger.debug( + "InvocationBuilder: %d events → %d invocations (%d trace_id, %d temporal, %d inferred)", + len(events), + len(invocations), + len(trace_invocations), + len(temporal_invocations), + len(inferred_invocations), + ) + return sorted(invocations, key=lambda i: i.start_ts_ns) + + # ── Pass 1: exact trace_id ───────────────────────────────────────────── + + def _group_by_trace_id( + self, events: list[TimelineEvent] + ) -> tuple[list[FeatureInvocation], list[TimelineEvent]]: + groups: dict[str, list[TimelineEvent]] 
= {} + leftover: list[TimelineEvent] = [] + + for ev in events: + tid = ev.raw.get("trace_id") + if tid: + groups.setdefault(tid, []).append(ev) + else: + leftover.append(ev) + + invocations = [ + _build_invocation(group, "trace_id", trace_id=tid) + for tid, group in groups.items() + ] + return invocations, leftover + + # ── Pass 2: temporal window ──────────────────────────────────────────── + + def _group_temporal( + self, events: list[TimelineEvent] + ) -> tuple[list[FeatureInvocation], list[TimelineEvent]]: + if not events: + return [], [] + + sorted_events = sorted(events, key=lambda e: e.ts_ns) + groups: list[list[TimelineEvent]] = [] + current: list[TimelineEvent] = [] + + for ev in sorted_events: + if not current: + current = [ev] + continue + + gap = ev.ts_ns - current[-1].ts_ns + is_anchor = ev.event_type in _ANCHOR_EVENTS + + if is_anchor or gap > _TEMPORAL_WINDOW_NS: + if current: + groups.append(current) + current = [ev] + else: + current.append(ev) + + if current: + groups.append(current) + + # Drop single-event groups with no action signal — too noisy + meaningful = [g for g in groups if len(g) > 1 or g[0].event_type in _ANCHOR_EVENTS] + leftover = [ev for g in groups if g not in meaningful for ev in g] + + invocations = [_build_invocation(g, "temporal") for g in meaningful] + return invocations, leftover + + # ── Pass 3: inferred (session + time bucket) ─────────────────────────── + + def _group_inferred(self, events: list[TimelineEvent]) -> list[FeatureInvocation]: + if not events: + return [] + + buckets: dict[str, list[TimelineEvent]] = {} + for ev in events: + sid = ev.session_id or "no_session" + bucket = ev.ts_ns // _BUCKET_NS + key = f"{sid}:{bucket}" + buckets.setdefault(key, []).append(ev) + + return [_build_invocation(group, "inferred") for group in buckets.values()] + + +# ── Helpers ──────────────────────────────────────────────────────────────── + +def _build_invocation( + events: list[TimelineEvent], + method: str, + trace_id: str | 
None = None, +) -> FeatureInvocation: + sorted_events = sorted(events, key=lambda e: e.ts_ns) + start_ns = sorted_events[0].ts_ns + end_ns = sorted_events[-1].ts_ns + duration_ms = max(0, (end_ns - start_ns) // 1_000_000) + + # action_type: prefer action_dispatched.raw["action"], else trigger event_type + action_type = "unknown" + for ev in sorted_events: + if ev.event_type == "action_dispatched": + action_type = ev.raw.get("action") or ev.event_type + break + if action_type == "unknown": + action_type = sorted_events[0].event_type or "unknown" + + # success / error: look for terminal events + success: bool | None = None + error: str | None = None + for ev in sorted_events: + if ev.event_type in _TERMINAL_EVENTS or ev.event_type.endswith("_result"): + raw_success = ev.raw.get("success") + raw_error = ev.raw.get("error") + if raw_error: + success = False + error = str(raw_error)[:200] + break + if raw_success is not None: + success = bool(raw_success) + break + + streams = list({ev.stream for ev in sorted_events}) + + return FeatureInvocation( + invocation_id=trace_id or str(uuid.uuid4()), + correlation_method=method, + start_ts_ns=start_ns, + end_ts_ns=end_ns, + action_type=action_type, + trigger_event=sorted_events[0], + events=sorted_events, + success=success, + error=error, + duration_ms=duration_ms, + streams_involved=streams, + ) diff --git a/observability/local/logs/claude-session-metrics.jsonl b/observability/local/logs/claude-session-metrics.jsonl new file mode 100644 index 0000000..031585b --- /dev/null +++ b/observability/local/logs/claude-session-metrics.jsonl @@ -0,0 +1 @@ 
+{"event":"claude_session_metrics","session_id":"94406f61-7d81-49e0-8b78-cea2011dff2e","project":"local","machine":"WIN-PC","env":"local","timestamp":"2026-03-26T21:22:07.242Z","total_input_tokens":669,"total_output_tokens":142861,"total_cache_creation_tokens":1867125,"total_cache_read_tokens":86930799,"total_tokens":143530,"assistant_turns":422,"tool_use_count":0,"model":"claude-opus-4-6","effort":"med","thinking":true,"cost_usd":176.1294} diff --git a/src/SimSteward.Plugin/SimStewardPlugin.DataCaptureSuite.cs b/src/SimSteward.Plugin/SimStewardPlugin.DataCaptureSuite.cs index 1175f9c..5255eaf 100644 --- a/src/SimSteward.Plugin/SimStewardPlugin.DataCaptureSuite.cs +++ b/src/SimSteward.Plugin/SimStewardPlugin.DataCaptureSuite.cs @@ -419,12 +419,13 @@ private void TickPreflight() return; } - // Seek to near-end of replay for L2 - int seekTarget = Math.Max(0, _replayFrameTotal - 10); + // Seek to end of replay using ReplaySearch(ToEnd) — more reliable than + // frame-based seek (ReplayFrameNumEnd can be 0 or stale, which would + // seek to frame 0 and read SessionState at replay start instead of end). + _preflightSettleTicks = 0; try { - _irsdk.ReplaySetPlaySpeed(1, false); - _irsdk.ReplaySetPlayPosition(IRacingSdkEnum.RpyPosMode.Begin, seekTarget); + _irsdk.ReplaySearch(IRacingSdkEnum.RpySrchMode.ToEnd); } catch (Exception ex) { @@ -444,8 +445,10 @@ private void TickPreflight() { _preflightSettleTicks++; int frame = SafeGetInt("ReplayFrameNum"); - int seekTarget = Math.Max(0, _replayFrameTotal - 10); - if (Math.Abs(frame - seekTarget) <= 30 || _preflightSettleTicks > 300) + // ReplaySearch(ToEnd) is fire-and-forget; we don't have an exact target frame. + // Settle when: near ReplayFrameNumEnd (if valid) OR after 60 ticks (1s min wait). 
+ bool nearEnd = _replayFrameTotal > 0 && frame >= _replayFrameTotal - 60; + if (nearEnd || _preflightSettleTicks >= 60 || _preflightSettleTicks > 300) { int sessionState = 0; try { sessionState = _irsdk.Data.GetInt("SessionState"); } catch { } diff --git a/token-cost-dashboard.png b/token-cost-dashboard.png new file mode 100644 index 0000000..742ae82 Binary files /dev/null and b/token-cost-dashboard.png differ