diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
index dedcaf7..411818c 100644
--- a/.claude/CLAUDE.md
+++ b/.claude/CLAUDE.md
@@ -150,4 +150,31 @@ All session context fields fall back to `"not in session"` (use `SessionLogging.
- [ ] New iRacing SDK event handler → structured log with `domain="iracing"`
- [ ] `iracing_incident` / `incident_detected` log → full uniqueness signature (`unique_user_id`, start/end frame, camera)
-**Canonical reference:** [docs/RULES-ActionCoverage.md](../docs/RULES-ActionCoverage.md)
\ No newline at end of file
+**Canonical reference:** [docs/RULES-ActionCoverage.md](../docs/RULES-ActionCoverage.md)
+
+---
+
+## Grafana Alert Covenant
+
+Every behavioral change to the plugin, dashboard, or LLM integration MUST include a Grafana alert review. **Alert silence ≠ alert passing.**
+
+### Change → Domain quick-reference
+
+| Change type | Domain to check |
+|---|---|
+| New `DispatchAction` branch | Domain 3 — `action-failure-streak` thresholds |
+| New iRacing SDK event | Domains 3 + 7 — session/replay rules |
+| New Claude API integration | Domains 4 + 5 — session health + cost |
+| New MCP tool | Domain 4 — `mcp-service-errors`, `tool-loop-detected` |
+| Log event renamed/removed | Search alert YAMLs — alert will go **silent**, not fire |
+| New log event/field | Consider whether a new alert rule is warranted |
+| Sentinel code change | Domain 6 — self-health rules |
+
+### PR Checklist addition
+
+- [ ] Reviewed impacted Grafana alert domains (see table above)
+- [ ] Verified no alert queries break silently if log events were renamed/removed
+- [ ] Considered new alert rule if new log events were added
+
+**Alert YAML files:** `observability/local/grafana/provisioning/alerting/` (46 rules, 8 domains)
+**Canonical reference:** [docs/RULES-GrafanaAlerts.md](../docs/RULES-GrafanaAlerts.md)
\ No newline at end of file
diff --git a/.playwright-mcp/console-2026-03-26T21-06-47-084Z.log b/.playwright-mcp/console-2026-03-26T21-06-47-084Z.log
new file mode 100644
index 0000000..c4778cd
--- /dev/null
+++ b/.playwright-mcp/console-2026-03-26T21-06-47-084Z.log
@@ -0,0 +1,2 @@
+[ 885ms] [WARNING] is deprecated. Please include @ http://localhost:3000/login:0
+[ 23600ms] [ERROR] Failed to load resource: the server responded with a status of 401 (Unauthorized) @ http://localhost:3000/login:0
diff --git a/.playwright-mcp/console-2026-03-26T21-07-25-658Z.log b/.playwright-mcp/console-2026-03-26T21-07-25-658Z.log
new file mode 100644
index 0000000..6b1e95a
--- /dev/null
+++ b/.playwright-mcp/console-2026-03-26T21-07-25-658Z.log
@@ -0,0 +1,3 @@
+[ 292ms] [WARNING] is deprecated. Please include @ http://localhost:3000/login:0
+[ 15683ms] [ERROR] Failed to load resource: the server responded with a status of 401 (Unauthorized) @ http://localhost:3000/login:0
+[ 49060ms] [ERROR] Failed to load resource: the server responded with a status of 401 (Unauthorized) @ http://localhost:3000/login:0
diff --git a/.playwright-mcp/console-2026-03-26T21-11-17-962Z.log b/.playwright-mcp/console-2026-03-26T21-11-17-962Z.log
new file mode 100644
index 0000000..1b67327
--- /dev/null
+++ b/.playwright-mcp/console-2026-03-26T21-11-17-962Z.log
@@ -0,0 +1,2 @@
+[ 602ms] [WARNING] is deprecated. Please include @ http://localhost:3000/login:0
+[ 18682ms] [WARNING] is deprecated. Please include @ http://localhost:3000/d/claude-token-cost?orgId=1&from=now-7d&to=now&kiosk:0
diff --git a/.playwright-mcp/console-2026-03-26T21-11-55-773Z.log b/.playwright-mcp/console-2026-03-26T21-11-55-773Z.log
new file mode 100644
index 0000000..a28d881
--- /dev/null
+++ b/.playwright-mcp/console-2026-03-26T21-11-55-773Z.log
@@ -0,0 +1 @@
+[ 648ms] [WARNING] is deprecated. Please include @ http://localhost:3000/d/claude-cache-context?orgId=1&from=now-7d&to=now:0
diff --git a/.playwright-mcp/console-2026-03-26T21-12-23-787Z.log b/.playwright-mcp/console-2026-03-26T21-12-23-787Z.log
new file mode 100644
index 0000000..c11747f
--- /dev/null
+++ b/.playwright-mcp/console-2026-03-26T21-12-23-787Z.log
@@ -0,0 +1,2 @@
+[ 636ms] [WARNING] is deprecated. Please include @ http://localhost:3000/d/simsteward-log-sentinel?orgId=1&from=now-6h&to=now:0
+[ 59082ms] [ERROR] WebSocket connection to 'ws://localhost:3000/api/live/ws' failed: Connection closed before receiving a handshake response @ http://localhost:3000/public/build/3855.c53eb219979d7cb3b2d4.js:1312
diff --git a/cache-context-dashboard.png b/cache-context-dashboard.png
new file mode 100644
index 0000000..000cc30
Binary files /dev/null and b/cache-context-dashboard.png differ
diff --git a/docs/RULES-GrafanaAlerts.md b/docs/RULES-GrafanaAlerts.md
new file mode 100644
index 0000000..7692acd
--- /dev/null
+++ b/docs/RULES-GrafanaAlerts.md
@@ -0,0 +1,93 @@
+# Grafana Alert Rules — Development Covenant
+
+Every behavioral change to the plugin, dashboard, or LLM integration **must include a
+corresponding Grafana alert review**. Silence is not the same as passing.
+
+**Canonical spec:** `docs/superpowers/specs/2026-03-30-grafana-alerts-design.md`
+**Alert YAML files:** `observability/local/grafana/provisioning/alerting/`
+
+---
+
+## Change → Domain Mapping
+
+| Change type | Domain(s) to review |
+|---|---|
+| New action handler in `DispatchAction` | Domain 3 — check `action-failure-streak` thresholds |
+| New iRacing SDK event handler | Domain 3 and/or Domain 7 — check incident/replay rules |
+| New Claude API integration | Domains 4 + 5 — session health and cost rules |
+| New MCP tool added | Domain 4 — `mcp-service-errors`, `tool-loop-detected` |
+| New log event or field added | Check all domains — does it need a new alert? |
+| Removing or renaming a log event | Search alert YAMLs for old name — alert will go **silent**, not fire |
+| Changing cost fields in token metrics | Domain 5 — all cost threshold alerts |
+| Changing session lifecycle events | Domains 3, 4, 8 — session start/end correlation |
+| Sentinel code change | Domain 6 — self-health rules |
+| Grafana dashboard change | Domain 8 — cross-stream rules may need annotation updates |
+
+---
+
+## Alert Silence ≠ Alert Passing
+
+When you rename or remove a log event:
+- The alert query will return **no data** (not 0)
+- If `noDataState: OK` — the alert silently stops firing
+- This is a **silent regression** — harder to detect than a real alert
+
+Always check `noDataState` when modifying events that existing alerts depend on.
+
+---
+
+## Testing New Alerts
+
+To verify an alert fires correctly before relying on it:
+
+1. **Write a test event to Loki** via the gateway:
+ ```bash
+ curl -X POST http://localhost:3500/loki/api/v1/push \
+ -H "Content-Type: application/json" \
+ -d '{
+ "streams": [{
+ "stream": {"app": "sim-steward", "env": "local", "level": "ERROR"},
+ "values": [["'"$(date +%s%N)"'", "{\"level\":\"ERROR\",\"event\":\"test\",\"message\":\"test alert\"}"]]
+ }]
+ }'
+ ```
+
+2. **Temporarily lower the threshold** in the alert rule to `0` and set the evaluation interval to `10s` in Grafana UI (do not commit this change).
+
+3. **Verify the alert fires** in Grafana UI → Alerting → Alert Rules within the evaluation window.
+
+4. **Verify the `/trigger` webhook** receives the payload:
+ ```bash
+ # Check log-sentinel logs
+ docker compose logs log-sentinel --tail=20
+ ```
+
+5. **Restore the threshold** before committing any YAML changes.
+
+---
+
+## Alert Catalog Summary
+
+| File | Domains | Count |
+|---|---|---|
+| `rules-infrastructure.yml` | 1+2: Infrastructure & Deploy Quality | 10 |
+| `rules-iracing.yml` | 3+7: iRacing Session + Replay | 10 |
+| `rules-claude-sessions.yml` | 4: Claude Code Session Health | 7 |
+| `rules-token-cost.yml` | 5: Token & Cost Budget | 7 |
+| `rules-sentinel-health.yml` | 6: Sentinel Self-Health | 7 |
+| `rules-cross-stream.yml` | 8: Cross-Stream Correlation | 5 |
+| **Total** | | **46** |
+
+T2-tier alerts (skip `needs_t2` gate, escalate immediately):
+`subagent-explosion`, `tool-loop-detected`, `session-cost-critical`, `daily-spend-critical`,
+`ws-claude-coinflict`, `session-token-abandon`, `action-fail-session-fail`, `deploy-triple-signal`
+
+---
+
+## PR Checklist Addition
+
+For any PR modifying plugin behavior, add to the review checklist:
+
+- [ ] Reviewed Grafana alert domains for impacted change type (see table above)
+- [ ] If log events were renamed/removed: verified no alert queries silently break
+- [ ] If new log events added: considered whether a new alert rule is warranted
diff --git a/docs/superpowers/specs/2026-03-30-grafana-alerts-design.md b/docs/superpowers/specs/2026-03-30-grafana-alerts-design.md
new file mode 100644
index 0000000..88c365c
--- /dev/null
+++ b/docs/superpowers/specs/2026-03-30-grafana-alerts-design.md
@@ -0,0 +1,218 @@
+# Grafana Alerts Design — Log Sentinel Layer 0
+**Date:** 2026-03-30
+**Status:** Approved
+
+---
+
+## Context
+
+The log-sentinel V2 LLM investigation pipeline (T1 triage + T2 agentic tool loop) is expensive to run continuously — qwen3:8b T1 scan on a 6700 XT takes 60-90 seconds, T2 takes 3-4 minutes. Running this on a fixed hourly poll means real incidents can sit undetected for up to 60 minutes, and the models waste cycles on quiet periods.
+
+Grafana Alerts solves this as **Layer 0**: always-on, no GPU cost, fires webhooks only when something is actually wrong. The sentinel switches from polling to event-driven. When Grafana fires, it delivers structured alert context (labels, values, timeframe) directly in the webhook payload — T1 skips cold-start gathering for the relevant domain and goes straight to targeted investigation.
+
+**Layer 0 (Grafana Alerts) → Layer 1 (T1 fast triage) → Layer 2 (T2 agentic tool loop)**
+
+---
+
+## Alert Architecture
+
+### Transport: Webhook-Only
+Grafana alert notifications route exclusively to log-sentinel's `/trigger` HTTP endpoint. No email, Slack, or PagerDuty at this stage. The sentinel logs every trigger, runs the appropriate tier, and emits findings to Loki (queryable by Grafana dashboards).
+
+### Provisioning Structure
+All alerts are provisioned as code — no manual UI configuration:
+```
+observability/local/grafana/provisioning/alerting/
+ contact-points.yml # webhook endpoint definition
+ notification-policies.yml # routing: all alerts → webhook
+ rules-infrastructure.yml # Domains 1+2
+ rules-iracing.yml # Domain 3+7
+ rules-claude-sessions.yml # Domain 4
+ rules-token-cost.yml # Domain 5
+ rules-sentinel-health.yml # Domain 6
+ rules-cross-stream.yml # Domain 8
+```
+
+### Trigger Tier Labeling
+Every alert rule carries a `trigger_tier` label (`t1` or `t2`). The sentinel's `/trigger` endpoint reads this label and routes accordingly — T1 for most alerts, T2 for critical multi-signal correlations.
+
+---
+
+## Alert Catalog
+
+### Domain 1+2: Infrastructure & Deploy Quality (10 alerts)
+
+| Alert ID | LogQL / Condition | Severity | Tier |
+|---|---|---|---|
+| `bridge-start-failed` | `count_over_time({app="sim-steward"} \| json \| event="plugin_lifecycle" \| level="ERROR" [5m]) > 0` | critical | T1 |
+| `plugin-never-ready` | plugin_lifecycle start, no ready within 60s | warn | T1 |
+| `sentinel-cycle-stalled` | No `sentinel_cycle` event in 90 min | critical | T1 |
+| `ollama-unreachable` | `sentinel_health` event with `ollama_reachable=false` | critical | T1 |
+| `loki-circuit-open` | `sentinel_health` with `loki_circuit_open=true` | critical | T1 |
+| `post-deploy-warn-rate` | WARN rate > 5/min in 10 min after lifecycle event | warn | T1 |
+| `bridge-failure-post-deploy` | ERROR in sim-steward within 15 min of plugin_start | critical | T1 |
+| `plugin-slow-start` | Time from plugin_lifecycle start → ready > 30s | warn | T1 |
+| `error-spike-post-deploy` | Error count doubles vs prior 15 min window after deploy | warn | T1 |
+| `error-spike-general` | `count_over_time({app="sim-steward"} \| json \| level="ERROR" [10m]) > 10` | warn | T1 |
+
+### Domain 3: iRacing Session Behavior (5 alerts)
+
+| Alert ID | Condition | Severity | Tier |
+|---|---|---|---|
+| `session-no-actions` | Session active 15+ min, zero `action_dispatched` events | warn | T1 |
+| `session-no-end` | `iracing_session_start` with no `iracing_session_end` within 4h | warn | T1 |
+| `action-failure-streak` | 3+ consecutive `action_result` errors in same session | critical | T1 |
+| `websocket-disconnect-spike` | 3+ `websocket_disconnect` events in 5 min | warn | T1 |
+| `incident-detection-zero` | iRacing session > 30 min, zero `iracing_incident` events | warn | T1 |
+
+### Domain 4: Claude Code Session Health (7 alerts)
+
+| Alert ID | Condition | Severity | Tier |
+|---|---|---|---|
+| `session-abandoned` | Session start, no completion token entry, no activity for 30 min | warn | T1 |
+| `claude-error-spike` | 5+ ERROR entries in claude-dev-logging in 5 min | warn | T1 |
+| `permission-flood` | 10+ permission-related log entries in 5 min | warn | T1 |
+| `subagent-explosion` | Subagent spawn count > 20 in single session | warn | T2 |
+| `mcp-service-errors` | MCP call failures > 5 in 10 min | warn | T1 |
+| `tool-loop-detected` | Same tool called 5+ times in same session without progress | warn | T2 |
+| `session-zero-output` | Session completes (token entry exists), zero assistant messages logged | warn | T1 |
+
+### Domain 5: Token/Cost Budget (7 alerts)
+
+| Alert ID | Condition | Severity | Tier |
+|---|---|---|---|
+| `session-cost-spike` | Single session cost > $1.00 | warn | T1 |
+| `session-cost-critical` | Single session cost > $3.00 | critical | T2 |
+| `daily-spend-warning` | Rolling 24h spend > $10.00 | warn | T1 |
+| `daily-spend-critical` | Rolling 24h spend > $25.00 | critical | T2 |
+| `tool-use-flood` | Tool calls per session > 100 | warn | T1 |
+| `unexpected-model` | Model field not in approved set (claude-opus-4, claude-sonnet-4-6, etc.) | warn | T1 |
+| `cache-hit-rate-low` | Cache hit rate < 20% over 1h (when sessions active) | info | T1 |
+
+### Domain 6: Sentinel Self-Health (7 alerts)
+
+| Alert ID | Condition | Severity | Tier |
+|---|---|---|---|
+| `sentinel-cycle-stalled` | No `sentinel_cycle` event in 90 min *(same alert ID as the Domain 1+2 entry — Grafana requires unique rule UIDs, so dedupe or rename one before provisioning)* | critical | T1 |
+| `detector-error-rate` | Detector errors > 3 in single cycle | warn | T1 |
+| `t1-slow` | T1 inference duration > 120s | warn | T1 |
+| `t2-slow` | T2 tool loop duration > 300s | warn | T1 |
+| `sentry-flood` | Sentry-worthy findings > 5 in 1h | warn | T1 |
+| `findings-flood` | Total findings > 20 in single cycle | warn | T1 |
+| `zero-findings-48h` | No findings at all in 48h (system may be suppressing) | info | T1 |
+
+### Domain 7: Replay & Incident Investigation (5 alerts)
+
+| Alert ID | Condition | Severity | Tier |
+|---|---|---|---|
+| `replay-no-seeks` | Replay started, zero `iracing_replay_seek` in 5 min | warn | T1 |
+| `incident-detection-stall` | iRacing session active > 30 min, zero `iracing_incident` events in replay mode | warn | T1 |
+| `incident-camera-stuck` | Same `camera_view` on 3+ consecutive incidents | info | T1 |
+| `replay-session-no-close` | Replay session start, no session_end within 2h | warn | T1 |
+| `action-incident-gap` | Incident detected, no `action_dispatched` within 10 min | info | T1 |
+
+### Domain 8: Cross-Stream Correlation (5 alerts)
+*Implemented as multi-query rules using Grafana `math` expressions — they fire only when both conditions are true simultaneously.*
+
+| Alert ID | Streams | Condition | Severity | Tier |
+|---|---|---|---|---|
+| `ws-claude-coinflict` | sim-steward + claude-dev-logging | WebSocket disconnect + Claude ERROR in same 5-min window *(note: "coinflict" [sic] is the canonical spelling of this alert ID — it is used in uid/alertname routing; rename everywhere or nowhere)* | warn | T2 |
+| `session-token-abandon` | claude-dev-logging + claude-token-metrics | Session ERROR + no token entry for that session_id | warn | T2 |
+| `action-fail-session-fail` | sim-steward + claude-dev-logging | `action_result` errors + Claude session ERROR within 10 min | critical | T2 |
+| `deploy-triple-signal` | all 3 streams | 2+ streams show elevated error rate within 15 min of plugin lifecycle event | critical | T2 |
+| `cost-spike-tool-flood` | claude-dev-logging + claude-token-metrics | Tool call count spike + session cost spike in same cycle | warn | T1 |
+
+**Total: 46 alerts across 8 domains.**
+
+---
+
+## `/trigger` Endpoint Design
+
+The log-sentinel app gains a new HTTP endpoint:
+
+```
+POST /trigger
+Content-Type: application/json
+
+{
+ "alerts": [{
+ "labels": {
+ "alertname": "ws-claude-coinflict",
+ "trigger_tier": "t2",
+ "severity": "warn"
+ },
+ "annotations": {
+ "summary": "WebSocket disconnects co-occurring with Claude errors",
+ "description": "3 ws_disconnect events and 2 Claude ERROR entries in 5-min window ending 14:32:00"
+ },
+ "startsAt": "2026-03-30T14:32:00Z",
+ "endsAt": "0001-01-01T00:00:00Z"
+ }]
+}
+```
+
+Sentinel behavior on receipt:
+1. Parse alert labels — extract `alertname`, `trigger_tier`, `severity`
+2. Derive lookback window from `startsAt` (default: 30 min before alert fired)
+3. If `trigger_tier=t1`: run T1 with alert context injected into summary prompt
+4. If `trigger_tier=t2`: run T1 (for context) then immediately run T2 — skip the `needs_t2` gate
+5. Deduplicate: if the same `alertname` triggered within `SENTINEL_DEDUP_WINDOW_SEC`, skip
+6. Log `sentinel_trigger` event to Loki with alert metadata
+
+Alert context injection into T1 prompt:
+```
+ALERT CONTEXT (from Grafana):
+ Alert: ws-claude-coinflict (warn)
+ Fired: 2026-03-30 14:32:00 UTC
+ Description: 3 ws_disconnect events and 2 Claude ERROR entries in 5-min window
+ → Focus investigation on this signal. Do not suppress even if recent history is quiet.
+```
+
+---
+
+## Alert Covenant (Living Document)
+
+**Every behavioral change to the plugin, dashboard, or LLM integration must include a corresponding Grafana alert review.**
+
+When adding or changing:
+- A new action handler → check Domain 3 (action-failure-streak thresholds)
+- A new Claude integration → check Domain 4 + 5
+- A new log event or field → check if it should trigger an alert in the relevant domain
+- Removing a log event → check if any alert depends on it (alert will go silent, not fire)
+
+Alert silence ≠ alert passing. Test new alerts by writing a test event to Loki via the gateway and verifying the alert fires within its evaluation window.
+
+**Canonical reference: `docs/RULES-GrafanaAlerts.md`** (to be added to CLAUDE.md)
+
+---
+
+## Implementation Files
+
+### New files
+- `observability/local/grafana/provisioning/alerting/contact-points.yml`
+- `observability/local/grafana/provisioning/alerting/notification-policies.yml`
+- `observability/local/grafana/provisioning/alerting/rules-infrastructure.yml`
+- `observability/local/grafana/provisioning/alerting/rules-iracing.yml`
+- `observability/local/grafana/provisioning/alerting/rules-claude-sessions.yml`
+- `observability/local/grafana/provisioning/alerting/rules-token-cost.yml`
+- `observability/local/grafana/provisioning/alerting/rules-sentinel-health.yml`
+- `observability/local/grafana/provisioning/alerting/rules-cross-stream.yml`
+- `docs/RULES-GrafanaAlerts.md`
+
+### Modified files
+- `observability/local/log-sentinel/app.py` — add `POST /trigger` endpoint
+- `observability/local/log-sentinel/sentinel.py` — add `trigger_cycle()` method (alert-context-aware T1/T2 dispatch)
+- `observability/local/log-sentinel/config.py` — no new fields needed (uses existing dedup window)
+- `observability/local/docker-compose.yml` — no changes needed (grafana already provisioned, port 3000)
+- `.claude/CLAUDE.md` — add alert covenant reference
+
+---
+
+## Verification
+
+1. **Provisioning loads**: `docker compose up grafana` — check Grafana UI → Alerting → Alert Rules shows all 46 rules
+2. **Webhook fires**: Manually set an alert rule to always-firing in Grafana UI, verify `/trigger` receives POST and logs `sentinel_trigger` event to Loki
+3. **T1 trigger path**: Confirm T1 runs after a non-critical alert fires, `sentinel_analyst_run` appears in logs with `trigger_source=grafana_alert`
+4. **T2 direct trigger**: Confirm T2 runs immediately (skipping `needs_t2` gate) when `trigger_tier=t2` alert fires
+5. **Dedup**: Fire same alert twice within dedup window, verify second is silently skipped
+6. **Cross-stream rule**: Write test events to both sim-steward and claude-dev-logging streams via Loki push API, verify `ws-claude-coinflict` fires
diff --git a/log-sentinel-dashboard.png b/log-sentinel-dashboard.png
new file mode 100644
index 0000000..533bef0
Binary files /dev/null and b/log-sentinel-dashboard.png differ
diff --git a/observability/local/docker-compose.yml b/observability/local/docker-compose.yml
index df097fa..6164380 100644
--- a/observability/local/docker-compose.yml
+++ b/observability/local/docker-compose.yml
@@ -60,3 +60,33 @@ services:
interval: 10s
timeout: 5s
retries: 5
+
+ log-sentinel:
+ build: ./log-sentinel
+ depends_on:
+ loki:
+ condition: service_healthy
+ ports:
+ - "8081:8081"
+ environment:
+ - LOKI_URL=http://loki:3100
+ - GRAFANA_URL=http://grafana:3000
+ - GRAFANA_USER=${GRAFANA_ADMIN_USER:-admin}
+ - GRAFANA_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
+ - OLLAMA_URL=http://host.docker.internal:11434
+ - OLLAMA_MODEL_FAST=qwen3:8b
+ - OLLAMA_MODEL_DEEP=qwen3:32b
+ - SENTINEL_POLL_INTERVAL_SEC=3600
+ - SENTINEL_LOOKBACK_SEC=3600
+ - SENTINEL_T2_ENABLED=true
+ - SENTINEL_T2_PROACTIVE_INTERVAL_SEC=3600
+ - SENTINEL_DEDUP_WINDOW_SEC=300
+ - SENTINEL_SENTRY_DSN=${SENTINEL_SENTRY_DSN:-}
+ - SIMSTEWARD_LOG_ENV=${SIMSTEWARD_LOG_ENV:-local}
+ volumes:
+ - ${GRAFANA_STORAGE_PATH:-S:/sim-steward-grafana-storage}/log-sentinel:/data
+ healthcheck:
+ test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8081/health').read()"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
diff --git a/observability/local/grafana/provisioning/alerting/contact-points.yml b/observability/local/grafana/provisioning/alerting/contact-points.yml
new file mode 100644
index 0000000..4414ef7
--- /dev/null
+++ b/observability/local/grafana/provisioning/alerting/contact-points.yml
@@ -0,0 +1,12 @@
+apiVersion: 1
+
+contactPoints:
+ - orgId: 1
+ name: log-sentinel-webhook
+ receivers:
+ - uid: log-sentinel-webhook-recv
+ type: webhook
+ settings:
+ url: http://log-sentinel:8081/trigger
+ httpMethod: POST
+ disableResolveMessage: true
diff --git a/observability/local/grafana/provisioning/alerting/notification-policies.yml b/observability/local/grafana/provisioning/alerting/notification-policies.yml
new file mode 100644
index 0000000..f1d6e22
--- /dev/null
+++ b/observability/local/grafana/provisioning/alerting/notification-policies.yml
@@ -0,0 +1,10 @@
+apiVersion: 1
+
+policies:
+ - orgId: 1
+ receiver: log-sentinel-webhook
+ group_by: ['alertname']
+ group_wait: 0s
+ group_interval: 1m
+ repeat_interval: 4h
+ routes: []
diff --git a/observability/local/grafana/provisioning/alerting/rules-claude-sessions.yml b/observability/local/grafana/provisioning/alerting/rules-claude-sessions.yml
new file mode 100644
index 0000000..0e03d97
--- /dev/null
+++ b/observability/local/grafana/provisioning/alerting/rules-claude-sessions.yml
@@ -0,0 +1,246 @@
+apiVersion: 1
+
+groups:
+ - orgId: 1
+ name: Claude Code Session Health
+ folder: Log Sentinel
+ interval: 1m
+ rules:
+
+ - uid: session-abandoned
+ title: Session Abandoned
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 1800, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [30m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+        noDataState: Alerting  # NOTE(review): condition A<0 can never be true, so this rule fires only via no-data; the query filters level="ERROR", meaning any quiet (healthy) 30m window also fires — confirm this matches the "abandoned session" intent
+ execErrState: Error
+ for: 30m
+ annotations:
+ summary: Claude session started but no activity for 30 minutes
+ description: Session start detected with no activity, errors, or completion events for 30 minutes
+ labels:
+ alertname: session-abandoned
+ severity: warn
+ trigger_tier: t1
+
+ - uid: claude-error-spike
+ title: Claude Error Spike
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 300, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [5m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [4], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: 5+ ERROR entries in claude-dev-logging in 5 minutes
+ description: Elevated error rate in Claude session logging — possible API or tool failure
+ labels:
+ alertname: claude-error-spike
+ severity: warn
+ trigger_tier: t1
+
+ - uid: permission-flood
+ title: Permission Flood
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 300, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | message=~"(?i).*permission.*" [5m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [9], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: 10+ permission-related log entries in 5 minutes
+ description: Possible permission configuration problem or tool permission loop
+ labels:
+ alertname: permission-flood
+ severity: warn
+ trigger_tier: t1
+
+ - uid: subagent-explosion
+ title: Subagent Explosion
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | message=~"(?i).*subagent.*spawn.*" [60m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [19], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Subagent spawn count exceeded 20 in single session
+ description: Unusually high subagent spawning — possible recursive agent loop or over-parallelization
+ labels:
+ alertname: subagent-explosion
+ severity: warn
+ trigger_tier: t2
+
+ - uid: mcp-service-errors
+ title: MCP Service Errors
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | message=~"(?i).*mcp.*error.*|.*error.*mcp.*" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [4], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: 5+ MCP call failures in 10 minutes
+ description: MCP server appears to be failing — multiple call errors detected
+ labels:
+ alertname: mcp-service-errors
+ severity: warn
+ trigger_tier: t1
+
+ - uid: tool-loop-detected
+ title: Tool Loop Detected
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 1800, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | level="WARN" | message=~"(?i).*tool.*loop.*|.*repeated.*tool.*" [30m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Same tool called 5+ times in session without progress
+ description: Possible stuck agent — repeated tool invocations without forward progress
+ labels:
+ alertname: tool-loop-detected
+ severity: warn
+ trigger_tier: t2
+
+ - uid: session-zero-output
+ title: Session Zero Output
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+                expr: 'count_over_time({app="claude-token-metrics"} | json [60m])' # NOTE(review): counts every token entry, and the ">0" condition fires on any session activity — not "zero output"; needs correlation with assistant-message counts to match the annotation
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Session completed with token entry but zero assistant messages logged
+ description: Session appears to have run but produced no output — possible silent failure
+ labels:
+ alertname: session-zero-output
+ severity: warn
+ trigger_tier: t1
diff --git a/observability/local/grafana/provisioning/alerting/rules-cross-stream.yml b/observability/local/grafana/provisioning/alerting/rules-cross-stream.yml
new file mode 100644
index 0000000..3c63a78
--- /dev/null
+++ b/observability/local/grafana/provisioning/alerting/rules-cross-stream.yml
@@ -0,0 +1,267 @@
+apiVersion: 1
+
+groups:
+ - orgId: 1
+ name: Cross-Stream Correlation
+ folder: Log Sentinel
+ interval: 1m
+ rules:
+
+ # ws-claude-conflict: WebSocket disconnect + Claude ERROR in same 5-min window
+ - uid: ws-claude-conflict
+ title: WebSocket + Claude Error Conflict
+ condition: D
+ data:
+ - refId: A
+ relativeTimeRange: { from: 300, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="websocket_disconnect" [5m])'
+ instant: true
+ refId: A
+ - refId: B
+ relativeTimeRange: { from: 300, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [5m])'
+ instant: true
+ refId: B
+ - refId: C
+ datasourceUid: __expr__
+ model:
+ type: math
+ expression: "$A > 0 && $B > 0"
+ refId: C
+ - refId: D
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: D
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [C] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: WebSocket disconnects co-occurring with Claude errors
+ description: WebSocket disconnect and Claude ERROR events detected in the same 5-minute window
+ labels:
+ alertname: ws-claude-conflict
+ severity: warn
+ trigger_tier: t2
+
+ # session-token-abandon: Claude session ERROR + no token entry for that session
+ - uid: session-token-abandon
+ title: Session Error Without Token Entry
+ condition: D
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [1h])'
+ instant: true
+ refId: A
+ - refId: B
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-token-metrics"} | json [1h])'
+ instant: true
+ refId: B
+ - refId: C
+ datasourceUid: __expr__
+ model:
+ type: math
+ expression: "$A > 0 && $B == 0"
+ refId: C
+ - refId: D
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: D
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [C] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 5m
+ annotations:
+ summary: Session ERROR entries with no corresponding token metrics
+ description: Claude session errors present but no token/cost entry — session may have been abandoned or crashed before completion
+ labels:
+ alertname: session-token-abandon
+ severity: warn
+ trigger_tier: t2
+
+ # action-fail-session-fail: action_result errors + Claude session ERROR within 10 min
+ - uid: action-fail-session-fail
+ title: Action Failure + Session Failure
+ condition: D
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="action_result" | level="ERROR" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [10m])'
+ instant: true
+ refId: B
+ - refId: C
+ datasourceUid: __expr__
+ model:
+ type: math
+ expression: "$A > 0 && $B > 0"
+ refId: C
+ - refId: D
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: D
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [C] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Plugin action failures coinciding with Claude session errors
+ description: action_result errors and Claude session errors detected within the same 10-minute window — possible cascading failure
+ labels:
+ alertname: action-fail-session-fail
+ severity: critical
+ trigger_tier: t2
+
+ # deploy-triple-signal: 2+ streams elevated error rate within 15 min of plugin lifecycle event
+ - uid: deploy-triple-signal
+ title: Deploy Triple Signal
+ condition: E
+ data:
+ - refId: A
+ relativeTimeRange: { from: 900, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | level="ERROR" [15m])'
+ instant: true
+ refId: A
+ - refId: B
+ relativeTimeRange: { from: 900, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-dev-logging"} | json | level="ERROR" [15m])'
+ instant: true
+ refId: B
+ - refId: C
+ relativeTimeRange: { from: 900, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="plugin_lifecycle" [15m])'
+ instant: true
+ refId: C
+ - refId: D
+ datasourceUid: __expr__
+ model:
+ type: math
+ expression: "($A > 5) + ($B > 5) + ($C > 0)"
+ refId: D
+ - refId: E
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: E
+ conditions:
+ - evaluator: { params: [2], type: gt }
+ operator: { type: and }
+ query: { params: [D] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Multiple streams showing elevated errors within 15 min of deploy
+ description: Deploy triple signal — plugin lifecycle event plus 2+ streams with elevated error rates
+ labels:
+ alertname: deploy-triple-signal
+ severity: critical
+ trigger_tier: t2
+
+ # cost-spike-tool-flood: Tool call count spike + session cost spike in same cycle
+ - uid: cost-spike-tool-flood
+ title: Cost Spike + Tool Flood
+ condition: D
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [1h])'
+ instant: true
+ refId: A
+ - refId: B
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap tool_calls [1h])'
+ instant: true
+ refId: B
+ - refId: C
+ datasourceUid: __expr__
+ model:
+ type: math
+ expression: "$A > 0.5 && $B > 50"
+ refId: C
+ - refId: D
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: D
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [C] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: High tool call count coinciding with elevated session cost
+ description: Tool use flood and cost spike occurring together — likely agentic loop with real cost impact
+ labels:
+ alertname: cost-spike-tool-flood
+ severity: warn
+ trigger_tier: t1
diff --git a/observability/local/grafana/provisioning/alerting/rules-infrastructure.yml b/observability/local/grafana/provisioning/alerting/rules-infrastructure.yml
new file mode 100644
index 0000000..d5e471a
--- /dev/null
+++ b/observability/local/grafana/provisioning/alerting/rules-infrastructure.yml
@@ -0,0 +1,348 @@
+apiVersion: 1
+
+groups:
+ - orgId: 1
+ name: Infrastructure & Deploy Quality
+ folder: Log Sentinel
+ interval: 1m
+ rules:
+
+ - uid: bridge-start-failed
+ title: Bridge Start Failed
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 300, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="plugin_lifecycle" | level="ERROR" [5m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Plugin lifecycle ERROR on bridge start
+ description: A plugin_lifecycle ERROR event was detected in the last 5 minutes
+ labels:
+ alertname: bridge-start-failed
+ severity: critical
+ trigger_tier: t1
+
+ - uid: plugin-never-ready
+ title: Plugin Never Ready
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="plugin_lifecycle" | message=~".*ready.*" [60m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 1m
+ annotations:
+ summary: Plugin started but never reached ready state
+ description: Plugin lifecycle start event exists but no ready event within 60 minutes
+ labels:
+ alertname: plugin-never-ready
+ severity: warn
+ trigger_tier: t1
+
+ - uid: post-deploy-warn-rate
+ title: High WARN Rate After Deploy
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | level="WARN" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [50], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 2m
+ annotations:
+ summary: Elevated WARN rate after deploy
+ description: More than 50 WARN entries in 10 minutes following a lifecycle event
+ labels:
+ alertname: post-deploy-warn-rate
+ severity: warn
+ trigger_tier: t1
+
+ - uid: bridge-failure-post-deploy
+ title: Bridge ERROR After Deploy
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 900, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | level="ERROR" [15m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: ERROR in sim-steward within 15 min of plugin start
+ description: "NOTE(review): query alerts on any ERROR in the last 15 minutes and does not correlate with a deploy/lifecycle event — add a lifecycle query or rename the rule"
+ labels:
+ alertname: bridge-failure-post-deploy
+ severity: critical
+ trigger_tier: t1
+
+ - uid: plugin-slow-start
+ title: Plugin Slow Start
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 300, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="plugin_lifecycle" | message=~".*start_duration.*" | __error__="" [5m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Plugin startup exceeded 30s threshold
+ description: "NOTE(review): query matches any start_duration message with no duration threshold — confirm it should filter on duration > 30s before trusting the 30s claim"
+ labels:
+ alertname: plugin-slow-start
+ severity: warn
+ trigger_tier: t1
+
+ - uid: error-spike-post-deploy
+ title: Error Spike After Deploy
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | level="ERROR" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [5], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 2m
+ annotations:
+ summary: More than 5 ERROR logs in 10 minutes after deploy
+ description: Error spike detected in 10-minute window following deploy event
+ labels:
+ alertname: error-spike-post-deploy
+ severity: warn
+ trigger_tier: t1
+
+ - uid: error-spike-general
+ title: General Error Spike
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | level="ERROR" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [10], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 2m
+ annotations:
+ summary: More than 10 ERROR logs in 10-minute window
+ description: General error spike detected — not necessarily deploy-related
+ labels:
+ alertname: error-spike-general
+ severity: warn
+ trigger_tier: t1
+
+ - uid: ollama-unreachable
+ title: Ollama Unreachable
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_health" | ollama_reachable="false" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Ollama is not reachable from log-sentinel
+ description: sentinel_health event recorded ollama_reachable=false
+ labels:
+ alertname: ollama-unreachable
+ severity: critical
+ trigger_tier: t1
+
+ - uid: loki-circuit-open
+ title: Loki Circuit Breaker Open
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_health" | loki_circuit_open="true" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Loki circuit breaker is open in log-sentinel
+ description: sentinel_health event recorded loki_circuit_open=true — Loki queries are failing
+ labels:
+ alertname: loki-circuit-open
+ severity: critical
+ trigger_tier: t1
+
+ - uid: sentinel-cycle-stalled
+ title: Sentinel Cycle Stalled
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 5400, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_cycle" [90m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 5m
+ annotations:
+ summary: No sentinel_cycle event in 90 minutes
+ description: Log sentinel appears to be stalled — no analysis cycles have completed
+ labels:
+ alertname: sentinel-cycle-stalled
+ severity: critical
+ trigger_tier: t1
diff --git a/observability/local/grafana/provisioning/alerting/rules-iracing.yml b/observability/local/grafana/provisioning/alerting/rules-iracing.yml
new file mode 100644
index 0000000..39b3ab4
--- /dev/null
+++ b/observability/local/grafana/provisioning/alerting/rules-iracing.yml
@@ -0,0 +1,354 @@
+apiVersion: 1
+
+groups:
+ - orgId: 1
+ name: iRacing Session Behavior
+ folder: Log Sentinel
+ interval: 1m
+ rules:
+
+ - uid: session-no-actions
+ title: Session No Actions
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 900, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="action_dispatched" [15m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 15m
+ annotations:
+ summary: iRacing session active with zero action_dispatched events
+ description: Session has been active 15+ minutes with no user actions dispatched
+ labels:
+ alertname: session-no-actions
+ severity: warn
+ trigger_tier: t1
+
+ - uid: action-failure-streak
+ title: Action Failure Streak
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="action_result" | level="ERROR" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [2], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: 3 or more action_result errors in 10 minutes
+ description: Multiple action failures in a short window — possible stuck state or feature regression
+ labels:
+ alertname: action-failure-streak
+ severity: critical
+ trigger_tier: t1
+
+ - uid: websocket-disconnect-spike
+ title: WebSocket Disconnect Spike
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 300, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="websocket_disconnect" [5m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [2], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: 3+ WebSocket disconnects in 5 minutes
+ description: Elevated WebSocket disconnect rate detected — dashboard connectivity unstable
+ labels:
+ alertname: websocket-disconnect-spike
+ severity: warn
+ trigger_tier: t1
+
+ - uid: incident-detection-zero
+ title: Incident Detection Zero
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 1800, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="iracing_incident" [30m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 30m
+ annotations:
+ summary: iRacing session active 30+ min with zero incident events
+ description: Incident detection may be broken — no iracing_incident events despite active session
+ labels:
+ alertname: incident-detection-zero
+ severity: warn
+ trigger_tier: t1
+
+ - uid: session-no-end
+ title: Session No End Event
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 14400, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="iracing_session_end" [4h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 4h
+ annotations:
+ summary: iracing_session_start with no iracing_session_end within 4 hours
+ description: Session end event not received — possible session lifecycle tracking failure
+ labels:
+ alertname: session-no-end
+ severity: warn
+ trigger_tier: t1
+
+ - orgId: 1
+ name: Replay & Incident Investigation
+ folder: Log Sentinel
+ interval: 1m
+ rules:
+
+ - uid: replay-no-seeks
+ title: Replay No Seeks
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 300, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="iracing_replay_seek" [5m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 5m
+ annotations:
+ summary: Replay started but no seek events in 5 minutes
+ description: Replay mode active with zero iracing_replay_seek events — may indicate broken replay controls
+ labels:
+ alertname: replay-no-seeks
+ severity: warn
+ trigger_tier: t1
+
+ - uid: incident-detection-stall
+ title: Incident Detection Stall in Replay
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 1800, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="iracing_incident" [30m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 30m
+ annotations:
+ summary: Replay session active 30+ min with zero incident events
+ description: No incidents detected during replay — detector may be broken in replay mode
+ labels:
+ alertname: incident-detection-stall
+ severity: warn
+ trigger_tier: t1
+
+ - uid: incident-camera-stuck
+ title: Incident Camera Stuck
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="iracing_incident" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [2], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Same camera_view on 3+ consecutive incidents
+ description: "NOTE(review): query only counts incident events and never compares camera_view values — add a camera_view filter/aggregation to match the summary"
+ labels:
+ alertname: incident-camera-stuck
+ severity: info
+ trigger_tier: t1
+
+ - uid: replay-session-no-close
+ title: Replay Session No Close
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 7200, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="iracing_session_end" | mode="replay" [2h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 2h
+ annotations:
+ summary: Replay session start with no session_end within 2 hours
+ description: Replay session lifecycle may be broken — no session end event received
+ labels:
+ alertname: replay-session-no-close
+ severity: warn
+ trigger_tier: t1
+
+ - uid: action-incident-gap
+ title: Action-Incident Gap
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward"} | json | event="iracing_incident" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 10m
+ annotations:
+ summary: Incident detected, no action_dispatched within 10 minutes
+ description: "NOTE(review): condition only checks that incidents occurred; add an action_dispatched query to detect the actual incident-to-action gap"
+ labels:
+ alertname: action-incident-gap
+ severity: info
+ trigger_tier: t1
diff --git a/observability/local/grafana/provisioning/alerting/rules-sentinel-health.yml b/observability/local/grafana/provisioning/alerting/rules-sentinel-health.yml
new file mode 100644
index 0000000..6488a25
--- /dev/null
+++ b/observability/local/grafana/provisioning/alerting/rules-sentinel-health.yml
@@ -0,0 +1,246 @@
+apiVersion: 1
+
+groups:
+ - orgId: 1
+ name: Sentinel Self-Health
+ folder: Log Sentinel
+ interval: 1m
+ rules:
+
+ - uid: sentinel-stalled
+ title: Sentinel Cycle Stalled (Health)
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 5400, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_cycle" [90m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 5m
+ annotations:
+ summary: No sentinel_cycle event in 90 minutes
+ description: Log sentinel appears stalled — no completed analysis cycles
+ labels:
+ alertname: sentinel-cycle-stalled-health
+ severity: critical
+ trigger_tier: t1
+
+ - uid: detector-error-rate
+ title: Detector Error Rate
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_analyst_run" | level="ERROR" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [2], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: 3 or more analyst run errors in 10 minutes
+ description: Multiple analysis errors detected — Ollama or Loki connectivity may be failing
+ labels:
+ alertname: detector-error-rate
+ severity: warn
+ trigger_tier: t1
+
+ - uid: t1-slow
+ title: T1 Inference Slow
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'max_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_analyst_run" | tier="t1" | unwrap duration_ms [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [120000], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: T1 inference duration exceeded 120 seconds
+ description: T1 triage is running slower than expected — GPU may be under load or model is too large
+ labels:
+ alertname: t1-slow
+ severity: warn
+ trigger_tier: t1
+
+ - uid: t2-slow
+ title: T2 Inference Slow
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 1800, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'max_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_t2_investigation" | unwrap inference_duration_ms [30m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [300000], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: T2 inference duration exceeded 300 seconds
+ description: T2 investigation is taking too long — deep model may be under heavy load
+ labels:
+ alertname: t2-slow
+ severity: warn
+ trigger_tier: t1
+
+ - uid: sentry-flood
+ title: Sentry Flood
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_sentry_issue" [1h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [4], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: 5 or more Sentry-worthy findings in 1 hour
+ description: Sentinel is creating too many Sentry issues — possible false positive storm or real incident
+ labels:
+ alertname: sentry-flood
+ severity: warn
+ trigger_tier: t1
+
+ - uid: findings-flood
+ title: Findings Flood
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_finding" [10m])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [19], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: 20 or more findings in 10 minutes
+ description: Finding flood detected — sentinel may be over-sensitive or a real incident is occurring
+ labels:
+ alertname: findings-flood
+ severity: warn
+ trigger_tier: t1
+
+ - uid: zero-findings-48h
+ title: Zero Findings 48h
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 172800, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="sim-steward", component="log-sentinel"} | json | event="sentinel_finding" [48h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: Alerting
+ execErrState: Error
+ for: 1h
+ annotations:
+ summary: No sentinel findings at all in 48 hours
+ description: System may be suppressing findings or the sentinel is not running correctly
+ labels:
+ alertname: zero-findings-48h
+ severity: info
+ trigger_tier: t1
diff --git a/observability/local/grafana/provisioning/alerting/rules-token-cost.yml b/observability/local/grafana/provisioning/alerting/rules-token-cost.yml
new file mode 100644
index 0000000..b2509ba
--- /dev/null
+++ b/observability/local/grafana/provisioning/alerting/rules-token-cost.yml
@@ -0,0 +1,246 @@
+apiVersion: 1
+
+groups:
+ - orgId: 1
+ name: Token & Cost Budget
+ folder: Log Sentinel
+ interval: 1m
+ rules:
+
+ - uid: session-cost-spike
+ title: Session Cost Spike
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [1h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [1.0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Single session cost exceeded $1.00
+ description: A Claude session cost more than $1.00 — review for efficiency
+ labels:
+ alertname: session-cost-spike
+ severity: warn
+ trigger_tier: t1
+
+ - uid: session-cost-critical
+ title: Session Cost Critical
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [1h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [3.0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Single session cost exceeded $3.00
+ description: Critical cost threshold exceeded — session may be in a runaway loop
+ labels:
+ alertname: session-cost-critical
+ severity: critical
+ trigger_tier: t2
+
+ - uid: daily-spend-warning
+ title: Daily Spend Warning
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 86400, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'sum_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [24h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [10.0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Rolling 24h spend exceeded $10.00
+ description: Daily spend warning threshold hit — review recent session costs
+ labels:
+ alertname: daily-spend-warning
+ severity: warn
+ trigger_tier: t1
+
+ - uid: daily-spend-critical
+ title: Daily Spend Critical
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 86400, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'sum_over_time({app="claude-token-metrics"} | json | unwrap cost_usd [24h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [25.0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Rolling 24h spend exceeded $25.00
+ description: Critical daily spend threshold hit — immediate review required
+ labels:
+ alertname: daily-spend-critical
+ severity: critical
+ trigger_tier: t2
+
+ - uid: tool-use-flood
+ title: Tool Use Flood
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'max_over_time({app="claude-token-metrics"} | json | unwrap tool_calls [1h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [100], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Tool calls per session exceeded 100
+ description: Unusually high tool call count — possible agentic loop or over-tooling
+ labels:
+ alertname: tool-use-flood
+ severity: warn
+ trigger_tier: t1
+
+ - uid: unexpected-model
+ title: Unexpected Model Used
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'count_over_time({app="claude-token-metrics"} | json | model!~"claude-opus-4.*|claude-sonnet-4.*|claude-haiku-4.*" [1h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0], type: gt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 0s
+ annotations:
+ summary: Unexpected model name in token metrics
+ description: A model field value outside the approved set was detected in claude-token-metrics
+ labels:
+ alertname: unexpected-model
+ severity: warn
+ trigger_tier: t1
+
+ - uid: cache-hit-rate-low
+ title: Cache Hit Rate Low
+ condition: B
+ data:
+ - refId: A
+ relativeTimeRange: { from: 3600, to: 0 }
+ datasourceUid: loki_local
+ model:
+ datasource: { type: loki, uid: loki_local }
+ editorMode: code
+ expr: 'avg_over_time({app="claude-token-metrics"} | json | unwrap cache_read_ratio [1h])'
+ instant: true
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ model:
+ type: classic_conditions
+ refId: B
+ conditions:
+ - evaluator: { params: [0.2], type: lt }
+ operator: { type: and }
+ query: { params: [A] }
+ reducer: { type: last }
+ noDataState: OK
+ execErrState: Error
+ for: 15m
+ annotations:
+ summary: Cache hit rate below 20% over 1 hour
+ description: Low cache read ratio — context caching may be misconfigured or inactive
+ labels:
+ alertname: cache-hit-rate-low
+ severity: info
+ trigger_tier: t1
diff --git a/observability/local/grafana/provisioning/dashboards/claude-cache-context.json b/observability/local/grafana/provisioning/dashboards/claude-cache-context.json
new file mode 100644
index 0000000..4355693
--- /dev/null
+++ b/observability/local/grafana/provisioning/dashboards/claude-cache-context.json
@@ -0,0 +1,882 @@
+{
+ "id": null,
+ "uid": "claude-cache-context",
+ "title": "Claude Code — Cache & Context Health",
+ "description": "Cache hit rates, context pressure signals, per-turn token burn, and token budget analysis for Claude Code sessions.",
+ "tags": [
+ "claude-code",
+ "cache",
+ "context",
+ "observability"
+ ],
+ "timezone": "browser",
+ "editable": true,
+ "graphTooltip": 1,
+ "time": {
+ "from": "now-7d",
+ "to": "now"
+ },
+ "refresh": "30s",
+ "schemaVersion": 39,
+ "fiscalYearStartMonth": 0,
+ "liveNow": false,
+ "style": "dark",
+ "templating": {
+ "list": [
+ {
+ "name": "session_id",
+ "label": "Session",
+ "type": "query",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "query": "{app=\"claude-token-metrics\"} | json session_id",
+ "regex": "session_id\":\"([^\"]+)",
+ "refresh": 2,
+ "includeAll": true,
+ "multi": false,
+ "allValue": ".*",
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "sort": 2
+ },
+ {
+ "name": "model",
+ "label": "Model",
+ "type": "query",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "query": "{app=\"claude-token-metrics\"} | json model",
+ "regex": "model\":\"([^\"]+)",
+ "refresh": 2,
+ "includeAll": true,
+ "multi": true,
+ "allValue": ".*",
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "sort": 1
+ },
+ {
+ "name": "project",
+ "label": "Project",
+ "type": "query",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "query": "{app=\"claude-token-metrics\"} | json project",
+ "regex": "project\":\"([^\"]+)",
+ "refresh": 2,
+ "includeAll": true,
+ "multi": true,
+ "allValue": ".*",
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "sort": 1
+ },
+ {
+ "name": "effort",
+ "label": "Effort",
+ "type": "query",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "query": "{app=\"claude-token-metrics\"} | json effort",
+ "regex": "effort\":\"([^\"]+)",
+ "refresh": 2,
+ "includeAll": true,
+ "multi": true,
+ "allValue": ".*",
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "sort": 1
+ }
+ ]
+ },
+ "panels": [
+ {
+ "type": "row",
+ "title": "Cache Health Summary",
+ "collapsed": false,
+ "gridPos": { "x": 0, "y": 0, "w": 24, "h": 1 }
+ },
+ {
+ "id": 1,
+ "title": "Context from Cache",
+ "description": "% of all context tokens served from cache. High values (>90%) are expected — Claude Code reuses a large context window across turns. This is NOT a per-request hit/miss rate; every turn hits the cache.",
+ "type": "gauge",
+ "gridPos": { "x": 0, "y": 1, "w": 6, "h": 5 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval]))",
+ "legendFormat": "Cache Read",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "B",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__interval]))",
+ "legendFormat": "Cache Creation",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "D",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_input_tokens [$__interval]))",
+ "legendFormat": "Input",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "C",
+ "datasource": { "type": "__expr__", "uid": "__expr__" },
+ "type": "math",
+ "expression": "$A / ($A + $B + $D) * 100",
+ "hide": false
+ }
+ ],
+ "options": {
+ "orientation": "auto",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true,
+ "reduceOptions": {
+ "values": false,
+ "calcs": ["lastNotNull"],
+ "fields": ""
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percent",
+ "min": 0,
+ "max": 100,
+ "noValue": "0",
+ "color": { "mode": "thresholds" },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#F2495C" },
+ { "value": 30, "color": "#FF9830" },
+ { "value": 50, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 2,
+ "title": "Cache Read Tokens",
+ "type": "stat",
+ "gridPos": { "x": 6, "y": 1, "w": 6, "h": 5 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))",
+ "legendFormat": "Cache Read",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "textMode": "value",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "values": false,
+ "calcs": ["sum"],
+ "fields": ""
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "unit": "short",
+ "decimals": 1,
+ "color": { "mode": "fixed", "fixedColor": "#B877D9" },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "value": null, "color": "#B877D9" }]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 3,
+ "title": "Cache Creation Tokens",
+ "type": "stat",
+ "gridPos": { "x": 12, "y": 1, "w": 6, "h": 5 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))",
+ "legendFormat": "Cache Creation",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "textMode": "value",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "values": false,
+ "calcs": ["sum"],
+ "fields": ""
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "unit": "short",
+ "decimals": 1,
+ "color": { "mode": "fixed", "fixedColor": "#FF9830" },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "value": null, "color": "#FF9830" }]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 4,
+ "title": "Cache Reuse Ratio",
+ "type": "stat",
+ "gridPos": { "x": 18, "y": 1, "w": 6, "h": 5 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__interval]))",
+ "legendFormat": "Creation",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "B",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval]))",
+ "legendFormat": "Read",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "C",
+ "datasource": { "type": "__expr__", "uid": "__expr__" },
+ "type": "math",
+ "expression": "$B / $A",
+ "hide": false
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "textMode": "value",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "values": false,
+ "calcs": ["lastNotNull"],
+ "fields": ""
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "decimals": 1,
+ "color": { "mode": "thresholds" },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#F2495C" },
+ { "value": 2, "color": "#FF9830" },
+ { "value": 5, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Cache Efficiency Over Time",
+ "collapsed": false,
+ "gridPos": { "x": 0, "y": 6, "w": 24, "h": 1 }
+ },
+ {
+ "id": 5,
+ "title": "Context from Cache Trend",
+ "type": "timeseries",
+ "gridPos": { "x": 0, "y": 7, "w": 24, "h": 8 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval])) / (sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval])) + sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__interval])) + sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, session_id, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | session_id=~\"$session_id\" | unwrap total_input_tokens [$__interval]))) * 100",
+ "legendFormat": "Context from Cache %",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "legend": { "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percent",
+ "min": 0,
+ "max": 100,
+ "noValue": "0",
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "gradientMode": "opacity",
+ "lineWidth": 2,
+ "fillOpacity": 18,
+ "pointSize": 5,
+ "showPoints": "auto",
+ "spanNulls": 3600000,
+ "axisLabel": "Hit Rate %",
+ "thresholdsStyle": {
+ "mode": "line"
+ }
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#F2495C" },
+ { "value": 50, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Cache by Model & Effort",
+ "collapsed": false,
+ "gridPos": { "x": 0, "y": 15, "w": 24, "h": 1 }
+ },
+ {
+ "id": 6,
+ "title": "Context from Cache by Model",
+ "type": "barchart",
+ "gridPos": { "x": 0, "y": 16, "w": 12, "h": 8 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range])) / (sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range])) + sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__range])) + sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__range]))) * 100",
+ "legendFormat": "{{model}}",
+ "queryType": "instant"
+ }
+ ],
+ "options": {
+ "orientation": "horizontal",
+ "showValue": "always",
+ "barWidth": 0.8,
+ "groupWidth": 0.7,
+ "legend": { "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "single" }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percent",
+ "decimals": 1,
+ "min": 0,
+ "max": 100,
+ "noValue": "0",
+ "color": { "mode": "palette-classic" },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#F2495C" },
+ { "value": 30, "color": "#FF9830" },
+ { "value": 50, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 7,
+ "title": "Context from Cache by Effort",
+ "type": "barchart",
+ "gridPos": { "x": 12, "y": 16, "w": 12, "h": 8 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum by (effort) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range])) / (sum by (effort) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range])) + sum by (effort) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__range])) + sum by (effort) (sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__range]))) * 100",
+ "legendFormat": "{{effort}}",
+ "queryType": "instant"
+ }
+ ],
+ "options": {
+ "orientation": "horizontal",
+ "showValue": "always",
+ "barWidth": 0.8,
+ "groupWidth": 0.7,
+ "legend": { "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "single" }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percent",
+ "decimals": 1,
+ "min": 0,
+ "max": 100,
+ "noValue": "0",
+ "color": { "mode": "palette-classic" },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#F2495C" },
+ { "value": 30, "color": "#FF9830" },
+ { "value": 50, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Context Pressure",
+ "collapsed": false,
+ "gridPos": { "x": 0, "y": 24, "w": 24, "h": 1 }
+ },
+ {
+ "id": 8,
+ "title": "Compactions",
+ "type": "stat",
+ "gridPos": { "x": 0, "y": 25, "w": 8, "h": 7 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json hook_type, model, project, effort | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | hook_type=\"pre-compact\" [$__interval]))",
+ "legendFormat": "Compactions",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "textMode": "value",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "values": false,
+ "calcs": ["sum"],
+ "fields": ""
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" },
+ { "value": 5, "color": "#F2495C" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 9,
+ "title": "Compaction Rate",
+ "type": "timeseries",
+ "gridPos": { "x": 8, "y": 25, "w": 8, "h": 7 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json hook_type, model, project, effort | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | hook_type=\"pre-compact\" [$__interval]))",
+ "legendFormat": "Compactions",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "legend": { "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "single" }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "decimals": 0,
+ "color": { "mode": "fixed", "fixedColor": "#FF9830" },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "gradientMode": "opacity",
+ "fillOpacity": 25,
+ "lineWidth": 2,
+ "pointSize": 5,
+ "showPoints": "auto",
+ "spanNulls": 3600000,
+ "axisLabel": "Compactions"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" },
+ { "value": 5, "color": "#F2495C" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 10,
+ "title": "Avg Turns Before Compaction",
+ "type": "stat",
+ "gridPos": { "x": 16, "y": 25, "w": 8, "h": 7 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json project | project=~\"$project\" [$__range]))",
+ "legendFormat": "Total Turns",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "B",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json hook_type, project | project=~\"$project\" | hook_type=\"pre-compact\" [$__range]))",
+ "legendFormat": "Total Compactions",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "C",
+ "datasource": { "type": "__expr__", "uid": "__expr__" },
+ "type": "math",
+ "expression": "$A / $B",
+ "hide": false
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "textMode": "value",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "values": false,
+ "calcs": ["lastNotNull"],
+ "fields": ""
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "decimals": 1,
+ "color": { "mode": "fixed", "fixedColor": "#8AB8FF" },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "value": null, "color": "#8AB8FF" }]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Per-Turn Token Burn",
+ "collapsed": false,
+ "gridPos": { "x": 0, "y": 32, "w": 24, "h": 1 }
+ },
+ {
+ "id": 11,
+ "title": "Per-Turn Token Flow (excl. Cache Read)",
+ "description": "Tokens spent per turn: input, output, and cache creation. Cache Read is excluded — it dominates the scale and is shown separately in Cache Trend above.",
+ "type": "timeseries",
+ "gridPos": { "x": 0, "y": 33, "w": 12, "h": 8 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json model, project, effort, turn_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap turn_input_tokens [$__interval]))",
+ "legendFormat": "Input Tokens",
+ "queryType": "range"
+ },
+ {
+ "refId": "B",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json model, project, effort, turn_output_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap turn_output_tokens [$__interval]))",
+ "legendFormat": "Output Tokens",
+ "queryType": "range"
+ },
+ {
+ "refId": "D",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json model, project, effort, turn_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap turn_cache_creation_tokens [$__interval]))",
+ "legendFormat": "Cache Creation Tokens",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "legend": { "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "unit": "short",
+ "decimals": 1,
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "gradientMode": "opacity",
+ "lineWidth": 2,
+ "fillOpacity": 18,
+ "pointSize": 5,
+ "showPoints": "auto",
+ "spanNulls": 3600000,
+ "stacking": { "mode": "none" },
+ "axisLabel": "Tokens",
+ "scaleDistribution": { "type": "log", "log": 2 }
+ }
+ },
+ "overrides": [
+ {
+ "matcher": { "id": "byName", "options": "Cache Creation Tokens" },
+ "properties": [
+ { "id": "color", "value": { "mode": "fixed", "fixedColor": "#FF9830" } }
+ ]
+ }
+ ]
+ }
+ },
+ {
+ "id": 12,
+ "title": "Turn-by-Turn Output Burst",
+ "type": "timeseries",
+ "gridPos": { "x": 12, "y": 33, "w": 12, "h": 8 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-dev-logging\", component=\"tokens\"} | json model, project, effort, turn_output_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap turn_output_tokens [$__interval]))",
+ "legendFormat": "Output per Turn",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "legend": { "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "single" }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "decimals": 0,
+ "color": { "mode": "fixed", "fixedColor": "#FF6D00" },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "gradientMode": "opacity",
+ "lineWidth": 2,
+ "fillOpacity": 20,
+ "pointSize": 5,
+ "showPoints": "auto",
+ "spanNulls": 3600000,
+ "axisLabel": "Output Tokens"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Token Budget",
+ "collapsed": false,
+ "gridPos": { "x": 0, "y": 41, "w": 24, "h": 1 }
+ },
+ {
+ "id": 13,
+ "title": "Token Type Distribution",
+ "type": "piechart",
+ "gridPos": { "x": 0, "y": 42, "w": 12, "h": 8 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__interval]))",
+ "legendFormat": "Input",
+ "queryType": "range"
+ },
+ {
+ "refId": "B",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_output_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__interval]))",
+ "legendFormat": "Output",
+ "queryType": "range"
+ },
+ {
+ "refId": "C",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))",
+ "legendFormat": "Cache Read",
+ "queryType": "range"
+ },
+ {
+ "refId": "D",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))",
+ "legendFormat": "Cache Creation",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "pieType": "donut",
+ "displayLabels": ["name", "percent"],
+ "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
+ "tooltip": { "mode": "single" },
+ "reduceOptions": {
+ "values": false,
+ "calcs": ["sum"],
+ "fields": ""
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "unit": "short",
+ "decimals": 1,
+ "color": { "mode": "palette-classic" }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 14,
+ "title": "Output Efficiency",
+ "type": "gauge",
+ "description": "Output tokens per total context token consumed. Higher = more output generated per unit of input.",
+ "gridPos": { "x": 12, "y": 42, "w": 12, "h": 7 },
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_output_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__interval]))",
+ "legendFormat": "Output",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "B",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_input_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__interval]))",
+ "legendFormat": "Input",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "D",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_creation_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))",
+ "legendFormat": "Cache Creation",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "E",
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json model, project, effort, total_cache_read_tokens | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))",
+ "legendFormat": "Cache Read",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "C",
+ "datasource": { "type": "__expr__", "uid": "__expr__" },
+ "type": "math",
+ "expression": "$A / ($B + $D + $E)",
+ "hide": false
+ }
+ ],
+ "options": {
+ "orientation": "auto",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true,
+ "reduceOptions": {
+ "values": false,
+ "calcs": ["lastNotNull"],
+ "fields": ""
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "unit": "percentunit",
+ "min": 0,
+ "max": 0.05,
+ "decimals": 2,
+ "color": { "mode": "thresholds" },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#F2495C" },
+ { "value": 0.002, "color": "#FF9830" },
+ { "value": 0.01, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ }
+ ]
+}
diff --git a/observability/local/grafana/provisioning/dashboards/claude-code-overview.json b/observability/local/grafana/provisioning/dashboards/claude-code-overview.json
index de6c222..a75af81 100644
--- a/observability/local/grafana/provisioning/dashboards/claude-code-overview.json
+++ b/observability/local/grafana/provisioning/dashboards/claude-code-overview.json
@@ -73,6 +73,69 @@
"h": 1
}
},
+ {
+ "id": 20,
+ "title": "Session Cost",
+ "type": "stat",
+ "transparent": true,
+ "gridPos": {
+ "x": 20,
+ "y": 1,
+ "w": 4,
+ "h": 4
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json session_id | session_id=~\"$session_id\" | unwrap cost_usd [$__interval]))",
+ "legendFormat": "Cost",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "textMode": "value",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "text": {
+ "titleSize": 12,
+ "valueSize": 36
+ },
+ "reduceOptions": {
+ "values": false,
+ "calcs": ["sum"],
+ "fields": ""
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "$0.00",
+ "unit": "currencyUSD",
+ "decimals": 4,
+ "color": {
+ "mode": "thresholds"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" },
+ { "value": 5, "color": "#FF9830" },
+ { "value": 20, "color": "#F2495C" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
{
"id": 1,
"title": "Tool Calls",
@@ -81,7 +144,7 @@
"gridPos": {
"x": 0,
"y": 1,
- "w": 5,
+ "w": 4,
"h": 4
},
"datasource": {
@@ -95,14 +158,14 @@
"type": "loki",
"uid": "loki_local"
},
- "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range]))",
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__interval]))",
"legendFormat": "Tool Calls",
"queryType": "range"
}
],
"options": {
"colorMode": "background-gradient",
- "graphMode": "none",
+ "graphMode": "area",
"textMode": "value",
"justifyMode": "center",
"orientation": "auto",
@@ -144,9 +207,9 @@
"type": "stat",
"transparent": true,
"gridPos": {
- "x": 5,
+ "x": 4,
"y": 1,
- "w": 5,
+ "w": 4,
"h": 4
},
"datasource": {
@@ -160,14 +223,14 @@
"type": "loki",
"uid": "loki_local"
},
- "expr": "sum(count_over_time({app=\"claude-dev-logging\", level=\"ERROR\"} | json session_id | session_id=~\"$session_id\" [$__range]))",
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", level=\"ERROR\"} | json session_id | session_id=~\"$session_id\" [$__interval]))",
"legendFormat": "Errors",
"queryType": "range"
}
],
"options": {
"colorMode": "background-gradient",
- "graphMode": "none",
+ "graphMode": "area",
"textMode": "value",
"justifyMode": "center",
"orientation": "auto",
@@ -216,9 +279,9 @@
"type": "stat",
"transparent": true,
"gridPos": {
- "x": 10,
+ "x": 8,
"y": 1,
- "w": 5,
+ "w": 4,
"h": 4
},
"datasource": {
@@ -232,14 +295,14 @@
"type": "loki",
"uid": "loki_local"
},
- "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"agent\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"subagent-start\" [$__range]))",
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"agent\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"subagent-start\" [$__interval]))",
"legendFormat": "Agents",
"queryType": "range"
}
],
"options": {
"colorMode": "background-gradient",
- "graphMode": "none",
+ "graphMode": "area",
"textMode": "value",
"justifyMode": "center",
"orientation": "auto",
@@ -281,9 +344,9 @@
"type": "stat",
"transparent": true,
"gridPos": {
- "x": 15,
+ "x": 12,
"y": 1,
- "w": 5,
+ "w": 4,
"h": 4
},
"datasource": {
@@ -297,14 +360,14 @@
"type": "loki",
"uid": "loki_local"
},
- "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"user\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"user-prompt-submit\" [$__range]))",
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"user\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"user-prompt-submit\" [$__interval]))",
"legendFormat": "Prompts",
"queryType": "range"
}
],
"options": {
"colorMode": "background-gradient",
- "graphMode": "none",
+ "graphMode": "area",
"textMode": "value",
"justifyMode": "center",
"orientation": "auto",
@@ -346,7 +409,7 @@
"type": "stat",
"transparent": true,
"gridPos": {
- "x": 20,
+ "x": 16,
"y": 1,
"w": 4,
"h": 4
@@ -362,14 +425,14 @@
"type": "loki",
"uid": "loki_local"
},
- "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"user\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"permission-request\" [$__range]))",
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"user\"} | json hook_type, session_id | session_id=~\"$session_id\" | hook_type=\"permission-request\" [$__interval]))",
"legendFormat": "Permissions",
"queryType": "range"
}
],
"options": {
"colorMode": "background-gradient",
- "graphMode": "none",
+ "graphMode": "area",
"textMode": "value",
"justifyMode": "center",
"orientation": "auto",
@@ -515,7 +578,7 @@
"type": "loki",
"uid": "loki_local"
},
- "expr": "sum by (component) (count_over_time({app=\"claude-dev-logging\"} | json session_id | session_id=~\"$session_id\" [$__range]))",
+ "expr": "sum by (component) (count_over_time({app=\"claude-dev-logging\"} | json session_id | session_id=~\"$session_id\" [$__interval]))",
"legendFormat": "{{component}}",
"queryType": "range"
}
@@ -597,6 +660,21 @@
}
]
},
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "tokens"
+ },
+ "properties": [
+ {
+ "id": "color",
+ "value": {
+ "fixedColor": "#FF6D00",
+ "mode": "fixed"
+ }
+ }
+ ]
+ },
{
"matcher": {
"id": "byName",
@@ -757,7 +835,7 @@
{
"id": 9,
"title": "Top Tools Used",
- "description": "% share of each tool across all post-tool-use events.",
+ "description": "Tool call counts across all post-tool-use events. mcp__ prefix stripped for readability.",
"type": "table",
"transparent": true,
"gridPos": {
@@ -777,8 +855,8 @@
"type": "loki",
"uid": "loki_local"
},
- "expr": "sum by (tool_name) (count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, tool_name, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name != \"\" [$__range])) / ignoring(tool_name) group_left() sum(count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, tool_name, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name != \"\" [$__range])) * 100",
- "legendFormat": "{{tool_name}}",
+            "expr": "sum by (short_name) (count_over_time({app=\"claude-dev-logging\", component=~\"tool|mcp-.*\"} | json hook_type, tool_name, session_id | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name != \"\" | line_format \"{{.tool_name}}\" | regexp \"(?:mcp__)?(?P<short_name>.*)\" [$__range]))",
+ "legendFormat": "{{short_name}}",
"queryType": "range"
}
],
@@ -824,8 +902,8 @@
},
"fieldConfig": {
"defaults": {
- "unit": "percent",
- "decimals": 1,
+ "unit": "short",
+ "decimals": 0,
"custom": {
"inspect": false,
"width": 0
@@ -843,22 +921,12 @@
"properties": [
{
"id": "custom.width",
- "value": 200
- },
- {
- "id": "max",
- "value": 100
- },
- {
- "id": "min",
- "value": 0
+ "value": 90
},
{
"id": "custom.cellOptions",
"value": {
- "type": "gauge",
- "mode": "basic",
- "valueDisplayMode": "color"
+ "type": "auto"
}
}
]
@@ -871,7 +939,7 @@
"properties": [
{
"id": "custom.width",
- "value": 170
+ "value": 200
}
]
}
diff --git a/observability/local/grafana/provisioning/dashboards/claude-token-cost.json b/observability/local/grafana/provisioning/dashboards/claude-token-cost.json
new file mode 100644
index 0000000..d03f5fd
--- /dev/null
+++ b/observability/local/grafana/provisioning/dashboards/claude-token-cost.json
@@ -0,0 +1,1540 @@
+{
+ "id": null,
+ "uid": "claude-token-cost",
+ "title": "Claude Code — Token & Cost Intelligence",
+ "description": "Token usage, cost tracking, cache economics, and model efficiency for Claude Code sessions.",
+ "tags": [
+ "claude-code",
+ "tokens",
+ "cost",
+ "observability"
+ ],
+ "timezone": "browser",
+ "editable": true,
+ "graphTooltip": 1,
+ "time": {
+ "from": "now-7d",
+ "to": "now"
+ },
+ "refresh": "30s",
+ "schemaVersion": 39,
+ "fiscalYearStartMonth": 0,
+ "liveNow": false,
+ "style": "dark",
+ "templating": {
+ "list": [
+ {
+ "name": "model",
+ "label": "Model",
+ "type": "query",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "query": "{app=\"claude-token-metrics\"} | json",
+ "regex": "\"model\":\"([^\"]+)\"",
+ "refresh": 2,
+ "includeAll": true,
+ "multi": true,
+ "allValue": ".*",
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "sort": 1
+ },
+ {
+ "name": "project",
+ "label": "Project",
+ "type": "query",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "query": "{app=\"claude-token-metrics\"} | json",
+ "regex": "\"project\":\"([^\"]+)\"",
+ "refresh": 2,
+ "includeAll": true,
+ "multi": true,
+ "allValue": ".*",
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "sort": 1
+ },
+ {
+ "name": "effort",
+ "label": "Effort",
+ "type": "query",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "query": "{app=\"claude-token-metrics\"} | json",
+ "regex": "\"effort\":\"([^\"]+)\"",
+ "refresh": 2,
+ "includeAll": true,
+ "multi": true,
+ "allValue": ".*",
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "sort": 1
+ }
+ ]
+ },
+ "panels": [
+ {
+ "type": "row",
+ "title": "Spend Summary",
+ "collapsed": false,
+ "gridPos": {
+ "x": 0,
+ "y": 0,
+ "w": 24,
+ "h": 1
+ }
+ },
+ {
+ "id": 1,
+ "title": "Total Spend",
+ "type": "stat",
+ "gridPos": {
+ "x": 0,
+ "y": 1,
+ "w": 6,
+ "h": 5
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__interval]))",
+ "legendFormat": "Total Spend",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "calcs": ["sum"],
+ "fields": "",
+ "values": false
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "$0.00",
+ "unit": "currencyUSD",
+ "decimals": 4,
+ "color": {
+ "mode": "thresholds"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" },
+ { "value": 5, "color": "#FF9830" },
+ { "value": 20, "color": "#F2495C" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 2,
+ "title": "Sessions",
+ "type": "stat",
+ "gridPos": {
+ "x": 6,
+ "y": 1,
+ "w": 6,
+ "h": 5
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json | hook_type=\"session-end\" | project=~\"$project\" [$__interval]))",
+ "legendFormat": "Sessions",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "calcs": ["sum"],
+ "fields": "",
+ "values": false
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "decimals": 0,
+ "color": {
+ "mode": "fixed",
+ "fixedColor": "#5794F2"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#5794F2" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 3,
+ "title": "Avg Cost / Session",
+ "type": "stat",
+ "gridPos": {
+ "x": 12,
+ "y": 1,
+ "w": 6,
+ "h": 5
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__range]))",
+ "legendFormat": "Total Spend",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "B",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json | hook_type=\"session-end\" | project=~\"$project\" [$__range]))",
+ "legendFormat": "Sessions",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "C",
+ "datasource": {
+ "type": "__expr__",
+ "uid": "__expr__"
+ },
+ "type": "math",
+ "expression": "$A / $B",
+ "hide": false
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "$0.00",
+ "unit": "currencyUSD",
+ "decimals": 4,
+ "color": {
+ "mode": "thresholds"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" },
+ { "value": 5, "color": "#FF9830" },
+ { "value": 20, "color": "#F2495C" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 4,
+ "title": "Projected Monthly",
+ "type": "stat",
+ "gridPos": {
+ "x": 18,
+ "y": 1,
+ "w": 6,
+ "h": 5
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(rate({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__range])) * 2592000",
+ "legendFormat": "Projected Monthly",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "$0.00",
+ "unit": "currencyUSD",
+ "decimals": 2,
+ "color": {
+ "mode": "thresholds"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" },
+ { "value": 100, "color": "#FF9830" },
+ { "value": 300, "color": "#F2495C" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Spend Trend",
+ "collapsed": false,
+ "gridPos": {
+ "x": 0,
+ "y": 6,
+ "w": 24,
+ "h": 1
+ }
+ },
+ {
+ "id": 5,
+ "title": "Cost Over Time",
+ "type": "timeseries",
+ "gridPos": {
+ "x": 0,
+ "y": 7,
+ "w": 24,
+ "h": 8
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__interval]))",
+ "legendFormat": "{{model}}",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "legend": {
+ "displayMode": "list",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "currencyUSD",
+ "decimals": 4,
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "fillOpacity": 18,
+ "gradientMode": "opacity",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "showPoints": "auto",
+ "spanNulls": 3600000,
+ "stacking": {
+ "mode": "normal",
+ "group": "A"
+ },
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "USD",
+ "scaleDistribution": {
+ "type": "linear"
+ }
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Where Money Goes",
+ "collapsed": false,
+ "gridPos": {
+ "x": 0,
+ "y": 15,
+ "w": 24,
+ "h": 1
+ }
+ },
+ {
+ "id": 6,
+ "title": "Cost by Model",
+ "type": "piechart",
+ "gridPos": {
+ "x": 0,
+ "y": 16,
+ "w": 8,
+ "h": 8
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__interval]))",
+ "legendFormat": "{{model}}",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "pieType": "donut",
+ "displayLabels": ["name", "percent"],
+ "reduceOptions": {
+ "calcs": ["sum"],
+ "fields": "",
+ "values": false
+ },
+ "legend": {
+ "displayMode": "list",
+ "placement": "right",
+ "values": ["value", "percent"]
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "currencyUSD",
+ "decimals": 4,
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 7,
+ "title": "Cost by Project",
+ "type": "piechart",
+ "gridPos": {
+ "x": 8,
+ "y": 16,
+ "w": 8,
+ "h": 8
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum by (project) (sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__interval]))",
+ "legendFormat": "{{project}}",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "pieType": "donut",
+ "displayLabels": ["name", "percent"],
+ "reduceOptions": {
+ "calcs": ["sum"],
+ "fields": "",
+ "values": false
+ },
+ "legend": {
+ "displayMode": "list",
+ "placement": "right",
+ "values": ["value", "percent"]
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "currencyUSD",
+ "decimals": 4,
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 8,
+ "title": "Sessions by Effort",
+ "type": "piechart",
+ "gridPos": {
+ "x": 16,
+ "y": 16,
+ "w": 8,
+ "h": 8
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum by (effort) (count_over_time({app=\"claude-token-metrics\"} | json model, project, effort, cost_usd | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | cost_usd != \"\" [$__interval]))",
+ "legendFormat": "{{effort}}",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "pieType": "donut",
+ "displayLabels": ["name", "percent"],
+ "reduceOptions": {
+ "calcs": ["sum"],
+ "fields": "",
+ "values": false
+ },
+ "legend": {
+ "displayMode": "list",
+ "placement": "right",
+ "values": ["value", "percent"]
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Cache Economics",
+ "collapsed": false,
+ "gridPos": {
+ "x": 0,
+ "y": 24,
+ "w": 24,
+ "h": 1
+ }
+ },
+ {
+ "id": 9,
+ "title": "Context from Cache",
+ "description": "% of all context tokens served from cache. High values are expected — Claude Code reuses a large context window. Every turn hits the cache; this measures token efficiency, not request hit rate.",
+ "type": "gauge",
+ "gridPos": {
+ "x": 0,
+ "y": 25,
+ "w": 8,
+ "h": 7
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__range]))",
+ "legendFormat": "Cache Read",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "B",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__range]))",
+ "legendFormat": "Fresh Input",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "D",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__range]))",
+ "legendFormat": "Cache Creation",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "C",
+ "datasource": {
+ "type": "__expr__",
+ "uid": "__expr__"
+ },
+ "type": "math",
+ "expression": "$A / ($A + $B + $D) * 100",
+ "hide": false
+ }
+ ],
+ "options": {
+ "orientation": "auto",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true,
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percent",
+ "min": 0,
+ "max": 100,
+ "color": {
+ "mode": "thresholds"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#F2495C" },
+ { "value": 30, "color": "#FF9830" },
+ { "value": 50, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 10,
+ "title": "Total Cache Read Tokens",
+ "type": "stat",
+ "gridPos": {
+ "x": 8,
+ "y": 25,
+ "w": 8,
+ "h": 7
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))",
+ "legendFormat": "Cache Read Tokens",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "calcs": ["sum"],
+ "fields": "",
+ "values": false
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "unit": "short",
+ "decimals": 1,
+ "color": {
+ "mode": "fixed",
+ "fixedColor": "#73BF69"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 11,
+ "title": "Cache Trend",
+ "description": "Cache Read vs Cache Creation tokens per interval. Cache Creation (right axis) is orders of magnitude smaller — dual axes show both trends clearly.",
+ "type": "timeseries",
+ "gridPos": {
+ "x": 16,
+ "y": 25,
+ "w": 8,
+ "h": 7
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))",
+ "legendFormat": "Cache Read",
+ "queryType": "range"
+ },
+ {
+ "refId": "B",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))",
+ "legendFormat": "Cache Creation",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "legend": {
+ "displayMode": "list",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "fillOpacity": 20,
+ "lineWidth": 2,
+ "pointSize": 5,
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "mode": "none",
+ "group": "A"
+ },
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "scaleDistribution": {
+ "type": "linear"
+ }
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": { "id": "byName", "options": "Cache Creation" },
+ "properties": [
+ { "id": "custom.axisPlacement", "value": "right" },
+ { "id": "color", "value": { "mode": "fixed", "fixedColor": "#FF9830" } }
+ ]
+ }
+ ]
+ }
+ },
+ {
+ "type": "row",
+ "title": "Token Flow",
+ "collapsed": false,
+ "gridPos": {
+ "x": 0,
+ "y": 32,
+ "w": 24,
+ "h": 1
+ }
+ },
+ {
+ "id": 12,
+ "title": "Token Type Distribution",
+ "type": "piechart",
+ "gridPos": {
+ "x": 0,
+ "y": 33,
+ "w": 8,
+ "h": 8
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_input_tokens [$__interval]))",
+ "legendFormat": "Input",
+ "queryType": "range"
+ },
+ {
+ "refId": "B",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__interval]))",
+ "legendFormat": "Output",
+ "queryType": "range"
+ },
+ {
+ "refId": "C",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_read_tokens [$__interval]))",
+ "legendFormat": "Cache Read",
+ "queryType": "range"
+ },
+ {
+ "refId": "D",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_cache_creation_tokens [$__interval]))",
+ "legendFormat": "Cache Creation",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "pieType": "donut",
+ "displayLabels": ["name", "percent"],
+ "reduceOptions": {
+ "calcs": ["sum"],
+ "fields": "",
+ "values": false
+ },
+ "legend": {
+ "displayMode": "list",
+ "placement": "right",
+ "values": ["value", "percent"]
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "decimals": 1,
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 13,
+ "title": "Total Output Tokens",
+ "description": "Total output tokens generated across all sessions in the selected time range.",
+ "type": "stat",
+ "gridPos": {
+ "x": 8,
+ "y": 33,
+ "w": 5,
+ "h": 7
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__interval]))",
+ "legendFormat": "Output Tokens",
+ "queryType": "range"
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "calcs": ["sum"],
+ "fields": "",
+ "values": false
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "unit": "short",
+ "decimals": 1,
+ "color": {
+ "mode": "fixed",
+ "fixedColor": "#B877D9"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#B877D9" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 14,
+ "title": "Output Tokens / Dollar",
+ "description": "Output tokens generated per dollar spent. Higher = more efficient.",
+ "type": "stat",
+ "gridPos": {
+ "x": 13,
+ "y": 33,
+ "w": 5,
+ "h": 7
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__range]))",
+ "legendFormat": "Output Tokens",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "B",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__range]))",
+ "legendFormat": "Cost",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "C",
+ "datasource": {
+ "type": "__expr__",
+ "uid": "__expr__"
+ },
+ "type": "math",
+ "expression": "$A / $B",
+ "hide": false
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "decimals": 0,
+ "unit": "locale",
+ "color": {
+ "mode": "fixed",
+ "fixedColor": "#73BF69"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 15,
+ "title": "Avg Turns / Session",
+ "type": "stat",
+ "gridPos": {
+ "x": 18,
+ "y": 33,
+ "w": 6,
+ "h": 7
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(sum_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap assistant_turns [$__range]))",
+ "legendFormat": "Total Turns",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "B",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "sum(count_over_time({app=\"claude-dev-logging\", component=\"lifecycle\"} | json | hook_type=\"session-end\" | project=~\"$project\" [$__range]))",
+ "legendFormat": "Sessions",
+ "queryType": "range",
+ "hide": true
+ },
+ {
+ "refId": "C",
+ "datasource": {
+ "type": "__expr__",
+ "uid": "__expr__"
+ },
+ "type": "math",
+ "expression": "$A / $B",
+ "hide": false
+ }
+ ],
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ }
+ },
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "decimals": 1,
+ "color": {
+ "mode": "fixed",
+ "fixedColor": "#5794F2"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#5794F2" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Model Economics",
+ "collapsed": false,
+ "gridPos": {
+ "x": 0,
+ "y": 40,
+ "w": 24,
+ "h": 1
+ }
+ },
+ {
+ "id": 16,
+ "title": "Avg Cost per Session / Model",
+ "type": "barchart",
+ "gridPos": {
+ "x": 0,
+ "y": 41,
+ "w": 12,
+ "h": 8
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "avg by (model) (avg_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap cost_usd [$__range]))",
+ "legendFormat": "{{model}}",
+ "queryType": "instant"
+ }
+ ],
+ "options": {
+ "orientation": "horizontal",
+ "barWidth": 0.8,
+ "groupWidth": 0.7,
+ "showValue": "always",
+ "stacking": "none",
+ "legend": {
+ "displayMode": "list",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ },
+ "xTickLabelRotation": 0
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "currencyUSD",
+ "decimals": 2,
+ "color": {
+ "mode": "palette-classic"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "id": 17,
+ "title": "Avg Output Tokens per Model",
+ "type": "barchart",
+ "gridPos": {
+ "x": 12,
+ "y": 41,
+ "w": 12,
+ "h": 8
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "avg by (model) (avg_over_time({app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\" | unwrap total_output_tokens [$__range]))",
+ "legendFormat": "{{model}}",
+ "queryType": "instant"
+ }
+ ],
+ "options": {
+ "orientation": "horizontal",
+ "barWidth": 0.8,
+ "groupWidth": 0.7,
+ "showValue": "always",
+ "stacking": "none",
+ "legend": {
+ "displayMode": "list",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ },
+ "xTickLabelRotation": 0
+ },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "decimals": 1,
+ "color": {
+ "mode": "palette-classic"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "type": "row",
+ "title": "Session Economics",
+ "collapsed": false,
+ "gridPos": {
+ "x": 0,
+ "y": 49,
+ "w": 24,
+ "h": 1
+ }
+ },
+ {
+ "id": 18,
+ "title": "Session Breakdown",
+ "type": "table",
+ "gridPos": {
+ "x": 0,
+ "y": 50,
+ "w": 24,
+ "h": 10
+ },
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "targets": [
+ {
+ "refId": "A",
+ "datasource": {
+ "type": "loki",
+ "uid": "loki_local"
+ },
+ "expr": "{app=\"claude-token-metrics\"} | json | model=~\"$model\" | project=~\"$project\" | effort=~\"$effort\"",
+ "legendFormat": "",
+ "queryType": "range"
+ }
+ ],
+ "transformations": [
+ {
+ "id": "extractFields",
+ "options": {
+ "source": "Line",
+ "format": "json",
+ "replace": false,
+ "keepTime": true
+ }
+ },
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "session_id": {
+ "aggregations": [],
+ "operation": "groupby"
+ },
+ "model": {
+ "aggregations": ["lastNotNull"],
+ "operation": "aggregate"
+ },
+ "effort": {
+ "aggregations": ["lastNotNull"],
+ "operation": "aggregate"
+ },
+ "cost_usd": {
+ "aggregations": ["sum"],
+ "operation": "aggregate"
+ },
+ "total_input_tokens": {
+ "aggregations": ["sum"],
+ "operation": "aggregate"
+ },
+ "total_output_tokens": {
+ "aggregations": ["sum"],
+ "operation": "aggregate"
+ },
+ "total_cache_read_tokens": {
+ "aggregations": ["sum"],
+ "operation": "aggregate"
+ },
+ "assistant_turns": {
+ "aggregations": ["sum"],
+ "operation": "aggregate"
+ }
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+ "Line": true,
+ "id": true,
+ "tsNs": true,
+ "labels": true,
+ "Time": true
+ },
+ "indexByName": {
+ "session_id": 0,
+ "model (lastNotNull)": 1,
+ "effort (lastNotNull)": 2,
+ "cost_usd (sum)": 3,
+ "total_input_tokens (sum)": 4,
+ "total_output_tokens (sum)": 5,
+ "total_cache_read_tokens (sum)": 6,
+ "assistant_turns (sum)": 7
+ },
+ "renameByName": {
+ "session_id": "Session",
+ "model (lastNotNull)": "Model",
+ "effort (lastNotNull)": "Effort",
+ "cost_usd (sum)": "Cost (USD)",
+ "total_input_tokens (sum)": "Input Tokens",
+ "total_output_tokens (sum)": "Output Tokens",
+ "total_cache_read_tokens (sum)": "Cache Read",
+ "assistant_turns (sum)": "Turns"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "field": "Cost (USD)",
+ "desc": true
+ }
+ ]
+ }
+ }
+ ],
+ "options": {
+ "showHeader": true,
+ "cellHeight": "sm",
+ "footer": {
+ "show": true,
+ "reducer": ["sum"],
+ "countRows": false,
+ "fields": ["Cost (USD)", "Input Tokens", "Output Tokens", "Cache Read"]
+ },
+ "sortBy": [
+ {
+ "displayName": "Cost (USD)",
+ "desc": true
+ }
+ ]
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Cost (USD)"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "currencyUSD"
+ },
+ {
+ "id": "decimals",
+ "value": 4
+ },
+ {
+ "id": "custom.cellOptions",
+ "value": {
+ "type": "color-background",
+ "mode": "gradient"
+ }
+ },
+ {
+ "id": "thresholds",
+ "value": {
+ "mode": "absolute",
+ "steps": [
+ { "value": null, "color": "#73BF69" },
+ { "value": 1, "color": "#FF9830" },
+ { "value": 5, "color": "#F2495C" }
+ ]
+ }
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Input Tokens"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "locale"
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Output Tokens"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "locale"
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Cache Read"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "locale"
+ }
+ ]
+ }
+ ]
+ }
+ }
+ ]
+}
diff --git a/observability/local/grafana/provisioning/dashboards/claude-token-usage.json b/observability/local/grafana/provisioning/dashboards/claude-token-usage.json
deleted file mode 100644
index fe8f1ad..0000000
--- a/observability/local/grafana/provisioning/dashboards/claude-token-usage.json
+++ /dev/null
@@ -1,893 +0,0 @@
-{
- "id": null,
- "uid": "claude-token-usage",
- "title": "Claude Code — Token Usage",
- "description": "Token consumption, estimated cost, cache efficiency, and session trends across Claude Code sessions.",
- "tags": ["claude-code", "tokens", "cost", "observability"],
- "timezone": "browser",
- "editable": true,
- "graphTooltip": 1,
- "time": { "from": "now-7d", "to": "now" },
- "refresh": "1m",
- "schemaVersion": 39,
- "fiscalYearStartMonth": 0,
- "liveNow": false,
- "style": "dark",
- "templating": {
- "list": [
- {
- "name": "model",
- "label": "Model",
- "type": "query",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "query": "{app=\"claude-token-metrics\"} | json",
- "regex": "\"model\":\"([^\"]+)\"",
- "refresh": 2,
- "includeAll": true,
- "multi": true,
- "allValue": ".*",
- "current": { "text": "All", "value": "$__all" },
- "sort": 1
- },
- {
- "name": "project",
- "label": "Project",
- "type": "query",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "query": "{app=\"claude-token-metrics\"} | json",
- "regex": "\"project\":\"([^\"]+)\"",
- "refresh": 2,
- "includeAll": true,
- "multi": true,
- "allValue": ".*",
- "current": { "text": "All", "value": "$__all" },
- "sort": 1
- },
- {
- "name": "effort",
- "label": "Effort",
- "type": "query",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "query": "{app=\"claude-token-metrics\"} | json",
- "regex": "\"effort\":\"([^\"]+)\"",
- "refresh": 2,
- "includeAll": true,
- "multi": true,
- "allValue": ".*",
- "current": { "text": "All", "value": "$__all" },
- "sort": 1
- },
- {
- "name": "session_id",
- "label": "Session",
- "type": "query",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "query": "{app=\"claude-token-metrics\"} | json",
- "regex": "\"session_id\":\"([^\"]+)\"",
- "refresh": 2,
- "includeAll": true,
- "multi": false,
- "allValue": ".*",
- "current": { "text": "All", "value": "$__all" },
- "sort": 0
- }
- ]
- },
- "panels": [
- {
- "type": "row",
- "title": "Cost Summary",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 0, "w": 24, "h": 1 }
- },
- {
- "id": 1,
- "title": "Output Tokens",
- "description": "Total output (generated) tokens in the selected time range.",
- "type": "stat",
- "transparent": true,
- "gridPos": { "x": 0, "y": 1, "w": 5, "h": 5 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_output_tokens [$__range]))",
- "queryType": "range"
- }
- ],
- "options": {
- "colorMode": "background-gradient",
- "graphMode": "area",
- "textMode": "auto",
- "wideLayout": true,
- "justifyMode": "auto",
- "orientation": "auto",
- "text": { "titleSize": 12, "valueSize": 32 },
- "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "fixed", "fixedColor": "#5794F2" },
- "unit": "short",
- "decimals": 0,
- "thresholds": { "mode": "absolute", "steps": [{ "value": null, "color": "#5794F2" }] }
- },
- "overrides": []
- }
- },
- {
- "id": 2,
- "title": "Est. Cost (USD)",
- "description": "Estimated total spend based on Anthropic public pricing. Cache reads are priced at 10% of input rate.",
- "type": "stat",
- "transparent": true,
- "gridPos": { "x": 5, "y": 1, "w": 5, "h": 5 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [$__range]))",
- "queryType": "range"
- }
- ],
- "options": {
- "colorMode": "background-gradient",
- "graphMode": "area",
- "textMode": "auto",
- "wideLayout": true,
- "justifyMode": "auto",
- "orientation": "auto",
- "text": { "titleSize": 12, "valueSize": 32 },
- "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "thresholds" },
- "unit": "currencyUSD",
- "decimals": 2,
- "thresholds": {
- "mode": "absolute",
- "steps": [
- { "value": null, "color": "#73BF69" },
- { "value": 5, "color": "#FADE2A" },
- { "value": 20, "color": "#FF9830" },
- { "value": 50, "color": "#F2495C" }
- ]
- }
- },
- "overrides": []
- }
- },
- {
- "id": 3,
- "title": "Cache Hit Rate",
- "description": "Fraction of read tokens served from cache (cache_read / (input + cache_creation + cache_read)). Higher is better — reduces cost and latency.",
- "type": "stat",
- "transparent": true,
- "gridPos": { "x": 10, "y": 1, "w": 5, "h": 5 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__range]))",
- "queryType": "range",
- "hide": true
- },
- {
- "refId": "B",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_input_tokens [$__range]))",
- "queryType": "range",
- "hide": true
- },
- {
- "refId": "C",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__range]))",
- "queryType": "range",
- "hide": true
- },
- {
- "refId": "D",
- "datasource": { "type": "__expr__", "uid": "__expr__" },
- "type": "math",
- "expression": "($A / ($A + $B + $C)) * 100",
- "hide": false
- }
- ],
- "options": {
- "colorMode": "background-gradient",
- "graphMode": "none",
- "textMode": "auto",
- "wideLayout": true,
- "justifyMode": "auto",
- "orientation": "auto",
- "text": { "titleSize": 12, "valueSize": 32 },
- "reduceOptions": { "values": false, "calcs": ["lastNotNull"], "fields": "" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "thresholds" },
- "unit": "percent",
- "decimals": 1,
- "min": 0,
- "max": 100,
- "thresholds": {
- "mode": "absolute",
- "steps": [
- { "value": null, "color": "#F2495C" },
- { "value": 40, "color": "#FF9830" },
- { "value": 70, "color": "#73BF69" }
- ]
- }
- },
- "overrides": []
- }
- },
- {
- "id": 4,
- "title": "Sessions",
- "description": "Number of completed Claude Code sessions in the selected time range.",
- "type": "stat",
- "transparent": true,
- "gridPos": { "x": 15, "y": 1, "w": 4, "h": 5 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(count_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" [$__range]))",
- "queryType": "range"
- }
- ],
- "options": {
- "colorMode": "none",
- "graphMode": "area",
- "textMode": "auto",
- "wideLayout": true,
- "justifyMode": "auto",
- "orientation": "auto",
- "text": { "titleSize": 12, "valueSize": 32 },
- "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "fixed", "fixedColor": "#A0A0A0" },
- "unit": "short",
- "decimals": 0,
- "thresholds": { "mode": "absolute", "steps": [{ "value": null, "color": "#A0A0A0" }] }
- },
- "overrides": []
- }
- },
- {
- "id": 5,
- "title": "Avg Cost / Session",
- "description": "Average estimated cost per completed session.",
- "type": "stat",
- "transparent": true,
- "gridPos": { "x": 19, "y": 1, "w": 5, "h": 5 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "avg_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [$__range])",
- "queryType": "range"
- }
- ],
- "options": {
- "colorMode": "background-gradient",
- "graphMode": "none",
- "textMode": "auto",
- "wideLayout": true,
- "justifyMode": "auto",
- "orientation": "auto",
- "text": { "titleSize": 12, "valueSize": 32 },
- "reduceOptions": { "values": false, "calcs": ["mean"], "fields": "" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "thresholds" },
- "unit": "currencyUSD",
- "decimals": 3,
- "thresholds": {
- "mode": "absolute",
- "steps": [
- { "value": null, "color": "#73BF69" },
- { "value": 1, "color": "#FADE2A" },
- { "value": 5, "color": "#FF9830" },
- { "value": 15, "color": "#F2495C" }
- ]
- }
- },
- "overrides": []
- }
- },
-
- {
- "type": "row",
- "title": "Token Burn — All Types",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 6, "w": 24, "h": 1 }
- },
- {
- "id": 6,
- "title": "Token Consumption Over Time",
- "description": "Stacked view of all four token categories per session window. Cache reads typically dominate — that's a good sign.",
- "type": "timeseries",
- "transparent": true,
- "gridPos": { "x": 0, "y": 7, "w": 24, "h": 10 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "Cache Read",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval]))",
- "legendFormat": "Cache Read",
- "queryType": "range"
- },
- {
- "refId": "Output",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_output_tokens [$__interval]))",
- "legendFormat": "Output",
- "queryType": "range"
- },
- {
- "refId": "Cache Create",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__interval]))",
- "legendFormat": "Cache Create",
- "queryType": "range"
- },
- {
- "refId": "Input",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_input_tokens [$__interval]))",
- "legendFormat": "Input",
- "queryType": "range"
- }
- ],
- "options": {
- "legend": {
- "displayMode": "table",
- "placement": "right",
- "calcs": ["sum", "mean", "max"]
- },
- "tooltip": { "mode": "multi", "sort": "desc" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "palette-classic" },
- "unit": "short",
- "custom": {
- "drawStyle": "line",
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "fillOpacity": 20,
- "gradientMode": "opacity",
- "showPoints": "never",
- "spanNulls": false,
- "axisBorderShow": false,
- "stacking": { "mode": "normal", "group": "A" }
- }
- },
- "overrides": [
- {
- "matcher": { "id": "byName", "options": "Cache Read" },
- "properties": [
- { "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } },
- { "id": "custom.fillOpacity", "value": 25 }
- ]
- },
- {
- "matcher": { "id": "byName", "options": "Output" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byName", "options": "Cache Create" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byName", "options": "Input" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#6C7280", "mode": "fixed" } }]
- }
- ]
- }
- },
-
- {
- "type": "row",
- "title": "Daily Usage by Model",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 17, "w": 24, "h": 1 }
- },
- {
- "id": 7,
- "title": "Output Tokens per Day — by Model",
- "description": "Stacked daily bars showing output token volume per model. Reveals model switching and high-burn days at a glance.",
- "type": "timeseries",
- "transparent": true,
- "gridPos": { "x": 0, "y": 18, "w": 16, "h": 9 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_output_tokens [$__interval]))",
- "legendFormat": "{{model}}",
- "queryType": "range"
- }
- ],
- "options": {
- "legend": {
- "displayMode": "table",
- "placement": "right",
- "calcs": ["sum", "max"]
- },
- "tooltip": { "mode": "multi", "sort": "desc" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "palette-classic" },
- "unit": "short",
- "custom": {
- "drawStyle": "bars",
- "lineWidth": 1,
- "fillOpacity": 80,
- "gradientMode": "none",
- "showPoints": "never",
- "spanNulls": false,
- "axisBorderShow": false,
- "stacking": { "mode": "normal", "group": "A" },
- "barAlignment": 0
- }
- },
- "overrides": [
- {
- "matcher": { "id": "byRegexp", "options": ".*opus.*" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byRegexp", "options": ".*sonnet.*" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byRegexp", "options": ".*haiku.*" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }]
- }
- ]
- }
- },
- {
- "id": 8,
- "title": "Spend by Model",
- "description": "Cumulative estimated cost share per model over the selected period.",
- "type": "piechart",
- "transparent": true,
- "gridPos": { "x": 16, "y": 18, "w": 8, "h": 9 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum by (model) (sum_over_time({app=\"claude-token-metrics\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [$__range]))",
- "legendFormat": "{{model}}",
- "queryType": "range"
- }
- ],
- "options": {
- "pieType": "donut",
- "displayLabels": ["name", "percent"],
- "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
- "tooltip": { "mode": "multi" },
- "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "palette-classic" },
- "unit": "currencyUSD",
- "decimals": 3
- },
- "overrides": [
- {
- "matcher": { "id": "byRegexp", "options": ".*opus.*" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byRegexp", "options": ".*sonnet.*" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byRegexp", "options": ".*haiku.*" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }]
- }
- ]
- }
- },
-
- {
- "type": "row",
- "title": "Effort & Cache Efficiency",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 27, "w": 24, "h": 1 }
- },
- {
- "id": 9,
- "title": "Cost by Effort Level",
- "description": "Standard = default mode. Extended thinking = thinking blocks enabled. Fast = /fast mode.",
- "type": "barchart",
- "transparent": true,
- "gridPos": { "x": 0, "y": 28, "w": 8, "h": 9 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum by (effort) (sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [$__range]))",
- "legendFormat": "{{effort}}",
- "queryType": "range"
- }
- ],
- "transformations": [
- { "id": "reduce", "options": { "reducers": ["sum"] } },
- { "id": "sortBy", "options": { "fields": [{ "desc": true, "displayName": "Sum" }] } }
- ],
- "options": {
- "orientation": "horizontal",
- "barWidth": 0.7,
- "groupWidth": 0.7,
- "showValue": "always",
- "stacking": "none",
- "xTickLabelMaxLength": 24,
- "legend": { "displayMode": "hidden" },
- "tooltip": { "mode": "multi" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "palette-classic" },
- "unit": "currencyUSD",
- "decimals": 3,
- "custom": { "fillOpacity": 80, "gradientMode": "none" }
- },
- "overrides": [
- {
- "matcher": { "id": "byName", "options": "standard" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byName", "options": "extended_thinking" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byName", "options": "fast" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }]
- }
- ]
- }
- },
- {
- "id": 10,
- "title": "Output Tokens by Effort Level",
- "description": "Session count and output token volume per effort mode.",
- "type": "barchart",
- "transparent": true,
- "gridPos": { "x": 8, "y": 28, "w": 8, "h": 9 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum by (effort) (sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\"} | json | session_id=~\"$session_id\" | unwrap total_output_tokens [$__range]))",
- "legendFormat": "{{effort}}",
- "queryType": "range"
- }
- ],
- "transformations": [
- { "id": "reduce", "options": { "reducers": ["sum"] } },
- { "id": "sortBy", "options": { "fields": [{ "desc": true, "displayName": "Sum" }] } }
- ],
- "options": {
- "orientation": "horizontal",
- "barWidth": 0.7,
- "groupWidth": 0.7,
- "showValue": "always",
- "stacking": "none",
- "xTickLabelMaxLength": 24,
- "legend": { "displayMode": "hidden" },
- "tooltip": { "mode": "multi" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "palette-classic" },
- "unit": "short",
- "decimals": 0,
- "custom": { "fillOpacity": 80, "gradientMode": "none" }
- },
- "overrides": [
- {
- "matcher": { "id": "byName", "options": "standard" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byName", "options": "extended_thinking" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }]
- },
- {
- "matcher": { "id": "byName", "options": "fast" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }]
- }
- ]
- }
- },
- {
- "id": 11,
- "title": "Cache Efficiency Over Time",
- "description": "Cache hit rate (%) per session window. Sustained high rates mean context is being efficiently reused across turns.",
- "type": "timeseries",
- "transparent": true,
- "gridPos": { "x": 16, "y": 28, "w": 8, "h": 9 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_read_tokens [$__interval]))",
- "queryType": "range",
- "hide": true
- },
- {
- "refId": "B",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_input_tokens [$__interval]))",
- "queryType": "range",
- "hide": true
- },
- {
- "refId": "C",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap total_cache_creation_tokens [$__interval]))",
- "queryType": "range",
- "hide": true
- },
- {
- "refId": "CacheRate",
- "datasource": { "type": "__expr__", "uid": "__expr__" },
- "type": "math",
- "expression": "($A / ($A + $B + $C)) * 100",
- "legendFormat": "Cache Hit %"
- }
- ],
- "options": {
- "legend": { "displayMode": "hidden" },
- "tooltip": { "mode": "single", "sort": "none" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "thresholds" },
- "unit": "percent",
- "min": 0,
- "max": 100,
- "custom": {
- "drawStyle": "line",
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "fillOpacity": 20,
- "gradientMode": "scheme",
- "showPoints": "always",
- "pointSize": 5,
- "spanNulls": false,
- "axisBorderShow": false,
- "thresholdsStyle": { "mode": "area" }
- },
- "thresholds": {
- "mode": "absolute",
- "steps": [
- { "value": null, "color": "#F2495C" },
- { "value": 40, "color": "#FF9830" },
- { "value": 70, "color": "#73BF69" }
- ]
- }
- },
- "overrides": []
- }
- },
-
- {
- "type": "row",
- "title": "Session Leaderboard",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 37, "w": 24, "h": 1 }
- },
- {
- "id": 12,
- "title": "Top Sessions by Cost",
- "description": "Most expensive sessions in the selected period. Bar length = estimated USD spend. Identify long/costly outlier sessions here.",
- "type": "barchart",
- "transparent": true,
- "gridPos": { "x": 0, "y": 38, "w": 14, "h": 12 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "topk(15, sum by (session_id) (sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json session_id=\"session_id\" | session_id=~\"$session_id\" | unwrap cost_usd [$__range])))",
- "legendFormat": "{{session_id}}",
- "queryType": "range"
- }
- ],
- "transformations": [
- { "id": "reduce", "options": { "reducers": ["sum"] } },
- { "id": "sortBy", "options": { "fields": [{ "desc": true, "displayName": "Sum" }] } }
- ],
- "options": {
- "orientation": "horizontal",
- "barWidth": 0.7,
- "groupWidth": 0.7,
- "showValue": "always",
- "stacking": "none",
- "xTickLabelMaxLength": 28,
- "legend": { "displayMode": "hidden" },
- "tooltip": { "mode": "single" }
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "continuous-GrYlRd"
- },
- "unit": "currencyUSD",
- "decimals": 3,
- "custom": { "fillOpacity": 85, "gradientMode": "none" }
- },
- "overrides": []
- }
- },
- {
- "id": 13,
- "title": "Recent Session Log",
- "description": "Raw session records. Each line = one completed Claude Code session. Includes model, effort, cost, token counts, and turns.",
- "type": "logs",
- "transparent": true,
- "gridPos": { "x": 14, "y": 38, "w": 10, "h": 12 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "{app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | line_format \"{{.model}} | {{.effort}} | ${{.cost_usd}} | out={{.total_output_tokens}} | turns={{.assistant_turns}} | cache={{.total_cache_read_tokens}} | {{.session_id}}\"",
- "queryType": "range"
- }
- ],
- "options": {
- "dedupStrategy": "none",
- "enableLogDetails": true,
- "prettifyLogMessage": false,
- "showCommonLabels": false,
- "showLabels": false,
- "showTime": true,
- "sortOrder": "Descending",
- "wrapLogMessage": false
- }
- },
-
- {
- "type": "row",
- "title": "Cost Trend",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 50, "w": 24, "h": 1 }
- },
- {
- "id": 14,
- "title": "Daily Spend Trend",
- "description": "Estimated USD cost per day. Spot cost spikes and track efficiency gains over time.",
- "type": "timeseries",
- "transparent": true,
- "gridPos": { "x": 0, "y": 51, "w": 16, "h": 8 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum(sum_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap cost_usd [1d]))",
- "legendFormat": "Daily Cost",
- "queryType": "range"
- }
- ],
- "options": {
- "legend": { "displayMode": "hidden" },
- "tooltip": { "mode": "single", "sort": "none" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "fixed", "fixedColor": "#FADE2A" },
- "unit": "currencyUSD",
- "decimals": 2,
- "custom": {
- "drawStyle": "bars",
- "lineWidth": 1,
- "fillOpacity": 70,
- "gradientMode": "opacity",
- "showPoints": "never",
- "spanNulls": false,
- "axisBorderShow": false,
- "barAlignment": 0
- },
- "thresholds": {
- "mode": "absolute",
- "steps": [
- { "value": null, "color": "#73BF69" },
- { "value": 5, "color": "#FF9830" },
- { "value": 15, "color": "#F2495C" }
- ]
- }
- },
- "overrides": []
- }
- },
- {
- "id": 15,
- "title": "Assistant Turns per Session",
- "description": "Distribution of session depth (number of back-and-forth turns). Long sessions = more complex work or exploration.",
- "type": "timeseries",
- "transparent": true,
- "gridPos": { "x": 16, "y": 51, "w": 8, "h": 8 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "avg_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap assistant_turns [$__interval])",
- "legendFormat": "Avg Turns",
- "queryType": "range"
- },
- {
- "refId": "B",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "max_over_time({app=\"claude-token-metrics\",model=~\"$model\",project=~\"$project\",effort=~\"$effort\"} | json | session_id=~\"$session_id\" | unwrap assistant_turns [$__interval])",
- "legendFormat": "Max Turns",
- "queryType": "range"
- }
- ],
- "options": {
- "legend": { "displayMode": "list", "placement": "bottom" },
- "tooltip": { "mode": "multi", "sort": "desc" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "palette-classic" },
- "unit": "short",
- "decimals": 0,
- "custom": {
- "drawStyle": "line",
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "fillOpacity": 10,
- "gradientMode": "none",
- "showPoints": "always",
- "pointSize": 5,
- "spanNulls": false,
- "axisBorderShow": false
- }
- },
- "overrides": [
- {
- "matcher": { "id": "byName", "options": "Max Turns" },
- "properties": [
- { "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } },
- { "id": "custom.lineWidth", "value": 1 },
- { "id": "custom.lineStyle", "value": { "dash": [4, 4], "fill": "dash" } }
- ]
- },
- {
- "matcher": { "id": "byName", "options": "Avg Turns" },
- "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }]
- }
- ]
- }
- }
- ]
-}
diff --git a/observability/local/grafana/provisioning/dashboards/contextstream-deep-dive.json b/observability/local/grafana/provisioning/dashboards/contextstream-deep-dive.json
deleted file mode 100644
index 1a918b1..0000000
--- a/observability/local/grafana/provisioning/dashboards/contextstream-deep-dive.json
+++ /dev/null
@@ -1,432 +0,0 @@
-{
- "id": null,
- "uid": "contextstream-deep-dive",
- "title": "ContextStream — Deep Dive",
- "description": "MCP call patterns, action distribution, object lifecycle, and performance for ContextStream integration.",
- "tags": ["claude-code", "contextstream", "mcp"],
- "timezone": "browser",
- "editable": true,
- "graphTooltip": 1,
- "time": { "from": "now-6h", "to": "now" },
- "refresh": "30s",
- "schemaVersion": 39,
- "fiscalYearStartMonth": 0,
- "liveNow": false,
- "style": "dark",
- "templating": {
- "list": [
- {
- "name": "session_id",
- "label": "Session",
- "type": "query",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "query": "{app=\"claude-dev-logging\", component=\"lifecycle\"} | json | hook_type=\"session-start\"",
- "regex": "session_id\":\"([^\"]+)",
- "refresh": 2,
- "includeAll": true,
- "allValue": ".*",
- "current": { "text": "All", "value": "$__all" },
- "sort": 2
- },
- {
- "name": "cs_action",
- "label": "Action",
- "type": "query",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "query": "{app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | hook_type=\"post-tool-use\"",
- "regex": "hook_payload_tool_input_action\":\"([^\"]+)",
- "refresh": 2,
- "includeAll": true,
- "allValue": ".*",
- "current": { "text": "All", "value": "$__all" },
- "sort": 1
- }
- ]
- },
- "panels": [
- {
- "type": "row",
- "title": "MCP Call Summary",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 0, "w": 24, "h": 1 }
- },
- {
- "id": 1,
- "title": "Total CS Calls",
- "type": "stat",
- "transparent": true,
- "gridPos": { "x": 0, "y": 1, "w": 6, "h": 5 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range])",
- "queryType": "range"
- }
- ],
- "options": {
- "colorMode": "background-gradient",
- "graphMode": "area",
- "textMode": "auto",
- "wideLayout": true,
- "justifyMode": "auto",
- "orientation": "auto",
- "text": { "titleSize": 12, "valueSize": 32 },
- "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "fixed", "fixedColor": "#B877D9" },
- "thresholds": { "mode": "absolute", "steps": [{ "value": null, "color": "#B877D9" }] }
- },
- "overrides": []
- }
- },
- {
- "id": 2,
- "title": "Failure Rate",
- "type": "gauge",
- "transparent": true,
- "gridPos": { "x": 6, "y": 1, "w": 6, "h": 5 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "errors",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\", level=\"ERROR\"} | json | session_id=~\"$session_id\" [$__range])",
- "queryType": "range",
- "hide": true
- },
- {
- "refId": "total",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range])",
- "queryType": "range",
- "hide": true
- },
- {
- "refId": "rate",
- "datasource": { "type": "__expr__", "uid": "__expr__" },
- "type": "math",
- "expression": "$errors / $total * 100"
- }
- ],
- "options": {
- "showThresholdLabels": false,
- "showThresholdMarkers": true,
- "reduceOptions": { "values": false, "calcs": ["lastNotNull"], "fields": "" }
- },
- "fieldConfig": {
- "defaults": {
- "unit": "percent",
- "min": 0,
- "max": 100,
- "color": { "mode": "thresholds" },
- "thresholds": {
- "mode": "absolute",
- "steps": [
- { "value": null, "color": "#73BF69" },
- { "value": 5, "color": "#FF9830" },
- { "value": 20, "color": "#F2495C" }
- ]
- }
- },
- "overrides": []
- }
- },
- {
- "id": 3,
- "title": "Calls by Tool",
- "type": "piechart",
- "transparent": true,
- "gridPos": { "x": 12, "y": 1, "w": 12, "h": 5 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum by (tool_name) (count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range]))",
- "queryType": "range"
- }
- ],
- "options": {
- "pieType": "donut",
- "displayLabels": ["percent"],
- "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
- "tooltip": { "mode": "multi" },
- "reduceOptions": { "values": false, "calcs": ["sum"], "fields": "" }
- },
- "fieldConfig": {
- "defaults": { "color": { "mode": "palette-classic" } },
- "overrides": [
- { "matcher": { "id": "byName", "options": "mcp__contextstream__memory" }, "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "mcp__contextstream__session" }, "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "mcp__contextstream__search" }, "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "mcp__contextstream__context" }, "properties": [{ "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "mcp__contextstream__init" }, "properties": [{ "id": "color", "value": { "fixedColor": "#FADE2A", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "mcp__contextstream__workspace" }, "properties": [{ "id": "color", "value": { "fixedColor": "#8AB8FF", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "mcp__contextstream__project" }, "properties": [{ "id": "color", "value": { "fixedColor": "#CA95E5", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "mcp__contextstream__help" }, "properties": [{ "id": "color", "value": { "fixedColor": "#96D98D", "mode": "fixed" } }] }
- ]
- }
- },
- {
- "type": "row",
- "title": "Action Patterns",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 7, "w": 24, "h": 1 }
- },
- {
- "id": 4,
- "title": "Action Breakdown",
- "description": "Which MCP actions are called most frequently.",
- "type": "barchart",
- "transparent": true,
- "gridPos": { "x": 0, "y": 8, "w": 12, "h": 10 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum by (hook_payload_tool_input_action) (count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__range]))",
- "queryType": "range"
- }
- ],
- "options": {
- "orientation": "horizontal",
- "barWidth": 0.7,
- "groupWidth": 0.7,
- "showValue": "auto",
- "stacking": "none",
- "legend": { "displayMode": "hidden" },
- "tooltip": { "mode": "multi" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "fixed", "fixedColor": "#B877D9" },
- "custom": { "fillOpacity": 80, "gradientMode": "hue" }
- },
- "overrides": []
- }
- },
- {
- "id": 5,
- "title": "Action Mix Over Time",
- "description": "How action usage patterns shift during a session.",
- "type": "timeseries",
- "transparent": true,
- "gridPos": { "x": 12, "y": 8, "w": 12, "h": 10 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum by (hook_payload_tool_input_action) (count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" [$__interval]))",
- "legendFormat": "{{hook_payload_tool_input_action}}",
- "queryType": "range"
- }
- ],
- "options": {
- "legend": { "displayMode": "table", "placement": "right", "calcs": ["sum"] },
- "tooltip": { "mode": "multi", "sort": "desc" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "palette-classic" },
- "custom": {
- "drawStyle": "line",
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "fillOpacity": 10,
- "gradientMode": "opacity",
- "showPoints": "never",
- "spanNulls": false,
- "axisBorderShow": false,
- "stacking": { "mode": "none", "group": "A" }
- }
- },
- "overrides": []
- }
- },
- {
- "type": "row",
- "title": "Object Lifecycle",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 19, "w": 24, "h": 1 }
- },
- {
- "id": 6,
- "title": "CRUD Operations",
- "description": "Create, read, update, and query operation distribution.",
- "type": "barchart",
- "transparent": true,
- "gridPos": { "x": 0, "y": 20, "w": 12, "h": 9 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "Create",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | hook_payload_tool_input_action=~\"create.*|capture.*|import.*|remember\" [$__range])",
- "legendFormat": "Create",
- "queryType": "range"
- },
- {
- "refId": "Read",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | hook_payload_tool_input_action=~\"get.*|list.*\" [$__range])",
- "legendFormat": "Read",
- "queryType": "range"
- },
- {
- "refId": "Update",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | hook_payload_tool_input_action=~\"update.*|supersede.*|complete.*|reorder.*\" [$__range])",
- "legendFormat": "Update",
- "queryType": "range"
- },
- {
- "refId": "Query",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | hook_payload_tool_input_action=~\"search|recall|decisions|summary|timeline|smart_search|decision_trace\" [$__range])",
- "legendFormat": "Query",
- "queryType": "range"
- }
- ],
- "options": {
- "orientation": "vertical",
- "barWidth": 0.6,
- "groupWidth": 0.7,
- "showValue": "always",
- "stacking": "none",
- "legend": { "displayMode": "list", "placement": "bottom" },
- "tooltip": { "mode": "multi" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "palette-classic" },
- "custom": { "fillOpacity": 80, "gradientMode": "hue" }
- },
- "overrides": [
- { "matcher": { "id": "byName", "options": "Create" }, "properties": [{ "id": "color", "value": { "fixedColor": "#73BF69", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "Read" }, "properties": [{ "id": "color", "value": { "fixedColor": "#5794F2", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "Update" }, "properties": [{ "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }] },
- { "matcher": { "id": "byName", "options": "Query" }, "properties": [{ "id": "color", "value": { "fixedColor": "#B877D9", "mode": "fixed" } }] }
- ]
- }
- },
- {
- "id": 7,
- "title": "Context & Search Patterns",
- "description": "Context refresh frequency and search mode usage over time.",
- "type": "timeseries",
- "transparent": true,
- "gridPos": { "x": 12, "y": 20, "w": 12, "h": 9 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "context",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum by (tool_name) (count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name=~\".*init|.*context\" [$__interval]))",
- "legendFormat": "{{tool_name}}",
- "queryType": "range"
- },
- {
- "refId": "search",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "sum by (hook_payload_tool_input_mode) (count_over_time({app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | hook_type=\"post-tool-use\" | tool_name=~\".*search\" [$__interval]))",
- "legendFormat": "search:{{hook_payload_tool_input_mode}}",
- "queryType": "range"
- }
- ],
- "options": {
- "legend": { "displayMode": "table", "placement": "right", "calcs": ["sum"] },
- "tooltip": { "mode": "multi", "sort": "desc" }
- },
- "fieldConfig": {
- "defaults": {
- "color": { "mode": "palette-classic" },
- "custom": {
- "drawStyle": "line",
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "fillOpacity": 15,
- "gradientMode": "opacity",
- "showPoints": "never",
- "spanNulls": false,
- "axisBorderShow": false,
- "stacking": { "mode": "none", "group": "A" }
- }
- },
- "overrides": []
- }
- },
- {
- "type": "row",
- "title": "Errors & Failures",
- "collapsed": false,
- "gridPos": { "x": 0, "y": 30, "w": 24, "h": 1 }
- },
- {
- "id": 8,
- "title": "Error Log",
- "description": "All ContextStream MCP errors.",
- "type": "logs",
- "transparent": true,
- "gridPos": { "x": 0, "y": 31, "w": 24, "h": 8 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "{app=\"claude-dev-logging\", component=\"mcp-contextstream\", level=\"ERROR\"} | json | session_id=~\"$session_id\"",
- "queryType": "range"
- }
- ],
- "options": {
- "showTime": true,
- "showLabels": false,
- "showCommonLabels": false,
- "wrapLogMessage": true,
- "prettifyLogMessage": false,
- "enableLogDetails": true,
- "sortOrder": "Descending",
- "dedupStrategy": "none"
- }
- },
- {
- "type": "row",
- "title": "Full Log Stream",
- "collapsed": true,
- "gridPos": { "x": 0, "y": 40, "w": 24, "h": 1 },
- "panels": [
- {
- "id": 9,
- "title": "ContextStream Logs",
- "type": "logs",
- "transparent": true,
- "gridPos": { "x": 0, "y": 41, "w": 24, "h": 14 },
- "datasource": { "type": "loki", "uid": "loki_local" },
- "targets": [
- {
- "refId": "A",
- "datasource": { "type": "loki", "uid": "loki_local" },
- "expr": "{app=\"claude-dev-logging\", component=\"mcp-contextstream\"} | json | session_id=~\"$session_id\" | line_format \"{{.tool_name}} | action={{.hook_payload_tool_input_action}} | {{.hook_type}}\"",
- "queryType": "range"
- }
- ],
- "options": {
- "showTime": true,
- "showLabels": false,
- "showCommonLabels": false,
- "wrapLogMessage": true,
- "prettifyLogMessage": false,
- "enableLogDetails": true,
- "sortOrder": "Descending",
- "dedupStrategy": "none"
- }
- }
- ]
- }
- ]
-}
diff --git a/observability/local/grafana/provisioning/dashboards/simsteward-deploy-health.json b/observability/local/grafana/provisioning/dashboards/simsteward-deploy-health.json
index 7be1cdb..83539b8 100644
--- a/observability/local/grafana/provisioning/dashboards/simsteward-deploy-health.json
+++ b/observability/local/grafana/provisioning/dashboards/simsteward-deploy-health.json
@@ -1,5 +1,6 @@
{
"annotations": { "list": [] },
+ "description": "Deploy health — deploy markers, plugin bring-up, bridge start, and error volume. deploy.ps1 pushes event=deploy_marker when SIMSTEWARD_LOKI_URL is set. post_deploy_warn=true means post-deploy tests/*.ps1 failed.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
@@ -8,19 +9,146 @@
"liveNow": false,
"panels": [
{
- "gridPos": { "h": 3, "w": 24, "x": 0, "y": 0 },
- "id": 1,
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#5794F2", "value": null }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 5, "w": 8, "x": 0, "y": 0 },
+ "id": 8,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\"} | json | event=\"deploy_marker\" [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Deploys This Period",
+ "type": "stat"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#F2495C", "value": 1 }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 5, "w": 8, "x": 8, "y": 0 },
+ "id": 9,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\"} | json | event=\"deploy_marker\" | post_deploy_warn=\"true\" [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Post-Deploy Warnings",
+ "type": "stat"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#F2495C", "value": 1 }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 5, "w": 8, "x": 16, "y": 0 },
+ "id": 10,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\"} | json | event=\"bridge_start_failed\" [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Bridge Start Failures",
+ "type": "stat"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "gradientMode": "opacity",
+ "fillOpacity": 20,
+ "lineWidth": 2,
+ "pointSize": 5,
+ "showPoints": "auto",
+ "spanNulls": 3600000,
+ "axisLabel": "Deploys"
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 24, "x": 0, "y": 5 },
+ "id": 11,
"options": {
- "code": { "language": "markdown", "showLineNumbers": false, "showMiniMap": false },
- "content": "**Deploy health** — Correlates `deploy.ps1` with plugin bring-up in Loki.\n\n- **Deploy markers** — Lines pushed at end of `deploy.ps1` when `SIMSTEWARD_LOKI_URL` is set (`event=deploy_marker`). `post_deploy_warn=true` means post-deploy `tests/*.ps1` failed after retry.\n- **Plugin / bridge** — `plugin_ready` and `bridge_start_failed` show whether SimHub loaded the plugin and WebSocket started.\n- **Errors** — Structured ERROR lines; spike after a bad deploy often means SimHub/plugin mismatch or WS failure.\n\nOpen repo `deploy.ps1` console output for copy failures; this dashboard is **telemetry**, not a full deploy log.",
- "mode": "markdown"
+ "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true },
+ "tooltip": { "mode": "single", "sort": "none" }
},
- "title": "About",
- "type": "text"
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\"} | json | event=\"deploy_marker\" [$__interval]))",
+ "legendFormat": "Deploys",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Deploy Frequency",
+ "type": "timeseries"
},
{
"datasource": { "type": "loki", "uid": "loki_local" },
- "gridPos": { "h": 10, "w": 24, "x": 0, "y": 3 },
+ "gridPos": { "h": 10, "w": 24, "x": 0, "y": 11 },
"id": 2,
"options": {
"dedupStrategy": "none",
@@ -41,12 +169,12 @@
"refId": "A"
}
],
- "title": "Deploy markers (deploy.ps1 → Loki)",
+ "title": "Deploy Markers (deploy.ps1 → Loki)",
"type": "logs"
},
{
"datasource": { "type": "loki", "uid": "loki_local" },
- "gridPos": { "h": 9, "w": 12, "x": 0, "y": 13 },
+ "gridPos": { "h": 9, "w": 12, "x": 0, "y": 21 },
"id": 3,
"options": {
"dedupStrategy": "none",
@@ -67,12 +195,12 @@
"refId": "A"
}
],
- "title": "Plugin / bridge lifecycle",
+ "title": "Plugin / Bridge Lifecycle",
"type": "logs"
},
{
"datasource": { "type": "loki", "uid": "loki_local" },
- "gridPos": { "h": 9, "w": 12, "x": 12, "y": 13 },
+ "gridPos": { "h": 9, "w": 12, "x": 12, "y": 21 },
"id": 4,
"options": {
"dedupStrategy": "none",
@@ -93,28 +221,30 @@
"refId": "A"
}
],
- "title": "WebSocket bridge failures",
+ "title": "WebSocket Bridge Failures",
"type": "logs"
},
{
"datasource": { "type": "loki", "uid": "loki_local" },
"fieldConfig": {
"defaults": {
- "color": { "mode": "palette-classic" },
+ "color": { "mode": "fixed", "fixedColor": "#F2495C" },
"custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "drawStyle": "bars",
- "fillOpacity": 40,
- "lineWidth": 1,
- "showPoints": "never"
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "gradientMode": "opacity",
+ "fillOpacity": 25,
+ "lineWidth": 2,
+ "pointSize": 5,
+ "showPoints": "auto",
+ "spanNulls": 3600000,
+ "axisLabel": "Errors"
},
"unit": "short"
},
"overrides": []
},
- "gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 },
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 30 },
"id": 5,
"options": {
"legend": { "displayMode": "list", "placement": "bottom", "showLegend": true },
@@ -124,18 +254,18 @@
{
"datasource": { "type": "loki", "uid": "loki_local" },
"editorMode": "code",
- "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\", level=\"ERROR\"} [5m]))",
- "legendFormat": "ERROR lines / 5m",
+ "expr": "sum(count_over_time({app=\"sim-steward\", env=\"${env}\", level=\"ERROR\"} [$__interval]))",
+ "legendFormat": "ERROR lines / interval",
"queryType": "range",
"refId": "A"
}
],
- "title": "ERROR log volume (5m buckets)",
+ "title": "ERROR Log Volume",
"type": "timeseries"
},
{
"datasource": { "type": "loki", "uid": "loki_local" },
- "gridPos": { "h": 12, "w": 24, "x": 0, "y": 30 },
+ "gridPos": { "h": 12, "w": 24, "x": 0, "y": 38 },
"id": 6,
"options": {
"dedupStrategy": "none",
@@ -156,12 +286,12 @@
"refId": "A"
}
],
- "title": "Recent ERROR lines (full)",
+ "title": "Recent ERROR Lines (Full)",
"type": "logs"
},
{
"datasource": { "type": "loki", "uid": "loki_local" },
- "gridPos": { "h": 10, "w": 24, "x": 0, "y": 42 },
+ "gridPos": { "h": 10, "w": 24, "x": 0, "y": 50 },
"id": 7,
"options": {
"dedupStrategy": "none",
@@ -182,7 +312,7 @@
"refId": "A"
}
],
- "title": "Failed actions (action_result success=false)",
+ "title": "Failed Actions (action_result success=false)",
"type": "logs"
}
],
@@ -210,7 +340,7 @@
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "browser",
- "title": "Sim Steward — Deploy health",
+ "title": "Sim Steward — Deploy Health",
"uid": "simsteward-deploy-health",
"version": 1,
"weekStart": ""
diff --git a/observability/local/grafana/provisioning/dashboards/simsteward-log-sentinel.json b/observability/local/grafana/provisioning/dashboards/simsteward-log-sentinel.json
new file mode 100644
index 0000000..79bb00d
--- /dev/null
+++ b/observability/local/grafana/provisioning/dashboards/simsteward-log-sentinel.json
@@ -0,0 +1,749 @@
+{
+ "annotations": { "list": [] },
+ "description": "Autonomous log-analysis pipeline — 16 detectors (app + ops), three-tier LLM (T0 detect → T1 deduplicate → T2 investigate). Cycle every 5 min. component=log-sentinel.",
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 1,
+ "id": null,
+ "links": [],
+ "liveNow": false,
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+ "id": 100,
+ "title": "Sentinel Health",
+ "type": "row"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 6, "x": 0, "y": 1 },
+ "id": 2,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({component=\"log-sentinel\"} | json | event=\"sentinel_cycle\" [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Cycles Completed",
+ "type": "stat"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "drawStyle": "line",
+ "fillOpacity": 18,
+ "gradientMode": "opacity",
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "showPoints": "auto",
+ "spanNulls": 3600000
+ },
+ "unit": "ms"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 10, "x": 6, "y": 1 },
+ "id": 3,
+ "options": {
+ "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true },
+ "tooltip": { "mode": "single", "sort": "none" }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "avg(avg_over_time({component=\"log-sentinel\"} | json | event=\"sentinel_cycle\" | unwrap duration_ms [$__interval]))",
+ "legendFormat": "Avg Cycle Duration",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Cycle Duration",
+ "type": "timeseries"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#F2495C", "value": 1 }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 8, "x": 16, "y": 1 },
+ "id": 4,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_detector_run\"} | json | error != \"\" [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Detector Errors",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
+ "id": 101,
+ "title": "Findings Overview",
+ "type": "row"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "fillOpacity": 18,
+ "gradientMode": "opacity",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "showPoints": "auto",
+ "spanNulls": 3600000,
+ "axisLabel": "Findings",
+ "stacking": { "mode": "normal", "group": "A" }
+ }
+ },
+ "overrides": [
+ {
+ "matcher": { "id": "byName", "options": "App" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "#F2495C", "mode": "fixed" } }]
+ },
+ {
+ "matcher": { "id": "byName", "options": "Ops" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }]
+ }
+ ]
+ },
+ "gridPos": { "h": 8, "w": 16, "x": 0, "y": 8 },
+ "id": 17,
+ "options": {
+ "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"app\" [$__interval]))",
+ "legendFormat": "App",
+ "queryType": "range",
+ "refId": "A"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"ops\" [$__interval]))",
+ "legendFormat": "Ops",
+ "queryType": "range",
+ "refId": "B"
+ }
+ ],
+ "title": "Findings Over Time",
+ "type": "timeseries"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 8, "x": 16, "y": 8 },
+ "id": 18,
+ "options": {
+ "pieType": "donut",
+ "displayLabels": ["name", "percent"],
+ "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
+ "tooltip": { "mode": "single" },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum by (severity) (count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json [$__range]))",
+ "legendFormat": "{{severity}}",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Findings by Severity",
+ "type": "piechart"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 16 },
+ "id": 102,
+ "title": "App Findings",
+ "type": "row"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "gridPos": { "h": 8, "w": 16, "x": 0, "y": 17 },
+ "id": 6,
+ "options": {
+ "dedupStrategy": "none",
+ "enableLogDetails": true,
+ "prettifyLogMessage": true,
+ "showCommonLabels": false,
+ "showLabels": false,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "{component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"app\"",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "App Findings",
+ "type": "logs"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#FF9830", "value": 5 }, { "color": "#F2495C", "value": 15 }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 8, "x": 16, "y": 17 },
+ "id": 7,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"app\" [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "App Finding Count",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 },
+ "id": 103,
+ "title": "Ops Findings",
+ "type": "row"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "gridPos": { "h": 8, "w": 16, "x": 0, "y": 26 },
+ "id": 8,
+ "options": {
+ "dedupStrategy": "none",
+ "enableLogDetails": true,
+ "prettifyLogMessage": true,
+ "showCommonLabels": false,
+ "showLabels": false,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "{component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"ops\"",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Ops Findings",
+ "type": "logs"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }, { "color": "#FF9830", "value": 5 }, { "color": "#F2495C", "value": 15 }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 8, "x": 16, "y": 26 },
+ "id": 9,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_finding\"} | json | category=\"ops\" [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Ops Finding Count",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 34 },
+ "id": 110,
+ "title": "Per-Detector Timing",
+ "type": "row"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "fillOpacity": 10,
+ "gradientMode": "opacity",
+ "lineWidth": 2,
+ "pointSize": 4,
+ "showPoints": "auto",
+ "spanNulls": 3600000,
+ "axisLabel": "Duration (ms)"
+ },
+ "unit": "ms"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 35 },
+ "id": 19,
+ "options": {
+ "legend": { "displayMode": "table", "placement": "right", "showLegend": true, "calcs": ["mean", "max"] },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "avg by (detector) (avg_over_time({component=\"log-sentinel\"} | json | event=\"sentinel_detector_run\" | unwrap duration_ms [$__interval]))",
+ "legendFormat": "{{detector}}",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Detector Duration by Name",
+ "type": "timeseries"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 43 },
+ "id": 5,
+ "options": {
+ "dedupStrategy": "none",
+ "enableLogDetails": true,
+ "prettifyLogMessage": true,
+ "showCommonLabels": false,
+ "showLabels": false,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "{component=\"log-sentinel\", event=\"sentinel_detector_run\"} | json",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Detector Runs",
+ "type": "logs"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 51 },
+ "id": 104,
+ "title": "T2 LLM Activity",
+ "type": "row"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#73BF69", "value": null }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 6, "x": 0, "y": 52 },
+ "id": 11,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_t2_run\"} [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "T2 Investigations",
+ "type": "stat"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#5794F2", "value": null }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 6, "x": 6, "y": 52 },
+ "id": 12,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_t2_run\"} | json | trigger=\"proactive\" [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Proactive Polls",
+ "type": "stat"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ms",
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#8AB8FF", "value": null }, { "color": "#FF9830", "value": 10000 }, { "color": "#F2495C", "value": 30000 }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 6, "x": 12, "y": 52 },
+ "id": 20,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "avg(avg_over_time({component=\"log-sentinel\"} | json | event=\"sentinel_t2_run\" | unwrap duration_ms [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Avg T2 Duration",
+ "type": "stat"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#B877D9", "value": null }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 6, "x": 18, "y": 52 },
+ "id": 21,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "avg(avg_over_time({component=\"log-sentinel\"} | json | event=\"sentinel_t2_run\" | unwrap tokens_used [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Avg T2 Tokens",
+ "type": "stat"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 58 },
+ "id": 10,
+ "options": {
+ "dedupStrategy": "none",
+ "enableLogDetails": true,
+ "prettifyLogMessage": true,
+ "showCommonLabels": false,
+ "showLabels": false,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "{component=\"log-sentinel\", event=\"sentinel_t2_run\"} | json",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "T2 Run Metrics",
+ "type": "logs"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 66 },
+ "id": 105,
+ "title": "T2 Investigation Reports",
+ "type": "row"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "gridPos": { "h": 10, "w": 24, "x": 0, "y": 67 },
+ "id": 13,
+ "options": {
+ "dedupStrategy": "none",
+ "enableLogDetails": true,
+ "prettifyLogMessage": true,
+ "showCommonLabels": false,
+ "showLabels": false,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "{component=\"log-sentinel\", event=\"sentinel_investigation\"} | json",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Investigation Reports",
+ "type": "logs"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 77 },
+ "id": 106,
+ "title": "Sentry Issues",
+ "type": "row"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "gridPos": { "h": 6, "w": 16, "x": 0, "y": 78 },
+ "id": 14,
+ "options": {
+ "dedupStrategy": "none",
+ "enableLogDetails": true,
+ "prettifyLogMessage": true,
+ "showCommonLabels": false,
+ "showLabels": false,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "{component=\"log-sentinel\", event=\"sentinel_sentry_issue\"} | json",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Sentry Issues Created",
+ "type": "logs"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "fieldConfig": {
+ "defaults": {
+ "decimals": 0,
+ "color": { "mode": "thresholds" },
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "#5794F2", "value": null }] }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 8, "x": 16, "y": 78 },
+ "id": 15,
+ "options": {
+ "colorMode": "background-gradient",
+ "graphMode": "area",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "textMode": "value",
+ "text": { "titleSize": 12, "valueSize": 36 },
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "sum(count_over_time({component=\"log-sentinel\", event=\"sentinel_sentry_issue\"} [$__range]))",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Sentry Issues",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 84 },
+ "id": 107,
+ "title": "Process Logs",
+ "type": "row"
+ },
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "gridPos": { "h": 10, "w": 24, "x": 0, "y": 85 },
+ "id": 16,
+ "options": {
+ "dedupStrategy": "none",
+ "enableLogDetails": true,
+ "prettifyLogMessage": true,
+ "showCommonLabels": false,
+ "showLabels": false,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "datasource": { "type": "loki", "uid": "loki_local" },
+ "editorMode": "code",
+ "expr": "{component=\"log-sentinel\", event=\"sentinel_log\"} | json",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Sentinel Process Logs",
+ "type": "logs"
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 39,
+ "tags": ["simsteward", "log-sentinel", "observability"],
+ "templating": {
+ "list": [
+ {
+ "current": { "selected": true, "text": "local", "value": "local" },
+ "hide": 0,
+ "includeAll": false,
+ "label": "env",
+ "name": "env",
+ "options": [
+ { "selected": true, "text": "local", "value": "local" },
+ { "selected": false, "text": "production", "value": "production" }
+ ],
+ "query": "local,production",
+ "skipUrlSync": false,
+ "type": "custom"
+ }
+ ]
+ },
+ "time": { "from": "now-6h", "to": "now" },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "SimSteward — Log Sentinel",
+ "uid": "simsteward-log-sentinel",
+ "version": 1,
+ "weekStart": ""
+}
diff --git a/observability/local/log-sentinel/Dockerfile b/observability/local/log-sentinel/Dockerfile
new file mode 100644
index 0000000..f72d0cb
--- /dev/null
+++ b/observability/local/log-sentinel/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.12-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+EXPOSE 8081
+CMD ["python", "app.py"]
diff --git a/observability/local/log-sentinel/__pycache__/circuit_breaker.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/circuit_breaker.cpython-313.pyc
new file mode 100644
index 0000000..d1e105e
Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/circuit_breaker.cpython-313.pyc differ
diff --git a/observability/local/log-sentinel/__pycache__/config.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/config.cpython-313.pyc
new file mode 100644
index 0000000..321aeef
Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/config.cpython-313.pyc differ
diff --git a/observability/local/log-sentinel/__pycache__/grafana_client.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/grafana_client.cpython-313.pyc
new file mode 100644
index 0000000..0bb3368
Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/grafana_client.cpython-313.pyc differ
diff --git a/observability/local/log-sentinel/__pycache__/loki_client.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/loki_client.cpython-313.pyc
new file mode 100644
index 0000000..b2b45cf
Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/loki_client.cpython-313.pyc differ
diff --git a/observability/local/log-sentinel/__pycache__/loki_handler.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/loki_handler.cpython-313.pyc
new file mode 100644
index 0000000..16bfaca
Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/loki_handler.cpython-313.pyc differ
diff --git a/observability/local/log-sentinel/__pycache__/models.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/models.cpython-313.pyc
new file mode 100644
index 0000000..e8dd266
Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/models.cpython-313.pyc differ
diff --git a/observability/local/log-sentinel/__pycache__/query_cache.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/query_cache.cpython-313.pyc
new file mode 100644
index 0000000..894b25d
Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/query_cache.cpython-313.pyc differ
diff --git a/observability/local/log-sentinel/__pycache__/sentry_client.cpython-313.pyc b/observability/local/log-sentinel/__pycache__/sentry_client.cpython-313.pyc
new file mode 100644
index 0000000..27141cd
Binary files /dev/null and b/observability/local/log-sentinel/__pycache__/sentry_client.cpython-313.pyc differ
diff --git a/observability/local/log-sentinel/analyst.py b/observability/local/log-sentinel/analyst.py
new file mode 100644
index 0000000..ce4f94b
--- /dev/null
+++ b/observability/local/log-sentinel/analyst.py
@@ -0,0 +1,376 @@
+"""LLM-driven analyst — T1 fast scan and T2 deep investigation."""
+
+import json
+import logging
+import re
+import time
+from dataclasses import dataclass, field
+
+from circuit_breaker import CircuitBreaker
+from config import Config
+from loki_client import LokiClient
+from ollama_client import OllamaClient
+from prompts import (
+ T1_SYSTEM, T1_SUMMARY_PROMPT, T1_ANOMALY_PROMPT,
+ T2_SYSTEM, T2_INVESTIGATION_PROMPT,
+ LOGQL_GEN_SYSTEM, LOGQL_GEN_PROMPT,
+ build_stream_guide, format_log_sample, format_logql_results,
+)
+from timeline import TimelineEvent
+
+logger = logging.getLogger("sentinel.analyst")
+
+
+@dataclass
+class T1Result:
+ summary: str
+ cycle_notes: str
+ anomalies: list[dict]
+ model: str
+ summary_duration_ms: int
+ anomaly_duration_ms: int
+ raw_summary_response: str
+ raw_anomaly_response: str
+
+ @property
+ def needs_t2(self) -> bool:
+ return any(a.get("needs_t2") for a in self.anomalies)
+
+ @property
+ def total_duration_ms(self) -> int:
+ return self.summary_duration_ms + self.anomaly_duration_ms
+
+
+@dataclass
+class T2Result:
+ root_cause: str
+ issue_type: str
+ confidence: str
+ correlation: str
+ impact: str
+ recommendation: str
+ logql_queries_used: list[str]
+ sentry_worthy: bool
+ model: str
+ inference_duration_ms: int
+ logql_gather_duration_ms: int
+ raw_response: str = field(repr=False)
+
+ @property
+ def total_duration_ms(self) -> int:
+ return self.inference_duration_ms + self.logql_gather_duration_ms
+
+
+class Analyst:
+ def __init__(
+ self,
+ ollama: OllamaClient,
+ loki: LokiClient,
+ breaker: CircuitBreaker,
+ config: Config,
+ ):
+ self.ollama = ollama
+ self.loki = loki
+ self.breaker = breaker
+ self.config = config
+ self._stream_guide = build_stream_guide()
+
+ # ── T1 ──────────────────────────────────────────────────────────────────
+
+ def run_t1(
+ self,
+ start_ns: int,
+ end_ns: int,
+ counts: dict[str, int],
+ sim_steward_sample: list[dict],
+ claude_dev_sample: list[dict],
+ claude_token_sample: list[dict],
+ ) -> T1Result:
+ window_minutes = max(1, int((end_ns - start_ns) / 1e9 / 60))
+ counts_text = "\n".join(f" {k}: {v}" for k, v in counts.items())
+
+ samples = dict(
+ sim_steward_sample=format_log_sample(sim_steward_sample),
+ sim_steward_count=len(sim_steward_sample),
+ claude_dev_sample=format_log_sample(claude_dev_sample),
+ claude_dev_count=len(claude_dev_sample),
+ claude_token_sample=format_log_sample(claude_token_sample),
+ claude_token_count=len(claude_token_sample),
+ )
+
+ system = T1_SYSTEM.format(stream_guide=self._stream_guide)
+
+ # Call A: summary (/no_think — fast)
+ summary_prompt = T1_SUMMARY_PROMPT.format(
+ window_minutes=window_minutes,
+ counts=counts_text,
+ **samples,
+ )
+ summary_text = ""
+ cycle_notes = ""
+ summary_ms = 0
+ raw_summary = ""
+ try:
+ raw_summary, summary_ms = self.ollama.generate(
+ self.config.ollama_model_fast,
+ system + "\n\n" + summary_prompt,
+ think=False,
+ )
+ self.breaker.record_success()
+ parsed = _parse_json(raw_summary)
+ summary_text = parsed.get("summary", "")
+ cycle_notes = parsed.get("cycle_notes", "")
+ except Exception as e:
+ self.breaker.record_failure()
+ logger.error("T1 summary call failed: %s", e)
+
+ # Call B: anomaly scan (/think — reasoning)
+ anomaly_prompt = T1_ANOMALY_PROMPT.format(
+ summary=summary_text or "(summary unavailable)",
+ counts=counts_text,
+ **samples,
+ )
+ anomalies = []
+ anomaly_ms = 0
+ raw_anomaly = ""
+ try:
+ raw_anomaly, anomaly_ms = self.ollama.generate(
+ self.config.ollama_model_fast,
+ system + "\n\n" + anomaly_prompt,
+ think=True,
+ )
+ self.breaker.record_success()
+ parsed = _parse_json(raw_anomaly)
+ anomalies = _normalize_anomalies(parsed.get("anomalies", []))
+ except Exception as e:
+ self.breaker.record_failure()
+ logger.error("T1 anomaly call failed: %s", e)
+
+ logger.info(
+ "T1 complete: %d anomalies (%d need T2), summary=%dms anomaly=%dms",
+ len(anomalies),
+ sum(1 for a in anomalies if a.get("needs_t2")),
+ summary_ms,
+ anomaly_ms,
+ )
+
+ return T1Result(
+ summary=summary_text,
+ cycle_notes=cycle_notes,
+ anomalies=anomalies,
+ model=self.config.ollama_model_fast,
+ summary_duration_ms=summary_ms,
+ anomaly_duration_ms=anomaly_ms,
+ raw_summary_response=raw_summary,
+ raw_anomaly_response=raw_anomaly,
+ )
+
+ # ── T2 ──────────────────────────────────────────────────────────────────
+
+ def run_t2(
+ self,
+ t1_result: T1Result,
+ timeline: list[TimelineEvent],
+ start_ns: int,
+ end_ns: int,
+ ) -> T2Result:
+ window_minutes = max(1, int((end_ns - start_ns) / 1e9 / 60))
+ t2_anomalies = [a for a in t1_result.anomalies if a.get("needs_t2")]
+
+ # Step 1: generate LogQL queries
+ gather_start = time.time()
+ queries = self._generate_logql_queries(t2_anomalies, window_minutes)
+
+ # Step 2: execute queries
+ logql_results = self._execute_logql_queries(queries, start_ns, end_ns)
+ gather_ms = int((time.time() - gather_start) * 1000)
+
+ # Step 3: build T2 prompt
+ from timeline import TimelineBuilder
+ # Use the simple local formatter below — the timeline is already built, we only need its text form (NOTE(review): the TimelineBuilder import above is unused; consider removing it)
+ timeline_text = _format_timeline_for_prompt(timeline)
+
+ anomaly_descriptions = "\n".join(
+ f"- [{a.get('severity','?').upper()}] {a.get('id','?')}: {a.get('description','')}"
+ for a in t2_anomalies
+ )
+
+ system = T2_SYSTEM.format(stream_guide=self._stream_guide)
+ prompt = T2_INVESTIGATION_PROMPT.format(
+ anomaly_descriptions=anomaly_descriptions,
+ window_minutes=window_minutes,
+ timeline_text=timeline_text,
+ logql_results=format_logql_results(logql_results),
+ logql_queries_list=json.dumps(queries),
+ )
+
+ # Step 4: T2 inference
+ raw = ""
+ infer_ms = 0
+ try:
+ raw, infer_ms = self.ollama.generate(
+ self.config.ollama_model_deep,
+ system + "\n\n" + prompt,
+ think=True,
+ )
+ self.breaker.record_success()
+ except Exception as e:
+ self.breaker.record_failure()
+ logger.error("T2 inference failed: %s", e)
+
+ parsed = _parse_json(raw)
+ result = T2Result(
+ root_cause=parsed.get("root_cause", "Unable to determine root cause."),
+ issue_type=_normalize_issue_type(parsed.get("issue_type", "unknown")),
+ confidence=_normalize_confidence(parsed.get("confidence", "low")),
+ correlation=parsed.get("correlation", "No correlations identified."),
+ impact=parsed.get("impact", "Impact unknown."),
+ recommendation=parsed.get("recommendation", "Investigate manually."),
+ logql_queries_used=queries,
+ sentry_worthy=bool(parsed.get("sentry_worthy", False)),
+ model=self.config.ollama_model_deep,
+ inference_duration_ms=infer_ms,
+ logql_gather_duration_ms=gather_ms,
+ raw_response=raw,
+ )
+
+ logger.info(
+ "T2 complete: confidence=%s sentry=%s gather=%dms infer=%dms queries=%d",
+ result.confidence, result.sentry_worthy,
+ gather_ms, infer_ms, len(queries),
+ )
+ return result
+
+ # ── LogQL helpers ────────────────────────────────────────────────────────
+
+ def _generate_logql_queries(
+ self,
+ anomalies: list[dict],
+ window_minutes: int,
+ ) -> list[str]:
+ if not anomalies:
+ return []
+
+ # Seed with any suggested_logql from T1
+ seeded = [a.get("suggested_logql", "") for a in anomalies if a.get("suggested_logql")]
+
+ anomaly_descriptions = "\n".join(
+ f"- {a.get('id','?')}: {a.get('description','')}" for a in anomalies[:5]
+ )
+ prompt = LOGQL_GEN_SYSTEM + "\n\n" + LOGQL_GEN_PROMPT.format(
+ anomaly_descriptions=anomaly_descriptions,
+ window_minutes=window_minutes,
+ )
+ try:
+ raw, _ = self.ollama.generate(
+ self.config.ollama_model_fast,
+ prompt,
+ think=False,
+ temperature=0.0,
+ )
+ generated = json.loads(raw) if raw.strip().startswith("[") else []
+ if isinstance(generated, list):
+ # Combine seeded + generated, validate all
+ combined = seeded + [q for q in generated if isinstance(q, str)]
+ valid = [q.strip() for q in combined if _valid_logql(q)]
+ return valid[:5]
+ except Exception as e:
+ logger.warning("LogQL gen failed: %s", e)
+
+ # Fall back to seeded only
+ return [q for q in seeded if _valid_logql(q)][:5]
+
+ def _execute_logql_queries(
+ self,
+ queries: list[str],
+ start_ns: int,
+ end_ns: int,
+ ) -> dict[str, list[dict]]:
+ results = {}
+ for query in queries:
+ try:
+ lines = self.loki.query_lines(query, start_ns, end_ns, limit=50)
+ results[query] = lines
+ except Exception as e:
+ logger.warning("LogQL execute failed (%s): %s", query[:60], e)
+ results[query] = []
+ return results
+
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+
+def _parse_json(text: str) -> dict:
+ """Best-effort JSON extraction: direct parse first, then the outermost {...} or [...] span; returns {} on failure."""
+ if not text:
+ return {}
+ # Try direct parse first
+ text = text.strip()
+ try:
+ return json.loads(text)
+ except json.JSONDecodeError:
+ pass
+ # Find first {...} or [...] block
+ for start_char, end_char in [('{', '}'), ('[', ']')]:
+ start = text.find(start_char)
+ end = text.rfind(end_char)
+ if start != -1 and end > start:
+ try:
+ return json.loads(text[start:end + 1])
+ except json.JSONDecodeError:
+ pass
+ return {}
+
+
+def _normalize_anomalies(raw: list) -> list[dict]:
+ if not isinstance(raw, list):
+ return []
+ valid = []
+ for a in raw:
+ if not isinstance(a, dict):
+ continue
+ valid.append({
+ "id": str(a.get("id", "unknown"))[:64],
+ "stream": a.get("stream", "unknown"),
+ "description": str(a.get("description", ""))[:500],
+ "severity": a.get("severity", "info") if a.get("severity") in ("info", "warn", "critical") else "info",
+ "needs_t2": bool(a.get("needs_t2", False)),
+ "suggested_logql": str(a.get("suggested_logql", ""))[:300],
+ })
+ return valid
+
+
+def _normalize_confidence(v: str) -> str:
+ return v if v in ("high", "medium", "low") else "low"
+
+
+def _normalize_issue_type(v: str) -> str:
+ valid = ("error_spike", "config", "regression", "user_behavior", "infra", "unknown")
+ return v if v in valid else "unknown"
+
+
+def _valid_logql(q: str) -> bool:
+ q = q.strip()
+ return bool(q) and q.startswith("{") and "|" in q
+
+
+def _format_timeline_for_prompt(events: list[TimelineEvent], max_events: int = 60) -> str:
+ """Minimal timeline formatter used by the analyst (kept local so this module does not depend on TimelineBuilder)."""
+ if not events:
+ return "(no timeline events)"
+
+ truncated = len(events) > max_events
+ shown = events[-max_events:] if truncated else events
+
+ lines = []
+ for i, ev in enumerate(shown, 1):
+ try:
+ t = ev.ts_iso[11:19]
+ except (IndexError, TypeError):
+ t = "??:??:??"
+ sid = f" session={ev.session_id[:8]}" if ev.session_id else ""
+ lines.append(f" [{i:03d}] {t} {ev.stream:<25} {ev.event_type}{sid}")
+
+ if truncated:
+ lines.append(f" [... {len(events) - max_events} earlier events not shown]")
+
+ return "\n".join(lines)
diff --git a/observability/local/log-sentinel/app.py b/observability/local/log-sentinel/app.py
new file mode 100644
index 0000000..c8af441
--- /dev/null
+++ b/observability/local/log-sentinel/app.py
@@ -0,0 +1,151 @@
+"""Log Sentinel v3 — Flask health/status/trigger + background sentinel loop."""
+
+import logging
+import threading
+import time
+
+from flask import Flask, jsonify, request
+
+from config import Config
+from loki_handler import LokiHandler
+from sentinel import Sentinel
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s %(name)-20s %(levelname)-5s %(message)s",
+)
+
+config = Config.from_env()
+
+# Push process logs to Loki
+loki_handler = LokiHandler(config.loki_url, env=config.env_label)
+loki_handler.setLevel(logging.INFO)
+logging.getLogger().addHandler(loki_handler)
+
+app = Flask(__name__)
+sentinel = Sentinel(config)
+
+
+@app.route("/health", methods=["GET"])
+def health():
+ return jsonify({"status": "ok", "service": "log-sentinel", "version": "3.0"})
+
+
+@app.route("/run", methods=["POST"])
+def manual_run():
+ result = sentinel.run_cycle()
+ return jsonify({
+ "status": "ok",
+ "cycle_id": result.cycle_id,
+ "cycle_num": result.cycle_num,
+ "window_minutes": result.window_minutes,
+ "timeline_event_count": result.timeline_event_count,
+ "anomaly_count": result.anomaly_count,
+ "duration_ms": result.duration_ms,
+ "summary": result.t1.summary if result.t1 else None,
+ "anomalies": result.t1.anomalies if result.t1 else [],
+ "evidence_packet_count": len(result.t1.evidence_packets) if result.t1 else 0,
+ "error": result.error,
+ })
+
+
+@app.route("/run_t2", methods=["POST"])
+def manual_run_t2():
+ t = threading.Thread(target=sentinel.run_t2_cycle, daemon=True)
+ t.start()
+ return jsonify({"status": "accepted", "message": "T2 cycle started in background"})
+
+
+@app.route("/run_t3", methods=["POST"])
+def manual_run_t3():
+ t = threading.Thread(target=sentinel.run_t3_cycle, daemon=True)
+ t.start()
+ return jsonify({"status": "accepted", "message": "T3 cycle started in background"})
+
+
+@app.route("/trigger", methods=["POST"])
+def grafana_trigger():
+ """Receive a Grafana alert webhook: parse each alert, dedup by alertname, then dispatch trigger_cycle() in the background."""
+ payload = request.get_json(silent=True) or {}
+ alerts = payload.get("alerts", [])
+ if not alerts:
+ return jsonify({"status": "ignored", "reason": "no alerts"}), 200
+
+ fired_names = []
+ now = time.time()
+ trigger_tier = "t1"
+ alert_lines = []
+
+ for alert in alerts:
+ labels = alert.get("labels", {})
+ annotations = alert.get("annotations", {})
+ alertname = labels.get("alertname", "unknown")
+ tier = labels.get("trigger_tier", "t1")
+ severity = labels.get("severity", "warn")
+ starts_at = alert.get("startsAt", "")
+
+ # Dedup: skip if same alertname fired within dedup window
+ last_ts = sentinel._trigger_dedup.get(alertname, 0)
+ if now - last_ts < config.dedup_window_sec:
+ continue
+
+ sentinel._trigger_dedup[alertname] = now
+ fired_names.append(alertname)
+ if tier == "t2":
+ trigger_tier = "t2"
+
+ description = annotations.get("description", annotations.get("summary", ""))
+ alert_lines.append(
+ f" Alert: {alertname} ({severity})\n"
+ f" Fired: {starts_at}\n"
+ f" {description}"
+ )
+
+ if not fired_names:
+ return jsonify({"status": "deduped"}), 200
+
+ alert_context = "\n".join(alert_lines)
+ sentinel.loki.push_trigger(
+ {
+ "alertname": ",".join(fired_names),
+ "trigger_tier": trigger_tier,
+ "alert_count": len(fired_names),
+ },
+ env=config.env_label,
+ )
+
+ # Run in background — webhook must return fast
+ t = threading.Thread(
+ target=sentinel.trigger_cycle,
+ args=(alert_context, trigger_tier, fired_names),
+ daemon=True,
+ )
+ t.start()
+
+ return jsonify({"status": "accepted", "alerts": fired_names, "tier": trigger_tier}), 202
+
+
+@app.route("/status", methods=["GET"])
+def status():
+ return jsonify({
+ "version": "3.0",
+ "sentinel_mode": config.sentinel_mode,
+ "t1_interval_sec": config.t1_interval_sec,
+ "t2_interval_sec": config.t2_interval_sec,
+ "t3_interval_sec": config.t3_interval_sec,
+ "lookback_sec": config.lookback_sec,
+ "t2_enabled": config.t2_enabled,
+ "models": {"fast": config.ollama_model_fast, "deep": config.ollama_model_deep},
+ "sentry_enabled": sentinel.sentry.enabled,
+ "stats": sentinel._stats,
+ "circuit_breakers": {
+ "loki": sentinel.loki_breaker.state,
+ "ollama": sentinel.ollama_breaker.state,
+ },
+ })
+
+
+if __name__ == "__main__":
+ t = threading.Thread(target=sentinel.start, daemon=True)
+ t.start()
+ app.run(host="0.0.0.0", port=8081, debug=False)
diff --git a/observability/local/log-sentinel/baseline.py b/observability/local/log-sentinel/baseline.py
new file mode 100644
index 0000000..c50b1e0
--- /dev/null
+++ b/observability/local/log-sentinel/baseline.py
@@ -0,0 +1,229 @@
+"""Baseline manager — rolling stats from Loki → baselines.json.
+
+T3 calls compute_and_save() to recompute baselines from the Loki window.
+T1 calls load() + get_prompt_context() to inject baseline values into its prompt.
+T3 calls get_threshold_recommendations() to surface T0 alert calibration suggestions.
+
+No ML, no LLM — simple rolling math (mean, count rates, p95 where sample size allows).
+"""
+
+import json
+import logging
+import os
+import statistics
+from datetime import datetime, timezone
+
+from loki_client import LokiClient
+
+logger = logging.getLogger("sentinel.baseline")
+
+DEFAULT_PATH = "/data/baselines.json"
+
# Metric definitions: key, logql, how to compute the value.
# Each entry drives one rolling baseline:
#   key         — dotted metric name persisted in baselines.json
#   logql       — Loki query selecting the relevant log lines
#   compute     — aggregation strategy (dispatched in BaselineManager._compute_metric)
#   field       — (field_mean only) the JSON field whose values are averaged
#   description — human-readable label injected into the T1 prompt
_METRICS = [
    {
        "key": "sim_steward.error_rate.per_min",
        "logql": '{app="sim-steward"} | json | level="ERROR"',
        "compute": "rate_per_min",
        "description": "ERROR log rate (per minute)",
    },
    {
        "key": "sim_steward.action_count.per_session",
        "logql": '{app="sim-steward"} | json | event="action_dispatched"',
        "compute": "count_per_session",
        "description": "Actions dispatched per iRacing session",
    },
    {
        "key": "sim_steward.websocket_disconnect.per_hour",
        "logql": '{app="sim-steward"} | json | event="websocket_disconnect"',
        "compute": "rate_per_hour",
        "description": "WebSocket disconnects per hour",
    },
    {
        "key": "claude.cost_per_session.mean_usd",
        "logql": '{app="claude-token-metrics"} | json',
        "compute": "field_mean",
        "field": "cost_usd",
        "description": "Mean Claude session cost (USD)",
    },
    {
        "key": "claude.tool_calls.per_session",
        "logql": '{app="claude-dev-logging"} | json | event="tool_use"',
        "compute": "count_per_session",
        "description": "Tool calls per Claude session",
    },
    {
        "key": "claude.error_rate.per_min",
        "logql": '{app="claude-dev-logging"} | json | level="ERROR"',
        "compute": "rate_per_min",
        "description": "Claude session ERROR rate (per minute)",
    },
]

# Known T0 alert thresholds for recommendation comparison
# Format: alert_name → (baseline_key, window_minutes, current_threshold)
# NOTE(review): baseline keys carry mixed units (".per_min" vs ".per_hour") —
# consumers must normalize units before scaling by window_minutes; verify.
_ALERT_MAPPINGS = [
    ("error-spike-general", "sim_steward.error_rate.per_min", 10, 10),
    ("claude-error-spike", "claude.error_rate.per_min", 5, 5),
    ("websocket-disconnect-spike", "sim_steward.websocket_disconnect.per_hour", 5, 3),
]
+
+
class BaselineManager:
    """Computes and persists rolling baselines and serves them to T1/T3.

    Lifecycle (per the module docstring):
      * T3 calls compute_and_save() to recompute metrics over a Loki window.
      * T1 calls load() + get_prompt_context() to inject baselines into its prompt.
      * T3 calls get_threshold_recommendations() for T0 alert calibration advice.

    No ML — just counts, rates, and means over Loki query results.
    """

    def __init__(self, loki: LokiClient, baseline_path: str = DEFAULT_PATH):
        self.loki = loki
        self.path = baseline_path
        # In-memory mirror of baselines.json: metric key -> rounded float value.
        self._cache: dict = {}

    def load(self) -> dict:
        """Load baselines.json from disk. Returns empty dict if not found."""
        try:
            if os.path.exists(self.path):
                with open(self.path) as f:
                    self._cache = json.load(f)
                logger.info("Loaded baselines from %s (%d metrics)", self.path, len(self._cache))
            else:
                logger.info("No baselines.json at %s — starting fresh", self.path)
                self._cache = {}
        except Exception as e:
            # A corrupt/unreadable file degrades to "no baselines" rather than
            # crashing the caller's cycle.
            logger.warning("Failed to load baselines: %s", e)
            self._cache = {}
        return self._cache

    def compute_and_save(self, lookback_sec: int = 86400) -> dict:
        """
        Query Loki over the lookback window, compute rolling metrics, write baselines.json.
        Preserves existing values for metrics where no new data is found.

        Returns the full (merged) baseline dict that was persisted.
        """
        end_ns = self.loki.now_ns()
        start_ns = end_ns - lookback_sec * 1_000_000_000
        # Start from the cached values so metrics with no fresh data survive.
        updated = dict(self._cache)
        computed_count = 0

        for metric in _METRICS:
            try:
                value = self._compute_metric(metric, start_ns, end_ns, lookback_sec)
                if value is not None:
                    updated[metric["key"]] = round(value, 4)
                    computed_count += 1
                    logger.debug("Baseline %s = %.4f", metric["key"], value)
            except Exception as e:
                # One failing metric must not block the rest.
                logger.warning("Baseline compute failed for %s: %s", metric["key"], e)

        # Persist
        try:
            dirpath = os.path.dirname(os.path.abspath(self.path))
            os.makedirs(dirpath, exist_ok=True)
            with open(self.path, "w") as f:
                json.dump(updated, f, indent=2)
            self._cache = updated
            logger.info(
                "Baselines saved to %s (%d computed, %d total)",
                self.path, computed_count, len(updated),
            )
        except Exception as e:
            # Save failure is non-fatal: the in-memory values from this run are
            # still returned, but _cache is only promoted on successful write.
            logger.warning("Failed to save baselines: %s", e)

        return updated

    def get_prompt_context(self) -> str:
        """Format baseline values for injection into T1 LLM prompt."""
        if not self._cache:
            return "(no baseline data available yet — first run or no historical data)"

        lines = ["Historical baseline for this system (use these to judge what is anomalous):"]
        for key, value in sorted(self._cache.items()):
            # Prefer the human description from _METRICS; fall back to a
            # readable form of the raw key for unknown/legacy entries.
            metric = next((m for m in _METRICS if m["key"] == key), None)
            description = metric["description"] if metric else key.replace(".", " | ").replace("_", " ")
            lines.append(f" {description}: {value}")
        lines.append(
            "Flag metrics that exceed baselines by 3x or more as anomalous. "
            "Use these values to calibrate 'high', 'normal', and 'low' thresholds."
        )
        return "\n".join(lines)

    def get_threshold_recommendations(self) -> list[dict]:
        """
        Compare computed baselines against known T0 alert thresholds.
        Returns recommendation dicts for alerts that appear mis-calibrated.
        Emitted by T3 as sentinel_threshold_recommendation events.
        """
        if not self._cache:
            return []

        recommendations = []
        for alert_name, baseline_key, window_minutes, current_threshold in _ALERT_MAPPINGS:
            baseline_val = self._cache.get(baseline_key)
            if baseline_val is None:
                continue

            # BUG FIX: ".per_hour" baselines are hourly rates — normalize to
            # per-minute before scaling by the alert window, otherwise the
            # suggested threshold is inflated ~60x.
            per_min = baseline_val / 60.0 if baseline_key.endswith(".per_hour") else baseline_val

            # Suggested threshold: 5x the baseline rate scaled to the alert window
            suggested = round(per_min * window_minutes * 5, 1)
            if suggested <= 0:
                continue

            delta_pct = abs(suggested - current_threshold) / max(current_threshold, 0.001)
            if delta_pct < 0.25:
                continue  # Less than 25% difference — not worth recommending

            recommendations.append({
                "alert": alert_name,
                "current_threshold": current_threshold,
                "suggested_threshold": suggested,
                "basis": (
                    f"{baseline_key}={per_min:.3f}/min × {window_minutes}min window × 5x safety margin"
                ),
                # Confidence grows with the size of the discrepancy, capped at 0.9.
                "confidence": min(0.9, 0.5 + delta_pct * 0.2),
                "direction": "lower" if suggested < current_threshold else "higher",
            })

        return recommendations

    # ── Private ───────────────────────────────────────────────────────────

    def _compute_metric(
        self, metric: dict, start_ns: int, end_ns: int, lookback_sec: int
    ) -> float | None:
        """Compute one metric's value from its Loki lines, or None if no data.

        NOTE: query_lines is capped at 1000 lines, so rates on very busy
        streams are a floor, not an exact value.
        """
        lines = self.loki.query_lines(metric["logql"], start_ns, end_ns, limit=1000)
        if not lines:
            return None

        compute = metric.get("compute", "count")

        if compute == "rate_per_min":
            minutes = lookback_sec / 60
            return len(lines) / minutes if minutes > 0 else None

        elif compute == "rate_per_hour":
            hours = lookback_sec / 3600
            return len(lines) / hours if hours > 0 else None

        elif compute == "count_per_session":
            # Group by session_id, compute mean count per session
            sessions: dict[str, int] = {}
            no_session = 0
            for line in lines:
                sid = line.get("session_id")
                if sid:
                    sessions[sid] = sessions.get(sid, 0) + 1
                else:
                    no_session += 1
            if sessions:
                return statistics.mean(sessions.values())
            # Fallback: total / estimated sessions (assume 1 session per hour)
            estimated_sessions = max(1, lookback_sec / 3600)
            return len(lines) / estimated_sessions

        elif compute == "field_mean":
            field = metric.get("field", "")
            values = []
            for line in lines:
                v = line.get(field)
                try:
                    values.append(float(v))
                except (TypeError, ValueError):
                    # Skip missing/non-numeric samples rather than failing the metric.
                    pass
            return statistics.mean(values) if values else None

        else:
            # Unknown strategy: fall back to a raw line count.
            return float(len(lines))
diff --git a/observability/local/log-sentinel/circuit_breaker.py b/observability/local/log-sentinel/circuit_breaker.py
new file mode 100644
index 0000000..cedec1d
--- /dev/null
+++ b/observability/local/log-sentinel/circuit_breaker.py
@@ -0,0 +1,51 @@
+"""Circuit breaker for dependency health (Loki, Ollama)."""
+
+import logging
+import time
+
logger = logging.getLogger("sentinel.circuit")


class CircuitBreaker:
    """Classic three-state breaker guarding a flaky dependency.

    closed    — calls flow normally.
    open      — too many consecutive failures; calls are refused until the
                backoff window has elapsed.
    half_open — backoff elapsed; a single probe call is let through. Success
                closes the breaker, failure re-opens it.
    """

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"

    def __init__(self, name: str, failure_threshold: int = 3, backoff_sec: int = 60):
        self.name = name
        self.failure_threshold = failure_threshold
        self.backoff_sec = backoff_sec
        self.state = self.CLOSED
        self.consecutive_failures = 0
        self.last_failure_time = 0.0

    def allow_request(self) -> bool:
        """Return True if a call may proceed right now."""
        if self.state == self.OPEN:
            elapsed = time.time() - self.last_failure_time
            if elapsed < self.backoff_sec:
                return False
            # Backoff served — move to half-open and let one probe through.
            self.state = self.HALF_OPEN
            logger.info("Circuit %s half-open, trying one request", self.name)
            return True
        # CLOSED always allows; HALF_OPEN allows the (single) probe request.
        return True

    def record_success(self):
        """Reset the breaker after a successful call."""
        if self.state != self.CLOSED:
            logger.info("Circuit %s closed (recovered)", self.name)
        self.state = self.CLOSED
        self.consecutive_failures = 0

    def record_failure(self):
        """Count a failure; trip to OPEN once the threshold is reached."""
        self.consecutive_failures += 1
        self.last_failure_time = time.time()
        if self.consecutive_failures < self.failure_threshold:
            return
        if self.state != self.OPEN:
            logger.warning(
                "Circuit %s OPEN after %d failures, backing off %ds",
                self.name, self.consecutive_failures, self.backoff_sec,
            )
        self.state = self.OPEN
diff --git a/observability/local/log-sentinel/config.py b/observability/local/log-sentinel/config.py
new file mode 100644
index 0000000..2ea06a1
--- /dev/null
+++ b/observability/local/log-sentinel/config.py
@@ -0,0 +1,35 @@
+"""Configuration from environment variables."""
+
+import os
+
+
class Config:
    """Process-wide configuration, resolved once from environment variables."""

    def __init__(self):
        env = os.environ.get

        def _int_env(name: str, default: str) -> int:
            # Numeric knobs arrive as strings; defaults mirror the documented values.
            return int(env(name, default))

        self.loki_url = env("LOKI_URL", "http://loki:3100")
        self.grafana_url = env("GRAFANA_URL", "http://grafana:3000")
        self.grafana_user = env("GRAFANA_USER", "admin")
        self.grafana_password = env("GRAFANA_PASSWORD", "admin")
        self.ollama_url = env("OLLAMA_URL", "http://host.docker.internal:11434")
        self.ollama_model_fast = env("OLLAMA_MODEL_FAST", "qwen3:8b")
        self.ollama_model_deep = env("OLLAMA_MODEL_DEEP", "qwen3:32b")
        self.poll_interval_sec = _int_env("SENTINEL_POLL_INTERVAL_SEC", "60")
        self.lookback_sec = _int_env("SENTINEL_LOOKBACK_SEC", "300")
        self.t2_enabled = env("SENTINEL_T2_ENABLED", "true").lower() == "true"
        self.t2_proactive_interval_sec = _int_env("SENTINEL_T2_PROACTIVE_INTERVAL_SEC", "300")
        self.dedup_window_sec = _int_env("SENTINEL_DEDUP_WINDOW_SEC", "300")
        self.env_label = env("SIMSTEWARD_LOG_ENV", "local")
        self.sentry_dsn = env("SENTINEL_SENTRY_DSN", "")
        # v3 additions
        self.sentinel_mode = env("SENTINEL_MODE", "dev")  # "dev" | "prod"
        self.t1_interval_sec = _int_env("SENTINEL_T1_INTERVAL_SEC", "300")  # 5 min
        self.t2_interval_sec = _int_env("SENTINEL_T2_INTERVAL_SEC", "900")  # 15 min
        self.t3_interval_sec = _int_env("SENTINEL_T3_INTERVAL_SEC", "7200")  # 2h (dev default)
        self.merge_window_sec = _int_env("SENTINEL_MERGE_WINDOW_SEC", "10")  # T0 batch window
        self.sentry_auth_token = env("SENTRY_AUTH_TOKEN", "")
        self.sentry_org = env("SENTRY_ORG", "")
        self.sentry_project = env("SENTRY_PROJECT", "")
        self.baseline_path = env("SENTINEL_BASELINE_PATH", "/data/baselines.json")

    @classmethod
    def from_env(cls):
        """Factory alias — construction already reads the environment."""
        return cls()
diff --git a/observability/local/log-sentinel/evidence.py b/observability/local/log-sentinel/evidence.py
new file mode 100644
index 0000000..1a9199a
--- /dev/null
+++ b/observability/local/log-sentinel/evidence.py
@@ -0,0 +1,235 @@
+"""Evidence packet model — pre-assembles log context for T2 consumption.
+
+T1 identifies an anomaly, then EvidenceBuilder:
+ 1. Finds which feature invocations contain the anomalous signal
+ 2. Builds a targeted LogQL query
+ 3. Pre-fetches up to 50 related log lines from Loki
+ 4. Packages everything into an EvidencePacket ready for T2
+
+T2 receives EvidencePackets — it reasons over pre-assembled evidence,
+not raw Loki queries. This dramatically improves T2 output quality.
+"""
+
+import logging
+import time
+import uuid
+from dataclasses import dataclass, field
+
+from loki_client import LokiClient
+from trace import FeatureInvocation
+
+logger = logging.getLogger("sentinel.evidence")
+
+_MAX_LOG_LINES = 50
+
+
@dataclass
class EvidencePacket:
    """Pre-assembled context for one T1 anomaly, ready for T2 reasoning.

    Built by EvidenceBuilder; serialized to Loki via to_loki_dict() and
    rendered for the T2 LLM prompt via to_prompt_text().
    """

    anomaly_id: str
    anomaly_description: str
    severity: str  # "info" | "warn" | "critical"
    detector_stream: str  # which stream flagged it
    invocations: list[FeatureInvocation]  # invocations containing the anomaly
    related_log_lines: list[dict]  # pre-fetched raw log lines (capped at 50)
    suggested_logql: str  # T1's suggested query for T2 to refine
    t1_hypothesis: str  # T1's one-sentence best-guess root cause
    t1_confidence: float  # 0.0 to 1.0
    assembled_at_ns: int  # unix epoch nanoseconds when the packet was built
    logql_used: str  # the actual query used to fetch related_log_lines

    def to_loki_dict(self) -> dict:
        """Serializable dict for push to Loki as sentinel_evidence_packet event."""
        return {
            "event": "sentinel_evidence_packet",
            "component": "log-sentinel",
            "domain": "system",
            # warn/critical packets surface as WARN so they stand out in queries.
            "level": "WARN" if self.severity in ("warn", "critical") else "INFO",
            "message": f"[{self.severity.upper()}] {self.anomaly_description[:120]}",
            "anomaly_id": self.anomaly_id,
            "anomaly_description": self.anomaly_description,
            "severity": self.severity,
            "detector_stream": self.detector_stream,
            "t1_hypothesis": self.t1_hypothesis,
            "t1_confidence": self.t1_confidence,
            "suggested_logql": self.suggested_logql,
            "logql_used": self.logql_used,
            "related_lines_count": len(self.related_log_lines),
            "invocation_count": len(self.invocations),
            "invocation_ids": [inv.invocation_id for inv in self.invocations],
            # De-duplicated set of action types across affected invocations.
            "action_types": list({inv.action_type for inv in self.invocations}),
            "assembled_at_ns": self.assembled_at_ns,
        }

    def to_prompt_text(self) -> str:
        """Format evidence packet as text block for LLM (T2) consumption."""
        lines = [
            f"=== EVIDENCE PACKET {self.anomaly_id} ===",
            f"Severity: {self.severity.upper()}",
            f"Stream: {self.detector_stream}",
            f"Anomaly: {self.anomaly_description}",
            f"T1 hypothesis: {self.t1_hypothesis or '(none)'}",
            f"T1 confidence: {self.t1_confidence:.0%}",
            "",
        ]

        if self.invocations:
            # Only the first 5 invocations are rendered to keep the prompt small.
            lines.append(f"Affected invocations ({len(self.invocations)}):")
            for inv in self.invocations[:5]:
                # success is a tri-state: False → FAILED, truthy → OK, None/falsy → "?"
                status = "FAILED" if inv.success is False else ("OK" if inv.success else "?")
                lines.append(
                    f" [{status}] {inv.action_type} via {inv.correlation_method} "
                    f"({inv.duration_ms}ms, {len(inv.events)} events)"
                )
                if inv.error:
                    lines.append(f" error: {inv.error}")
            lines.append("")

        if self.related_log_lines:
            lines.append(f"Related log lines ({len(self.related_log_lines)}, capped at {_MAX_LOG_LINES}):")
            for log in self.related_log_lines[:_MAX_LOG_LINES]:
                # [:19] trims an ISO timestamp to second precision.
                ts = log.get("timestamp", "")[:19]
                evt = log.get("event", log.get("message", ""))[:60]
                lvl = log.get("level", "")
                err = log.get("error", "")
                suffix = f" error={err[:60]}" if err else ""
                lines.append(f" {ts} [{lvl}] {evt}{suffix}")
            lines.append("")

        lines.append(f"Suggested LogQL for deeper investigation: {self.suggested_logql}")
        return "\n".join(lines)
+
+
class EvidenceBuilder:
    """Turns T1 anomaly signals plus feature invocations into EvidencePackets.

    For each anomaly it selects the invocations most likely involved, derives
    a targeted LogQL query, pre-fetches related log lines from Loki, and
    bundles everything for T2.
    """

    def __init__(self, loki: LokiClient):
        self.loki = loki

    def build(
        self,
        anomaly: dict,
        invocations: list[FeatureInvocation],
        start_ns: int,
        end_ns: int,
    ) -> EvidencePacket:
        """
        Build an EvidencePacket for a single T1 anomaly.

        anomaly dict shape (from T1 LLM output):
            id, description, severity, stream, event_type,
            hypothesis, confidence, suggested_logql, trace_id
        """
        packet_id = anomaly.get("id") or str(uuid.uuid4())[:8]
        stream = anomaly.get("stream", "sim-steward")
        event_type = anomaly.get("event_type", "")

        affected = self._find_relevant_invocations(anomaly, invocations)
        fetch_query = self._build_logql(anomaly, affected, stream, event_type)

        try:
            fetched = self.loki.query_lines(fetch_query, start_ns, end_ns, limit=_MAX_LOG_LINES)
        except Exception as e:
            # Evidence fetch is best-effort: an empty packet is still useful.
            logger.warning("EvidenceBuilder Loki query failed: %s", e)
            fetched = []

        return EvidencePacket(
            anomaly_id=packet_id,
            anomaly_description=anomaly.get("description", anomaly.get("title", "")),
            severity=anomaly.get("severity", "warn"),
            detector_stream=stream,
            invocations=affected,
            related_log_lines=fetched,
            suggested_logql=anomaly.get("suggested_logql") or fetch_query,
            t1_hypothesis=anomaly.get("hypothesis", ""),
            t1_confidence=float(anomaly.get("confidence", 0.5)),
            assembled_at_ns=int(time.time() * 1e9),
            logql_used=fetch_query,
        )

    def build_many(
        self,
        anomalies: list[dict],
        invocations: list[FeatureInvocation],
        start_ns: int,
        end_ns: int,
    ) -> list[EvidencePacket]:
        """Build evidence packets for all anomalies. Skips on error."""
        packets = []
        for anomaly in anomalies:
            try:
                packets.append(self.build(anomaly, invocations, start_ns, end_ns))
            except Exception as e:
                logger.warning("Failed to build evidence for anomaly %s: %s", anomaly.get("id", "?"), e)
        return packets

    # ── Private ───────────────────────────────────────────────────────────

    def _find_relevant_invocations(
        self, anomaly: dict, invocations: list[FeatureInvocation]
    ) -> list[FeatureInvocation]:
        """Select invocations matching this anomaly, best evidence first.

        Tier 1: exact trace_id match.
        Tier 2: any contained event matching the anomaly's stream/event type,
                or any errored event when the anomaly is critical.
        Tier 3: failed invocations (up to 3).
        Fallback: the first 3 invocations.
        """
        trace_id = anomaly.get("trace_id")
        if trace_id:
            exact = [inv for inv in invocations if inv.invocation_id == trace_id]
            if exact:
                return exact

        want_event = anomaly.get("event_type", "")
        want_stream = anomaly.get("stream", "")
        is_critical = anomaly.get("severity", "") == "critical"

        def _event_hits(ev) -> bool:
            if want_stream and ev.stream == want_stream:
                return True
            if want_event and ev.event_type == want_event:
                return True
            return is_critical and bool(
                ev.raw.get("level", "").upper() == "ERROR" or ev.raw.get("error")
            )

        matched = [inv for inv in invocations if any(_event_hits(ev) for ev in inv.events)]
        if matched:
            return matched

        failed = [inv for inv in invocations if inv.success is False]
        if failed:
            return failed[:3]

        return invocations[:3]

    def _build_logql(
        self,
        anomaly: dict,
        invocations: list[FeatureInvocation],
        stream: str,
        event_type: str,
    ) -> str:
        """Build a targeted LogQL query for fetching related log lines."""
        # Prefer a trace_id query, but only when it is unambiguous.
        traced = [
            inv.invocation_id
            for inv in invocations
            if inv.correlation_method == "trace_id"
        ]
        if len(traced) == 1:
            return f'{{app="{stream}"}} | json | trace_id="{traced[0]}"'

        if event_type:
            return f'{{app="{stream}"}} | json | event="{event_type}"'

        if anomaly.get("severity", "warn") == "critical":
            return f'{{app="{stream}"}} | json | level="ERROR"'

        return f'{{app="{stream}"}} | json'
diff --git a/observability/local/log-sentinel/grafana_client.py b/observability/local/log-sentinel/grafana_client.py
new file mode 100644
index 0000000..cc065c5
--- /dev/null
+++ b/observability/local/log-sentinel/grafana_client.py
@@ -0,0 +1,64 @@
+"""Grafana HTTP API client for annotations."""
+
+import logging
+import time
+
+import requests
+
logger = logging.getLogger("sentinel.grafana")


class GrafanaClient:
    """Minimal Grafana HTTP API client used to drop annotations onto dashboards.

    All methods are fire-and-forget: any failure is logged at DEBUG and
    swallowed so annotation problems can never break a sentinel cycle.
    """

    def __init__(self, base_url: str, user: str = "admin", password: str = "admin"):
        self.base_url = base_url.rstrip("/")
        self.auth = (user, password)

    def annotate(self, finding):
        """Annotate a T1 finding. Tags carry detector/severity/category so
        dashboards can filter annotations."""
        try:
            requests.post(
                f"{self.base_url}/api/annotations",
                auth=self.auth,
                json={
                    "time": int(time.time() * 1000),  # Grafana expects epoch millis
                    "tags": ["log-sentinel", finding.detector, finding.severity, finding.category],
                    # BUG FIX: title/summary were separated by a raw line break
                    # inside the literal (invalid syntax); use an escaped \n.
                    "text": f"[{finding.severity.upper()}] {finding.title}\n{finding.summary}",
                },
                timeout=5,
            )
        except Exception as e:
            logger.debug("Grafana annotation error: %s", e)

    def annotate_investigation(self, investigation):
        """Annotate a T2 investigation result with root cause + recommendation."""
        try:
            requests.post(
                f"{self.base_url}/api/annotations",
                auth=self.auth,
                json={
                    "time": int(time.time() * 1000),
                    "tags": ["log-sentinel", "investigation", investigation.finding.detector, investigation.confidence, investigation.trigger],
                    # BUG FIX: raw line breaks inside the f-string replaced
                    # with escaped \n separators.
                    "text": (
                        f"Investigation: {investigation.finding.title}\n"
                        f"Root cause: {investigation.root_cause}\n"
                        f"Recommendation: {investigation.recommendation}\n"
                        f"Confidence: {investigation.confidence} | Model: {investigation.model} | Type: {investigation.issue_type}"
                    ),
                },
                timeout=5,
            )
        except Exception as e:
            logger.debug("Grafana investigation annotation error: %s", e)

    def annotate_raw(self, title: str, text: str, tags: list[str]):
        """Generic annotation with caller-supplied title/text/tags (falsy tags dropped)."""
        try:
            requests.post(
                f"{self.base_url}/api/annotations",
                auth=self.auth,
                json={
                    "time": int(time.time() * 1000),
                    "tags": ["log-sentinel"] + [t for t in tags if t],
                    # BUG FIX: escaped \n instead of a literal line break.
                    "text": f"{title}\n{text}",
                },
                timeout=5,
            )
        except Exception as e:
            logger.debug("Grafana annotate_raw error: %s", e)
diff --git a/observability/local/log-sentinel/loki_client.py b/observability/local/log-sentinel/loki_client.py
new file mode 100644
index 0000000..7de2c9e
--- /dev/null
+++ b/observability/local/log-sentinel/loki_client.py
@@ -0,0 +1,330 @@
+"""Loki HTTP API client — query + push, with structured sentinel event helpers."""
+
+import json
+import logging
+import time
+from datetime import datetime, timezone
+
+import requests
+
logger = logging.getLogger("sentinel.loki")


class LokiClient:
    """Loki HTTP API client: query_range reads, push writes, and a family of
    `push_*` helpers that emit the sentinel's structured events.

    All network calls are best-effort: query errors return a sentinel value
    (-1 / empty list) and push errors are logged and dropped.
    """

    def __init__(self, base_url: str, timeout: int = 5):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    # ── Time helpers ──

    @staticmethod
    def now_ns() -> int:
        """Current UTC time as epoch nanoseconds (Loki's native timestamp unit)."""
        return int(datetime.now(timezone.utc).timestamp() * 1e9)

    @staticmethod
    def now_minus_ms(offset_ms: int) -> int:
        """Epoch nanoseconds for `offset_ms` milliseconds before now."""
        return int((datetime.now(timezone.utc).timestamp() * 1000 - offset_ms) * 1e6)

    # ── Query API ──

    def count(self, logql: str, start_ns: int, end_ns: int) -> int:
        """Count matching log lines in [start_ns, end_ns]. Returns -1 on any
        error so callers can distinguish 'no data' (0) from 'query failed'.
        Capped at 1000 lines per the query limit."""
        try:
            resp = requests.get(
                f"{self.base_url}/loki/api/v1/query_range",
                params={"query": logql, "start": str(start_ns), "end": str(end_ns), "limit": 1000, "direction": "forward"},
                timeout=self.timeout,
            )
            if resp.status_code != 200:
                return -1
            total = 0
            for stream in resp.json().get("data", {}).get("result", []):
                total += len(stream.get("values", []))
            return total
        except Exception as e:
            logger.warning("Loki count error: %s", e)
            return -1

    def query_lines(self, logql: str, start_ns: int, end_ns: int, limit: int = 1000) -> list[dict]:
        """Fetch matching log lines as parsed JSON dicts (non-JSON lines are
        skipped). Returns [] on any error."""
        try:
            resp = requests.get(
                f"{self.base_url}/loki/api/v1/query_range",
                params={"query": logql, "start": str(start_ns), "end": str(end_ns), "limit": limit, "direction": "forward"},
                timeout=self.timeout,
            )
            if resp.status_code != 200:
                return []
            lines = []
            for stream in resp.json().get("data", {}).get("result", []):
                for pair in stream.get("values", []):
                    if len(pair) >= 2:
                        try:
                            lines.append(json.loads(pair[1]))
                        except (json.JSONDecodeError, TypeError):
                            pass
            return lines
        except Exception as e:
            # FIX: previously a bare silent swallow — log at debug like the
            # other best-effort paths so failures are diagnosable.
            logger.debug("Loki query_lines error: %s", e)
            return []

    # ── Push API ──

    def push(self, entry: dict, env: str = "local"):
        """Push a single log entry to Loki. Fire-and-forget."""
        try:
            ts_ns = str(int(time.time() * 1e9))
            stream_labels = {"app": "sim-steward", "env": env, "level": entry.get("level", "INFO")}
            # Promote selected fields to stream labels for LogQL filtering.
            for key in ("component", "event", "domain"):
                val = entry.get(key)
                if val:
                    stream_labels[key] = val
            payload = {"streams": [{"stream": stream_labels, "values": [[ts_ns, json.dumps(entry)]]}]}
            requests.post(f"{self.base_url}/loki/api/v1/push", json=payload, timeout=3)
        except Exception as e:
            logger.debug("Loki push error: %s", e)

    # ── Sentinel event helpers ──

    def push_finding(self, finding, env: str = "local"):
        """Emit sentinel_finding for a T1 detector finding (evidence fields inlined)."""
        entry = {
            "level": "WARN" if finding.severity in ("warn", "critical") else "INFO",
            "message": finding.title,
            "timestamp": finding.timestamp,
            "component": "log-sentinel",
            "event": "sentinel_finding",
            "domain": "system",
            "finding_id": finding.finding_id,
            "detector": finding.detector,
            "category": finding.category,
            "severity": finding.severity,
            "title": finding.title,
            "summary": finding.summary,
            "fingerprint": finding.fingerprint,
            "escalated_to_t2": finding.escalate_to_t2,
            "logql_query": finding.logql_query,
            "flow_context": finding.flow_context,
            **finding.evidence,
        }
        self.push(entry, env)

    def push_investigation(self, investigation, env: str = "local"):
        """Emit sentinel_investigation for a legacy (v2) investigation object."""
        entry = {
            "level": "INFO",
            "message": f"Investigation: {investigation.root_cause[:120]}",
            "timestamp": investigation.timestamp,
            "component": "log-sentinel",
            "event": "sentinel_investigation",
            "domain": "system",
            "investigation_id": investigation.investigation_id,
            "finding_id": investigation.finding.finding_id,
            "detector": investigation.finding.detector,
            "category": investigation.finding.category,
            "trigger": investigation.trigger,
            "model": investigation.model,
            "confidence": investigation.confidence,
            "issue_type": investigation.issue_type,
            "root_cause": investigation.root_cause,
            "correlation": investigation.correlation,
            "impact": investigation.impact,
            "recommendation": investigation.recommendation,
            "inference_duration_ms": investigation.inference_duration_ms,
            "gather_duration_ms": investigation.gather_duration_ms,
            "context_lines_gathered": investigation.context_lines_gathered,
        }
        self.push(entry, env)

    def push_cycle(self, cycle_data: dict, env: str = "local"):
        """Emit sentinel_cycle summarizing one detection cycle.
        Requires cycle_data["cycle_num"]; anomaly/finding counts are optional."""
        anomaly_count = cycle_data.get("anomaly_count", cycle_data.get("finding_count", 0))
        entry = {
            "level": "INFO",
            "message": f"Cycle #{cycle_data['cycle_num']}: {anomaly_count} anomalies",
            "component": "log-sentinel",
            "event": "sentinel_cycle",
            "domain": "system",
            **cycle_data,
        }
        self.push(entry, env)

    def push_detector_run(self, run_data: dict, env: str = "local"):
        """Emit sentinel_detector_run; level is ERROR when the run errored."""
        entry = {
            "level": "ERROR" if run_data.get("error") else "INFO",
            "message": f"Detector {run_data['detector']}: {run_data['finding_count']} findings in {run_data['duration_ms']}ms",
            "component": "log-sentinel",
            "event": "sentinel_detector_run",
            "domain": "system",
            **run_data,
        }
        self.push(entry, env)

    def push_t2_run(self, t2_data: dict, env: str = "local"):
        """Emit sentinel_t2_run for a T2 model invocation."""
        entry = {
            "level": "INFO",
            "message": f"T2 {t2_data['tier']}: {t2_data['model']} confidence={t2_data.get('confidence', '?')} in {t2_data.get('total_duration_ms', '?')}ms",
            "component": "log-sentinel",
            "event": "sentinel_t2_run",
            "domain": "system",
            **t2_data,
        }
        self.push(entry, env)

    def push_analyst_run(self, run_data: dict, env: str = "local"):
        """Emit sentinel_analyst_run (tier defaults to t1)."""
        tier = run_data.get("tier", "t1")
        entry = {
            "level": "INFO",
            "message": f"Analyst {tier}: model={run_data.get('model','?')} anomalies={run_data.get('anomaly_count', run_data.get('logql_queries_generated', '?'))} duration={run_data.get('duration_ms','?')}ms",
            "component": "log-sentinel",
            "event": "sentinel_analyst_run",
            "domain": "system",
            **run_data,
        }
        self.push(entry, env)

    def push_timeline(self, timeline_data: dict, env: str = "local"):
        """Emit sentinel_timeline_built with event/session counts."""
        entry = {
            "level": "INFO",
            "message": f"Timeline: {timeline_data.get('event_count', 0)} events, {timeline_data.get('session_count', 0)} sessions",
            "component": "log-sentinel",
            "event": "sentinel_timeline_built",
            "domain": "system",
            **timeline_data,
        }
        self.push(entry, env)

    def push_investigation_v2(self, t2_result, anomalies: list, env: str = "local"):
        """Emit sentinel_investigation from a T2Result plus its source anomalies.
        FIX: dropped the unused per-call `from analyst import T2Result` — it did
        nothing and would raise before pushing if `analyst` failed to import."""
        entry = {
            "level": "INFO",
            "message": f"Investigation [{t2_result.confidence}]: {t2_result.root_cause[:120]}",
            "component": "log-sentinel",
            "event": "sentinel_investigation",
            "domain": "system",
            "anomaly_ids": [a.get("id", "") for a in anomalies if a.get("needs_t2")],
            "root_cause": t2_result.root_cause,
            "issue_type": t2_result.issue_type,
            "confidence": t2_result.confidence,
            "correlation": t2_result.correlation,
            "impact": t2_result.impact,
            "recommendation": t2_result.recommendation,
            "logql_queries_used": t2_result.logql_queries_used,
            "logql_gather_duration_ms": t2_result.logql_gather_duration_ms,
            "inference_duration_ms": t2_result.inference_duration_ms,
            "sentry_worthy": t2_result.sentry_worthy,
            "model": t2_result.model,
        }
        self.push(entry, env)

    def annotate_raw(self, *args, **kwargs):
        """Stub — annotate_raw is called on grafana_client, not loki_client."""
        pass

    def push_sentry_event(self, sentry_data: dict, env: str = "local"):
        """Emit sentinel_sentry_issue mirroring a Sentry issue into Loki."""
        entry = {
            "level": "INFO",
            "message": f"Sentry issue: {sentry_data.get('title', '?')[:100]}",
            "component": "log-sentinel",
            "event": "sentinel_sentry_issue",
            "domain": "system",
            **sentry_data,
        }
        self.push(entry, env)

    # ── v3 push helpers ──────────────────────────────────────────────────────

    def push_evidence_packet(self, packet, env: str = "local"):
        """Push sentinel_evidence_packet — T1's pre-assembled anomaly context."""
        entry = packet.to_loki_dict()
        self.push(entry, env)

    def push_t2_investigation(self, t2_result, packet_dicts: list, env: str = "local"):
        """Push sentinel_t2_investigation — T2's investigation result."""
        entry = {
            "level": "INFO",
            "message": f"T2 investigation [{t2_result.confidence}]: {t2_result.root_cause[:120]}",
            "component": "log-sentinel",
            "event": "sentinel_t2_investigation",
            "domain": "system",
            "root_cause": t2_result.root_cause,
            "issue_type": t2_result.issue_type,
            "confidence": t2_result.confidence,
            "correlation": t2_result.correlation,
            "impact": t2_result.impact,
            "recommendation": t2_result.recommendation,
            "sentry_worthy": t2_result.sentry_worthy,
            "sentry_fingerprint": t2_result.sentry_fingerprint,
            "sentry_event_id": t2_result.sentry_event_id or "",
            "evidence_packet_count": t2_result.evidence_packet_count,
            "anomaly_ids": [p.get("anomaly_id", "") for p in packet_dicts],
            "logql_queries_used": t2_result.logql_queries_used,
            "logql_gather_duration_ms": t2_result.logql_gather_duration_ms,
            "inference_duration_ms": t2_result.inference_duration_ms,
            "model": t2_result.model,
        }
        self.push(entry, env)

    def push_synthesis(self, t3_result, trigger: str = "scheduled", env: str = "local"):
        """Push sentinel_synthesis — T3's period synthesis summary.
        Long free-text fields are truncated to keep entries bounded."""
        entry = {
            "level": "INFO",
            "message": f"T3 synthesis [{trigger}]: {t3_result.sessions_analyzed} sessions, "
                       f"{len(t3_result.recurring_patterns)} patterns",
            "component": "log-sentinel",
            "event": "sentinel_synthesis",
            "domain": "system",
            "trigger": trigger,
            "period_summary": t3_result.period_summary[:500],
            "sessions_analyzed": t3_result.sessions_analyzed,
            "features_worked": t3_result.features_worked,
            "features_failed": t3_result.features_failed,
            "recurring_pattern_count": len(t3_result.recurring_patterns),
            "regression_detected": t3_result.regression_detected,
            "regression_detail": t3_result.regression_detail[:200],
            "action_items": t3_result.action_items[:5],
            "baselines_updated": t3_result.baselines_updated,
            "threshold_recommendation_count": len(t3_result.threshold_recommendations),
            "model": t3_result.model,
            "inference_duration_ms": t3_result.inference_duration_ms,
        }
        self.push(entry, env)

    def push_narrative(self, narrative_dict: dict, env: str = "local"):
        """Push sentinel_narrative — T3's per-session story (text capped at 1000)."""
        entry = {
            "level": "INFO",
            "message": f"Session narrative: {narrative_dict.get('session_id', '?')[:12]}",
            "component": "log-sentinel",
            "event": "sentinel_narrative",
            "domain": "system",
            "session_id": narrative_dict.get("session_id", ""),
            "narrative_text": narrative_dict.get("narrative_text", "")[:1000],
            "features_worked": narrative_dict.get("features_worked", []),
            "features_failed": narrative_dict.get("features_failed", []),
            "invocation_count": narrative_dict.get("invocation_count", 0),
        }
        self.push(entry, env)

    def push_threshold_recommendation(self, rec: dict, env: str = "local"):
        """Push sentinel_threshold_recommendation — T3's threshold calibration advice."""
        entry = {
            "level": "INFO",
            "message": (
                f"Threshold recommendation: {rec.get('alert', '?')} "
                f"current={rec.get('current_threshold')} → suggested={rec.get('suggested_threshold')} "
                f"({rec.get('direction', '?')})"
            ),
            "component": "log-sentinel",
            "event": "sentinel_threshold_recommendation",
            "domain": "system",
            **rec,
        }
        self.push(entry, env)

    def push_trigger(self, alert_data: dict, env: str = "local"):
        """Push sentinel_trigger — per T0 webhook alert received."""
        entry = {
            "level": "INFO",
            "message": f"Trigger: {alert_data.get('alertname', '?')} [{alert_data.get('trigger_tier', '?')}]",
            "component": "log-sentinel",
            "event": "sentinel_trigger",
            "domain": "system",
            "trigger_source": "grafana_alert",
            **alert_data,
        }
        self.push(entry, env)
diff --git a/observability/local/log-sentinel/loki_handler.py b/observability/local/log-sentinel/loki_handler.py
new file mode 100644
index 0000000..e8a9dcd
--- /dev/null
+++ b/observability/local/log-sentinel/loki_handler.py
@@ -0,0 +1,66 @@
+"""Python logging handler that pushes log records to Loki."""
+
+import json
+import logging
+import time
+import threading
+
+import requests
+
+
+class LokiHandler(logging.Handler):
+ def __init__(self, loki_url: str, env: str = "local", flush_interval: float = 2.0):
+ super().__init__()
+ self.loki_url = loki_url.rstrip("/")
+ self.env = env
+ self.flush_interval = flush_interval
+ self._buffer = []
+ self._lock = threading.Lock()
+ self._start_flush_timer()
+
+ def _start_flush_timer(self):
+ self._timer = threading.Timer(self.flush_interval, self._flush_loop)
+ self._timer.daemon = True
+ self._timer.start()
+
+ def _flush_loop(self):
+ self._flush()
+ self._start_flush_timer()
+
+ def emit(self, record: logging.LogRecord):
+ try:
+ entry = {
+ "level": record.levelname,
+ "message": self.format(record),
+ "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime(record.created)),
+ "component": "log-sentinel",
+ "event": "sentinel_log",
+ "domain": "system",
+ "logger": record.name,
+ "func": record.funcName,
+ }
+ with self._lock:
+ self._buffer.append(entry)
+ except Exception:
+ self.handleError(record)
+
+ def _flush(self):
+ with self._lock:
+ if not self._buffer:
+ return
+ entries = self._buffer[:]
+ self._buffer.clear()
+ by_level = {}
+ for e in entries:
+ by_level.setdefault(e["level"], []).append(e)
+ streams = []
+ for level, group in by_level.items():
+ values = [[str(int(time.time() * 1e9)), json.dumps(e)] for e in group]
+ streams.append({
+ "stream": {"app": "sim-steward", "env": self.env, "level": level, "component": "log-sentinel", "event": "sentinel_log", "domain": "system"},
+ "values": values,
+ })
+ try:
+ requests.post(f"{self.loki_url}/loki/api/v1/push", json={"streams": streams}, timeout=3)
+ except Exception:
+ pass
diff --git a/observability/local/log-sentinel/narrative.py b/observability/local/log-sentinel/narrative.py
new file mode 100644
index 0000000..72e3a9b
--- /dev/null
+++ b/observability/local/log-sentinel/narrative.py
@@ -0,0 +1,214 @@
+"""Session narrative builder — used by T3 synthesis.
+
+Turns a set of FeatureInvocations + T1/T2 findings into a human-readable
+per-session story that answers: "What was the user trying to do, did it work?"
+
+Output shape (returned as text block):
+  NARRATIVE: <date> <time-range> [<session-id>]
+  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+  <2-3 sentence prose of what happened>
+
+  WORKED: <feature> · <feature>
+  FAILED: <feature> (error)
+  PATTERNS: <pattern summary>
+  ACTION: <recommendation>
+"""
+
+import logging
+from datetime import datetime, timezone
+
+from trace import FeatureInvocation
+
+logger = logging.getLogger("sentinel.narrative")
+
+
+class NarrativeBuilder:
+    """Builds human-readable per-session stories from traced feature
+    invocations plus T1 anomaly flags and T2 investigation findings."""
+
+    def build(
+        self,
+        session_id: str,
+        invocations: list[FeatureInvocation],
+        anomaly_dicts: list[dict],
+        t2_investigation_dicts: list[dict],
+        pattern_matches: list[str] | None = None,
+    ) -> str:
+        """Build a narrative text block for a single session.
+
+        Returns the formatted text described in the module docstring; if no
+        invocations were recorded, returns a short placeholder block instead.
+        """
+        if not invocations:
+            return f"NARRATIVE: session={session_id}\n (no feature invocations recorded)"
+
+        # Time range — derived from the min/max invocation timestamps (ns).
+        start_ns = min(inv.start_ts_ns for inv in invocations)
+        end_ns = max(inv.end_ts_ns for inv in invocations)
+        start_dt = datetime.fromtimestamp(start_ns / 1e9, tz=timezone.utc)
+        end_dt = datetime.fromtimestamp(end_ns / 1e9, tz=timezone.utc)
+        date_str = start_dt.strftime("%Y-%m-%d")
+        time_range = f"{start_dt.strftime('%H:%M')}–{end_dt.strftime('%H:%M')}"
+
+        # Classify worked vs failed (tri-state success flag)
+        worked = []
+        failed = []
+        for inv in invocations:
+            if inv.success is False:
+                failed.append(inv)
+            elif inv.success is True:
+                worked.append(inv)
+            # success=None (unknown) counted as neither
+
+        # Build prose summary
+        prose = self._build_prose(invocations, worked, failed, anomaly_dicts)
+
+        # Recommendation from T2 investigations or anomalies
+        action = self._extract_action(t2_investigation_dicts, anomaly_dicts)
+
+        # Pattern summary: explicit matches win; otherwise fall back to a
+        # count of warn/critical anomaly flags.
+        patterns_text = ""
+        if pattern_matches:
+            patterns_text = " · ".join(pattern_matches[:3])
+        elif _has_recurring_issue(anomaly_dicts):
+            patterns_text = f"{sum(1 for a in anomaly_dicts if a.get('severity') in ('warn', 'critical'))} anomalies flagged"
+
+        # Assemble — WORKED/FAILED/PATTERNS/ACTION lines are emitted only
+        # when they have content.
+        sep = "━" * 48
+        lines = [
+            f"NARRATIVE: {date_str} {time_range} [{session_id[:12]}]",
+            sep,
+            "",
+            prose,
+            "",
+        ]
+
+        if worked:
+            worked_str = " · ".join(_action_label(inv) for inv in _dedupe_by_type(worked))
+            lines.append(f"WORKED: {worked_str}")
+        if failed:
+            failed_str = " · ".join(
+                f"{_action_label(inv)} ({(inv.error or 'error')[:40]})"
+                for inv in _dedupe_by_type(failed)
+            )
+            lines.append(f"FAILED: {failed_str}")
+        if patterns_text:
+            lines.append(f"PATTERNS: {patterns_text}")
+        if action:
+            lines.append(f"ACTION: {action[:200]}")
+
+        return "\n".join(lines)
+
+    def build_all(
+        self,
+        invocations: list[FeatureInvocation],
+        anomaly_dicts: list[dict],
+        t2_investigation_dicts: list[dict],
+    ) -> list[dict]:
+        """Group invocations by session_id, build a narrative per session.
+
+        Returns list of dicts with keys: session_id, narrative_text, features_worked,
+        features_failed, invocation_count.
+        """
+        # Group invocations by session_id; invocations whose trigger event is
+        # missing or has no session land in the "no_session" bucket.
+        sessions: dict[str, list[FeatureInvocation]] = {}
+        for inv in invocations:
+            sid = (inv.trigger_event.session_id if inv.trigger_event else None) or "no_session"
+            sessions.setdefault(sid, []).append(inv)
+
+        results = []
+        for sid, session_invocations in sessions.items():
+            # Filter anomalies + investigations for this session
+            session_anomalies = _filter_for_session(anomaly_dicts, sid)
+            session_t2 = _filter_for_session(t2_investigation_dicts, sid)
+
+            text = self.build(
+                session_id=sid,
+                invocations=session_invocations,
+                anomaly_dicts=session_anomalies,
+                t2_investigation_dicts=session_t2,
+            )
+            worked = [inv.action_type for inv in session_invocations if inv.success is True]
+            failed = [inv.action_type for inv in session_invocations if inv.success is False]
+
+            results.append({
+                "session_id": sid,
+                "narrative_text": text,
+                "features_worked": list(dict.fromkeys(worked)),  # dedupe, order-preserving
+                "features_failed": list(dict.fromkeys(failed)),
+                "invocation_count": len(session_invocations),
+            })
+
+        return results
+
+    # ── Private ───────────────────────────────────────────────────────────────
+
+    def _build_prose(
+        self,
+        all_invocations: list[FeatureInvocation],
+        worked: list[FeatureInvocation],
+        failed: list[FeatureInvocation],
+        anomaly_dicts: list[dict],
+    ) -> str:
+        """2-3 sentence summary: top action types, pass/fail health, anomaly count."""
+        total = len(all_invocations)
+        worked_count = len(worked)
+        failed_count = len(failed)
+
+        # Action type distribution — top three by frequency.
+        type_counts: dict[str, int] = {}
+        for inv in all_invocations:
+            type_counts[inv.action_type] = type_counts.get(inv.action_type, 0) + 1
+
+        top_types = sorted(type_counts.items(), key=lambda x: x[1], reverse=True)[:3]
+        type_str = ", ".join(f"{name} (×{n})" for name, n in top_types)
+
+        health_str = (
+            "All recorded actions completed successfully."
+            if failed_count == 0
+            else f"{failed_count} of {total} action(s) failed."
+        )
+
+        anomaly_count = sum(1 for a in anomaly_dicts if a.get("severity") in ("warn", "critical"))
+        anomaly_str = f" {anomaly_count} anomaly flags were raised." if anomaly_count else ""
+
+        return (
+            f"{total} feature invocation(s) recorded: {type_str}. "
+            f"{health_str}{anomaly_str}"
+        )
+
+    def _extract_action(
+        self,
+        t2_dicts: list[dict],
+        anomaly_dicts: list[dict],
+    ) -> str:
+        """Best available recommendation, or empty string when none exists."""
+        # Prefer T2 recommendation if available (skip the generic fallback text)
+        for t2 in t2_dicts:
+            rec = t2.get("recommendation", "")
+            if rec and rec not in ("Investigate manually.", ""):
+                return rec[:200]
+        # Fall back to critical anomaly hypothesis
+        for a in anomaly_dicts:
+            if a.get("severity") == "critical" and a.get("hypothesis"):
+                return a["hypothesis"][:200]
+        return ""
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def _action_label(inv: FeatureInvocation) -> str:
+ return inv.action_type.replace("_", " ").replace("-", " ").lower()
+
+
+def _dedupe_by_type(invocations: list[FeatureInvocation]) -> list[FeatureInvocation]:
+ seen: dict[str, FeatureInvocation] = {}
+ for inv in invocations:
+ seen.setdefault(inv.action_type, inv)
+ return list(seen.values())
+
+
+def _has_recurring_issue(anomaly_dicts: list[dict]) -> bool:
+ return any(a.get("severity") in ("warn", "critical") for a in anomaly_dicts)
+
+
+def _filter_for_session(items: list[dict], session_id: str) -> list[dict]:
+ """Return items that mention this session_id, or all items if session is no_session."""
+ if session_id == "no_session":
+ return items
+ return [
+ item for item in items
+ if not item.get("session_id") or item.get("session_id") == session_id
+ ]
diff --git a/observability/local/log-sentinel/ollama_client.py b/observability/local/log-sentinel/ollama_client.py
new file mode 100644
index 0000000..4448a44
--- /dev/null
+++ b/observability/local/log-sentinel/ollama_client.py
@@ -0,0 +1,64 @@
+"""Ollama HTTP client with qwen3 /think and /no_think mode support."""
+
+import re
+import time
+import logging
+
+import requests
+
+logger = logging.getLogger("sentinel.ollama")
+
+_THINK_STRIP = re.compile(r".*?", re.DOTALL)
+
+
+class OllamaClient:
+    """Thin HTTP wrapper around an Ollama server's generate endpoint."""
+
+    def __init__(self, base_url: str, timeout: int = 300):
+        # Generous 300s default: deep "think" generations are slow.
+        self.base_url = base_url.rstrip("/")
+        self.timeout = timeout
+
+    def generate(
+        self,
+        model: str,
+        prompt: str,
+        think: bool = False,
+        temperature: float = 0.1,
+    ) -> tuple[str, int]:
+        """
+        Call Ollama /api/generate. Returns (response_text, duration_ms).
+        Prepends /think or /no_think for qwen3 models.
+        Strips <think>...</think> blocks from output before returning.
+        Raises on failure so callers can handle via circuit breaker.
+        """
+        mode_prefix = "/think\n" if think else "/no_think\n"
+        full_prompt = mode_prefix + prompt
+
+        start = time.time()
+        resp = requests.post(
+            f"{self.base_url}/api/generate",
+            json={
+                "model": model,
+                "prompt": full_prompt,
+                "stream": False,  # single blocking response, no chunk handling
+                "options": {
+                    "temperature": temperature,
+                    "num_predict": 2048,  # cap generation length
+                },
+            },
+            timeout=self.timeout,
+        )
+        duration_ms = int((time.time() - start) * 1000)
+
+        if resp.status_code != 200:
+            raise RuntimeError(f"Ollama {resp.status_code}: {resp.text[:200]}")
+
+        raw = resp.json().get("response", "")
+        # Remove the model's chain-of-thought block before returning the answer.
+        cleaned = _THINK_STRIP.sub("", raw).strip()
+        return cleaned, duration_ms
+
+    def is_available(self) -> bool:
+        """Quick availability check — GET /api/tags (any 200 counts as up)."""
+        try:
+            resp = requests.get(f"{self.base_url}/api/tags", timeout=5)
+            return resp.status_code == 200
+        except Exception:
+            return False
diff --git a/observability/local/log-sentinel/prompts.py b/observability/local/log-sentinel/prompts.py
new file mode 100644
index 0000000..08bfdb6
--- /dev/null
+++ b/observability/local/log-sentinel/prompts.py
@@ -0,0 +1,396 @@
+"""Prompt templates and structured output schemas for Log Sentinel v2/v3."""
+
+# ── Stream descriptions injected into every prompt ──────────────────────────
+
+STREAM_DESCRIPTIONS = {
+ "sim-steward": (
+ "SimHub plugin logs: iRacing session events, user actions (button clicks, "
+ "replay controls), WebSocket messages, incident detection, plugin lifecycle. "
+ "Key fields: event, domain, component, session_id, subsession_id."
+ ),
+ "claude-dev-logging": (
+ "Claude Code AI agent logs: tool calls (Read, Write, Bash, etc.), "
+ "session lifecycle, subagent activity, MCP service calls, token snapshots. "
+ "Key fields: event, hook_type, tool_name, service, session_id, duration_ms."
+ ),
+ "claude-token-metrics": (
+ "Claude Code session summaries: one entry per completed AI session. "
+ "Fields: total_input_tokens, total_output_tokens, cost_usd, model, effort, "
+ "assistant_turns, tool_use_count, session_id."
+ ),
+}
+
+# ── T1 prompts ───────────────────────────────────────────────────────────────
+
+T1_SYSTEM = """\
+You are a log analyst for a SimHub iRacing plugin system that integrates with an AI coding assistant.
+You analyze structured JSON logs from three streams to identify what happened and what looks wrong.
+
+Stream guide:
+{stream_guide}
+
+Always respond with valid JSON only. No markdown, no explanation outside the JSON object.\
+"""
+
+T1_SUMMARY_PROMPT = """\
+Analyze the following log activity from the past {window_minutes} minutes.
+
+LOG COUNTS (total lines per stream):
+{counts}
+
+RECENT LOGS — sim-steward ({sim_steward_count} lines shown):
+{sim_steward_sample}
+
+RECENT LOGS — claude-dev-logging ({claude_dev_count} lines shown):
+{claude_dev_sample}
+
+RECENT LOGS — claude-token-metrics ({claude_token_count} lines shown):
+{claude_token_sample}
+
+Respond with this JSON schema exactly:
+{{
+ "summary": "<2-3 sentence narrative of what happened this window>",
+ "cycle_notes": ""
+}}
+"""
+
+T1_ANOMALY_PROMPT = """\
+You have already summarized this window:
+{summary}
+
+Now analyze the same logs for anomalies. Look for:
+- Error spikes or unexpected ERROR/WARN levels
+- Gaps in expected activity (e.g. session started but no actions followed)
+- Unusual token costs or AI session patterns
+- WebSocket disconnects, action failures, plugin crashes
+- Anything that deviates from normal healthy operation
+
+LOG COUNTS:
+{counts}
+
+RECENT LOGS — sim-steward:
+{sim_steward_sample}
+
+RECENT LOGS — claude-dev-logging:
+{claude_dev_sample}
+
+RECENT LOGS — claude-token-metrics:
+{claude_token_sample}
+
+Respond with this JSON schema exactly:
+{{
+ "anomalies": [
+ {{
+ "id": "",
+ "stream": "",
+ "description": "",
+ "severity": "",
+ "needs_t2": ,
+ "suggested_logql": ""
+ }}
+ ]
+}}
+
+Return an empty anomalies array if nothing looks wrong. Do not invent anomalies.
+"""
+
+# ── T2 prompts ───────────────────────────────────────────────────────────────
+
+T2_SYSTEM = """\
+You are a senior site reliability engineer investigating anomalies in a SimHub iRacing plugin system.
+You have been given anomaly flags, a chronological event timeline, and raw log evidence from targeted queries.
+Your job: determine root cause, identify cross-stream correlations, and provide concrete actionable recommendations.
+
+Stream guide:
+{stream_guide}
+
+Always respond with valid JSON only. No markdown, no explanation outside the JSON object.\
+"""
+
+T2_INVESTIGATION_PROMPT = """\
+ANOMALIES TO INVESTIGATE:
+{anomaly_descriptions}
+
+EVENT TIMELINE (past {window_minutes} minutes, chronological across all streams):
+{timeline_text}
+
+TARGETED LOG QUERIES AND RESULTS:
+{logql_results}
+
+Based on all of the above, respond with this JSON schema exactly:
+{{
+ "root_cause": "",
+ "issue_type": "",
+ "confidence": "",
+ "correlation": "",
+ "impact": "",
+ "recommendation": "",
+ "logql_queries_used": {logql_queries_list},
+ "sentry_worthy":
+}}
+"""
+
+# ── LogQL generation prompt ──────────────────────────────────────────────────
+
+LOGQL_GEN_SYSTEM = """\
+You are a Loki LogQL expert. Generate precise LogQL queries to investigate anomalies.
+Always respond with a valid JSON array of strings only. No explanation.\
+"""
+
+LOGQL_GEN_PROMPT = """\
+Generate up to 5 LogQL queries to investigate these anomalies:
+{anomaly_descriptions}
+
+Available streams (use exact app label values):
+- {{app="sim-steward"}} — plugin actions, iRacing events
+- {{app="claude-dev-logging"}} — AI agent tool calls, lifecycle
+- {{app="claude-token-metrics"}} — AI session token summaries
+
+Time window: past {window_minutes} minutes.
+
+Rules:
+- Every query must start with {{ and contain at least one |
+- Use | json to parse JSON log lines
+- Use | level = "ERROR" or | event = "..." to filter
+- Keep queries focused and specific to the anomalies
+
+Respond with a JSON array of strings:
+["", "", ...]
+"""
+
+
+# ── Helper: build formatted stream guide ────────────────────────────────────
+
+def build_stream_guide() -> str:
+    """Render STREAM_DESCRIPTIONS as one indented line per stream, for
+    injection into the {stream_guide} slot of the system prompts."""
+    return "\n".join(
+        f" {app}: {desc}" for app, desc in STREAM_DESCRIPTIONS.items()
+    )
+
+
+# ── Helper: format log sample for prompt ────────────────────────────────────
+
+def format_log_sample(lines: list[dict], max_lines: int = 30) -> str:
+    """Serialize up to max_lines of the MOST RECENT log dicts, one JSON
+    object per indented line, for embedding in a prompt."""
+    import json
+    if not lines:
+        return " (no logs in this window)"
+    shown = lines[-max_lines:]  # most recent
+    # default=str keeps non-JSON-native values (e.g. datetimes) serializable.
+    return "\n".join(f" {json.dumps(line, default=str)}" for line in shown)
+
+
+# ── Helper: format LogQL results for T2 prompt ──────────────────────────────
+
+def format_logql_results(results: dict[str, list[dict]]) -> str:
+    """Format {query: result-lines} into '=== query ===' sections for the
+    T2 prompt. Each query's results are capped at 50 lines."""
+    import json
+    if not results:
+        return " (no additional queries executed)"
+    sections = []
+    for query, lines in results.items():
+        if not lines:
+            sections.append(f"=== {query} ===\n (0 results)")
+        else:
+            formatted = "\n".join(
+                f" {json.dumps(line, default=str)}" for line in lines[:50]
+            )
+            sections.append(f"=== {query} ===\n{formatted}")
+    return "\n\n".join(sections)
+
+
+# ── v3: Feature invocation formatter ────────────────────────────────────────
+
+def format_invocations(invocations, max_invocations: int = 15) -> str:
+    """Format FeatureInvocation list for injection into T1 prompt.
+
+    Shows at most max_invocations entries followed by a "... N more" marker.
+    Status tag is tri-state: FAILED (success is False), OK (success is
+    truthy), ? (success is None/unknown).
+    """
+    if not invocations:
+        return " (no feature invocations detected this window)"
+
+    shown = invocations[:max_invocations]
+    lines = []
+    for inv in shown:
+        status = "FAILED" if inv.success is False else ("OK" if inv.success else "?")
+        # Truncate error text so one bad invocation can't bloat the prompt.
+        err = f" error={inv.error[:60]}" if inv.error else ""
+        lines.append(
+            f" [{status}] {inv.action_type} via {inv.correlation_method} "
+            f"({inv.duration_ms}ms, {len(inv.events)} events){err}"
+        )
+    if len(invocations) > max_invocations:
+        lines.append(f" [... {len(invocations) - max_invocations} more invocations not shown]")
+    return "\n".join(lines)
+
+
+def format_evidence_packets_for_t2(packet_dicts: list[dict]) -> str:
+    """Format Loki-serialized evidence packet metadata for T2 prompt.
+
+    Emits a small multi-line section per packet: severity/id/stream header,
+    truncated description, optional T1 hypothesis, confidence/counts line,
+    and optional suggested LogQL. All free text is length-capped.
+    """
+    if not packet_dicts:
+        return " (no evidence packets available)"
+    lines = []
+    for p in packet_dicts:
+        lines.append(
+            f" [{p.get('severity', '?').upper()}] anomaly_id={p.get('anomaly_id', '?')} "
+            f"stream={p.get('detector_stream', '?')}"
+        )
+        lines.append(f" {p.get('anomaly_description', '')[:120]}")
+        if p.get("t1_hypothesis"):
+            lines.append(f" T1 hypothesis: {p['t1_hypothesis'][:120]}")
+        lines.append(
+            f" confidence={p.get('t1_confidence', 0):.0%} "
+            f"related_lines={p.get('related_lines_count', 0)} "
+            f"invocations={p.get('invocation_count', 0)}"
+        )
+        if p.get("suggested_logql"):
+            lines.append(f" suggested_logql: {p['suggested_logql'][:120]}")
+        lines.append("")  # blank separator between packets
+    return "\n".join(lines)
+
+
+# ── v3: T1 anomaly prompt with invocations + baseline context ────────────────
+
+T1_ANOMALY_PROMPT_V3 = """\
+You have already summarized this window:
+{summary}
+
+FEATURE INVOCATIONS (user actions traced end-to-end this window):
+{invocations_text}
+
+BASELINE CONTEXT (historical normal values — use to judge what is anomalous):
+{baseline_context}
+
+Now analyze the logs for anomalies. Look for:
+- Error spikes or unexpected ERROR/WARN levels
+- Failed feature invocations (action_type FAILED)
+- Gaps in expected activity (e.g. session started but no actions followed)
+- Unusual token costs or AI session patterns
+- WebSocket disconnects, action failures, plugin crashes
+- Metrics exceeding baselines by 3x or more
+- Anything deviating from historical normal operation
+
+LOG COUNTS:
+{counts}
+
+RECENT LOGS — sim-steward:
+{sim_steward_sample}
+
+RECENT LOGS — claude-dev-logging:
+{claude_dev_sample}
+
+RECENT LOGS — claude-token-metrics:
+{claude_token_sample}
+
+Respond with this JSON schema exactly:
+{{
+ "anomalies": [
+ {{
+ "id": "",
+ "stream": "",
+ "event_type": "",
+ "description": "",
+ "severity": "",
+ "needs_t2": ,
+ "hypothesis": "",
+ "confidence": <0.0 to 1.0>,
+ "trace_id": "",
+ "suggested_logql": ""
+ }}
+ ]
+}}
+
+Return an empty anomalies array if nothing looks wrong. Do not invent anomalies.
+"""
+
+
+# ── v3: T2 evidence-packet prompts ──────────────────────────────────────────
+
+T2_EVIDENCE_SYSTEM = """\
+You are a senior site reliability engineer investigating anomalies in a SimHub iRacing plugin system.
+You have been given pre-assembled evidence packets from T1 fast triage, plus relevant Sentry history.
+Your job: validate T1 hypotheses, determine root cause, identify cross-stream correlations, and provide
+concrete actionable recommendations.
+
+Stream guide:
+{stream_guide}
+
+Always respond with valid JSON only. No markdown, no explanation outside the JSON object.\
+"""
+
+T2_EVIDENCE_PROMPT = """\
+EVIDENCE PACKETS FROM T1 TRIAGE:
+{evidence_text}
+
+SENTRY HISTORY (existing issues matching these anomaly signatures):
+{sentry_context}
+
+ADDITIONAL LOG EVIDENCE (from targeted LogQL queries):
+{logql_results}
+
+Based on all of the above, respond with this JSON schema exactly:
+{{
+ "root_cause": "",
+ "issue_type": "",
+ "confidence": "",
+ "correlation": "",
+ "impact": "",
+ "recommendation": "",
+ "sentry_worthy": ,
+ "sentry_fingerprint": "",
+ "logql_queries_used": []
+}}
+"""
+
+
+# ── v3: T3 synthesis prompts ─────────────────────────────────────────────────
+
+T3_SYSTEM = """\
+You are a systems analyst synthesizing log data, anomaly findings, and Sentry history
+for a SimHub iRacing plugin with an integrated AI coding assistant.
+
+Your goal: answer "What was the user trying to do, and did it work?"
+Produce a human-readable synthesis covering sessions, patterns, costs, regressions, and health.
+
+Stream guide:
+{stream_guide}
+
+Always respond with valid JSON only. No markdown, no explanation outside the JSON object.\
+"""
+
+T3_SYNTHESIS_PROMPT = """\
+SYNTHESIS WINDOW: {window_description}
+MODE: {mode}
+
+T1 EVIDENCE PACKETS (anomalies found this period):
+{evidence_summary}
+
+T2 INVESTIGATIONS (deep findings this period):
+{investigation_summary}
+
+OPEN SENTRY ISSUES:
+{sentry_issues}
+
+RECENT RELEASES:
+{recent_releases}
+
+SESSION NARRATIVES:
+{session_narratives}
+
+Respond with this JSON schema exactly:
+{{
+ "period_summary": "<2-3 sentence overview of the period>",
+ "sessions_analyzed": ,
+ "features_worked": ["", ...],
+ "features_failed": ["", ...],
+ "recurring_patterns": [
+ {{
+ "pattern": "",
+ "occurrences": ,
+ "first_seen": "",
+ "recommendation": ""
+ }}
+ ],
+ "cost_summary": {{
+ "sessions": ,
+ "total_usd": ,
+ "mean_per_session_usd": ,
+ "trend": ""
+ }},
+ "regression_detected": ,
+ "regression_detail": "",
+ "action_items": ["", ""],
+ "baselines_need_update":
+}}
+"""
diff --git a/observability/local/log-sentinel/requirements.txt b/observability/local/log-sentinel/requirements.txt
new file mode 100644
index 0000000..e8f6ece
--- /dev/null
+++ b/observability/local/log-sentinel/requirements.txt
@@ -0,0 +1,5 @@
+flask>=3.0.0
+requests>=2.31.0
+schedule>=1.2.0
+sentry-sdk>=2.0.0
+pytest>=8.0.0
diff --git a/observability/local/log-sentinel/sentinel.py b/observability/local/log-sentinel/sentinel.py
new file mode 100644
index 0000000..55df187
--- /dev/null
+++ b/observability/local/log-sentinel/sentinel.py
@@ -0,0 +1,326 @@
+"""Log Sentinel v3 — main cycle orchestrator."""
+
+import logging
+import time
+import uuid
+from dataclasses import dataclass
+
+import schedule
+
+from baseline import BaselineManager
+from circuit_breaker import CircuitBreaker
+from config import Config
+from evidence import EvidenceBuilder
+from grafana_client import GrafanaClient
+from loki_client import LokiClient
+from ollama_client import OllamaClient
+from sentry_client import SentryClient
+from t1_agent import T1Agent, T1Result
+from t2_agent import T2Agent, T2Result
+from t3_agent import T3Agent
+from timeline import TimelineBuilder
+from trace import InvocationBuilder
+
+logger = logging.getLogger("sentinel")
+
+
+@dataclass
+class CycleResult:
+    """Outcome of one T1 cycle. Always returned, even when the cycle errored."""
+    cycle_id: str               # short uuid prefix, correlates pushed log entries
+    cycle_num: int
+    window_minutes: int
+    t1: T1Result | None         # None if Ollama circuit was open or T1 raised
+    timeline_event_count: int
+    anomaly_count: int          # 0 when t1 is None
+    duration_ms: int
+    error: str | None = None    # stringified exception, if any
+
+
+class Sentinel:
+    """Main orchestrator: wires Loki/Ollama/Grafana/Sentry clients to the
+    three analysis agents and runs the T1/T2/T3 cycles on a schedule.
+    T1 = fast triage, T2 = deep investigation, T3 = slow synthesis."""
+
+    def __init__(self, config: Config):
+        self.config = config
+
+        self.loki = LokiClient(config.loki_url)
+        self.ollama = OllamaClient(config.ollama_url)
+        self.grafana = GrafanaClient(config.grafana_url, config.grafana_user, config.grafana_password)
+        self.sentry = SentryClient(config.sentry_dsn, config.env_label)
+
+        # Independent breakers: a Loki outage must not gate LLM calls, and
+        # vice versa. Ollama gets a longer backoff.
+        self.loki_breaker = CircuitBreaker("loki", failure_threshold=3, backoff_sec=60)
+        self.ollama_breaker = CircuitBreaker("ollama", failure_threshold=3, backoff_sec=120)
+
+        self.baseline = BaselineManager(self.loki, config.baseline_path)
+        self.evidence_builder = EvidenceBuilder(self.loki)
+        self.invocation_builder = InvocationBuilder()
+        self.timeline_builder = TimelineBuilder(self.loki, self.loki_breaker)
+
+        self.t1_agent = T1Agent(
+            self.ollama, self.loki, self.ollama_breaker, config,
+            self.baseline, self.evidence_builder,
+        )
+        self.t2_agent = T2Agent(
+            self.ollama, self.loki, self.grafana, self.sentry,
+            self.ollama_breaker, config,
+        )
+        self.t3_agent = T3Agent(
+            self.ollama, self.loki, self.grafana, self.sentry,
+            self.baseline, self.ollama_breaker, config,
+        )
+
+        self._cycle_num = 0
+        # NOTE(review): _trigger_dedup is never read or written anywhere in
+        # this class as shown — confirm the /trigger webhook layer uses it,
+        # or remove it.
+        self._trigger_dedup: dict[str, float] = {}  # alertname → last trigger time.time()
+        self._stats = {
+            "cycles_completed": 0,
+            "total_anomalies": 0,
+            "last_cycle_duration_ms": 0,
+            "last_t1_duration_ms": 0,
+            "last_t2_run_ts": 0,
+            "last_t3_run_ts": 0,
+        }
+
+    # ── Public ───────────────────────────────────────────────────────────────
+
+    def start(self):
+        """Blocking schedule loop. Runs one T1 cycle immediately, then
+        registers T1/T2/T3 on their configured intervals and polls forever."""
+        logger.info(
+            "Sentinel v3 started: mode=%s t1=%ds t2=%ds t3=%ds fast=%s deep=%s",
+            self.config.sentinel_mode,
+            self.config.t1_interval_sec,
+            self.config.t2_interval_sec,
+            self.config.t3_interval_sec,
+            self.config.ollama_model_fast,
+            self.config.ollama_model_deep,
+        )
+        self.run_cycle()
+        schedule.every(self.config.t1_interval_sec).seconds.do(self.run_cycle)
+        schedule.every(self.config.t2_interval_sec).seconds.do(self.run_t2_cycle)
+        schedule.every(self.config.t3_interval_sec).seconds.do(self.run_t3_cycle)
+        while True:
+            schedule.run_pending()
+            time.sleep(1)
+
+    def run_cycle(self) -> CycleResult:
+        """T1 analysis cycle. Always returns CycleResult.
+
+        Gathers log counts/samples, builds the cross-stream timeline and
+        feature invocations, then runs the T1 agent (unless the Ollama
+        circuit is open). Results and errors are pushed back to Loki as
+        structured entries so the sentinel observes itself.
+        """
+        self._cycle_num += 1
+        cycle_id = str(uuid.uuid4())[:8]
+        cycle_start = time.time()
+
+        end_ns = self.loki.now_ns()
+        start_ns = end_ns - int(self.config.lookback_sec * 1e9)
+        window_minutes = max(1, self.config.lookback_sec // 60)
+
+        logger.info("Cycle #%d [%s] start: window=%dmin", self._cycle_num, cycle_id, window_minutes)
+
+        t1 = None
+        timeline_events = []
+        error = None
+
+        try:
+            # 1. Gather
+            counts, samples = self._gather(start_ns, end_ns)
+
+            # 2. Build timeline + invocations
+            timeline_events = self.timeline_builder.build(start_ns, end_ns)
+            tl_stats = self.timeline_builder.get_stats(timeline_events)
+            self.loki.push_timeline({
+                **tl_stats,
+                "cycle_id": cycle_id,
+                # NOTE(review): 60 is a magic truncation threshold — confirm
+                # it matches the timeline builder's own cap.
+                "truncated": tl_stats["event_count"] > 60,
+            }, self.config.env_label)
+
+            invocations = self.invocation_builder.build(timeline_events)
+
+            # 3. T1 analysis (skipped entirely while the Ollama circuit is open)
+            if not self.ollama_breaker.allow_request():
+                logger.warning("T1 skipped: Ollama circuit open")
+            else:
+                t1 = self.t1_agent.run(
+                    start_ns, end_ns, counts,
+                    samples["sim-steward"],
+                    samples["claude-dev-logging"],
+                    samples["claude-token-metrics"],
+                    invocations=invocations,
+                    trigger_source="scheduled",
+                )
+                self.loki.push_analyst_run({
+                    "cycle_id": cycle_id,
+                    "tier": "t1",
+                    "model": t1.model,
+                    "think_mode": True,
+                    "duration_ms": t1.total_duration_ms,
+                    "summary_duration_ms": t1.summary_duration_ms,
+                    "anomaly_duration_ms": t1.anomaly_duration_ms,
+                    "anomaly_count": len(t1.anomalies),
+                    "needs_t2_count": sum(1 for a in t1.anomalies if a.get("needs_t2")),
+                    "evidence_packet_count": len(t1.evidence_packets),
+                    "invocation_count": len(t1.invocations),
+                    "window_minutes": window_minutes,
+                    "trigger_source": t1.trigger_source,
+                }, self.config.env_label)
+
+        except Exception as e:
+            # Cycle must never crash the schedule loop; record and continue.
+            error = str(e)
+            logger.error("Cycle #%d error: %s", self._cycle_num, e)
+
+        duration_ms = int((time.time() - cycle_start) * 1000)
+        result = CycleResult(
+            cycle_id=cycle_id,
+            cycle_num=self._cycle_num,
+            window_minutes=window_minutes,
+            t1=t1,
+            timeline_event_count=len(timeline_events),
+            anomaly_count=len(t1.anomalies) if t1 else 0,
+            duration_ms=duration_ms,
+            error=error,
+        )
+
+        # Self-observability: every cycle outcome (including failures) is
+        # pushed to Loki for the sentinel-health dashboards.
+        self.loki.push_cycle({
+            "cycle_id": cycle_id,
+            "cycle_num": self._cycle_num,
+            "window_minutes": window_minutes,
+            "t1_duration_ms": t1.total_duration_ms if t1 else 0,
+            "anomaly_count": result.anomaly_count,
+            "evidence_packet_count": len(t1.evidence_packets) if t1 else 0,
+            "timeline_event_count": len(timeline_events),
+            "total_duration_ms": duration_ms,
+            "error": error,
+        }, self.config.env_label)
+
+        self._stats["cycles_completed"] = self._cycle_num
+        self._stats["last_cycle_duration_ms"] = duration_ms
+        self._stats["last_t1_duration_ms"] = t1.total_duration_ms if t1 else 0
+        if t1:
+            self._stats["total_anomalies"] += result.anomaly_count
+
+        logger.info(
+            "Cycle #%d complete: %d anomalies %d evidence_packets %dms",
+            self._cycle_num, result.anomaly_count,
+            len(t1.evidence_packets) if t1 else 0, duration_ms,
+        )
+        return result
+
+    def run_t2_cycle(self) -> None:
+        """Independent T2 investigation cycle — pulls evidence packets from Loki.
+
+        Fire-and-forget: all errors are logged, never propagated to the
+        schedule loop. May complete with result=None (nothing to investigate).
+        """
+        if not self.ollama_breaker.allow_request():
+            logger.warning("T2 cycle skipped: Ollama circuit open")
+            return
+        logger.info("T2 cycle starting")
+        try:
+            result = self.t2_agent.run()
+            self._stats["last_t2_run_ts"] = int(time.time())
+            if result:
+                logger.info(
+                    "T2 cycle complete: confidence=%s sentry=%s %dms",
+                    result.confidence, result.sentry_worthy, result.total_duration_ms,
+                )
+        except Exception as e:
+            logger.error("T2 cycle error: %s", e)
+
+    def run_t3_cycle(self) -> None:
+        """Independent T3 synthesis cycle — runs on slow cadence.
+
+        Fire-and-forget like run_t2_cycle; errors are logged and swallowed.
+        """
+        if not self.ollama_breaker.allow_request():
+            logger.warning("T3 cycle skipped: Ollama circuit open")
+            return
+        logger.info("T3 cycle starting (mode=%s)", self.config.sentinel_mode)
+        try:
+            result = self.t3_agent.run(trigger="scheduled")
+            self._stats["last_t3_run_ts"] = int(time.time())
+            logger.info(
+                "T3 cycle complete: %d sessions, regression=%s, %dms",
+                result.sessions_analyzed, result.regression_detected, result.inference_duration_ms,
+            )
+        except Exception as e:
+            logger.error("T3 cycle error: %s", e)
+
+    def trigger_cycle(
+        self,
+        alert_context: str,
+        trigger_tier: str,
+        alert_names: list[str],
+        lookback_sec: int = 1800,
+    ) -> None:
+        """Alert-driven cycle — called from /trigger webhook, runs in background thread.
+
+        Runs a T1 pass over the alert's lookback window; for "t2"-tier alerts
+        it then escalates straight to T2 (bypassing the usual needs_t2 gate).
+        All failures are logged and swallowed — the webhook thread must not die.
+        """
+        logger.info(
+            "Trigger cycle: tier=%s alerts=%s lookback=%ds",
+            trigger_tier, alert_names, lookback_sec,
+        )
+        end_ns = self.loki.now_ns()
+        start_ns = end_ns - lookback_sec * 1_000_000_000
+
+        try:
+            counts, samples = self._gather(start_ns, end_ns)
+            timeline_events = self.timeline_builder.build(start_ns, end_ns)
+            invocations = self.invocation_builder.build(timeline_events)
+        except Exception as e:
+            logger.error("Trigger cycle gather failed: %s", e)
+            return
+
+        if not self.ollama_breaker.allow_request():
+            logger.warning("Trigger cycle T1 skipped: Ollama circuit open")
+            return
+
+        t1 = None
+        try:
+            t1 = self.t1_agent.run(
+                start_ns, end_ns, counts,
+                samples["sim-steward"],
+                samples["claude-dev-logging"],
+                samples["claude-token-metrics"],
+                invocations=invocations,
+                alert_context=alert_context,
+                trigger_source="grafana_alert",
+                alert_names=alert_names,
+            )
+            logger.info(
+                "Trigger T1 complete: %d anomalies, %d evidence_packets, %dms",
+                len(t1.anomalies), len(t1.evidence_packets), t1.total_duration_ms,
+            )
+        except Exception as e:
+            logger.error("Trigger cycle T1 failed: %s", e)
+
+        # For t2-tier alerts, skip needs_t2 gate — escalate immediately
+        # (t1 may be None here if the T1 pass failed; forced_ids then stays None).
+        if trigger_tier == "t2" and self.config.t2_enabled:
+            if not self.ollama_breaker.allow_request():
+                logger.warning("Trigger cycle T2 skipped: Ollama circuit open")
+                return
+            try:
+                forced_ids = [ep.anomaly_id for ep in t1.evidence_packets] if t1 else None
+                result = self.t2_agent.run(forced_packet_ids=forced_ids)
+                self._stats["last_t2_run_ts"] = int(time.time())
+                if result:
+                    logger.info(
+                        "Trigger T2 complete: confidence=%s sentry=%s %dms",
+                        result.confidence, result.sentry_worthy, result.total_duration_ms,
+                    )
+            except Exception as e:
+                logger.error("Trigger cycle T2 failed: %s", e)
+
+    # ── Private ──────────────────────────────────────────────────────────────
+
+    def _gather(self, start_ns: int, end_ns: int) -> tuple[dict, dict]:
+        """Fetch counts and samples from all three Loki streams.
+
+        Returns (counts, samples) keyed by stream name. With the Loki circuit
+        open: counts are all 0 and samples empty. On a query failure: the
+        breaker records it and unfetched streams get count=-1 / empty samples
+        (-1 distinguishes "failed to fetch" from a genuine zero).
+        """
+        stream_queries = {
+            "sim-steward": '{app="sim-steward"} | json',
+            "claude-dev-logging": '{app="claude-dev-logging"} | json',
+            "claude-token-metrics": '{app="claude-token-metrics"} | json',
+        }
+
+        counts = {}
+        samples = {}
+
+        if not self.loki_breaker.allow_request():
+            logger.warning("Gather skipped: Loki circuit open")
+            return {k: 0 for k in stream_queries}, {k: [] for k in stream_queries}
+
+        try:
+            for name, logql in stream_queries.items():
+                counts[name] = self.loki.count(logql, start_ns, end_ns)
+                samples[name] = self.loki.query_lines(logql, start_ns, end_ns, limit=100)
+            self.loki_breaker.record_success()
+        except Exception as e:
+            self.loki_breaker.record_failure()
+            logger.error("Gather failed: %s", e)
+            for name in stream_queries:
+                counts.setdefault(name, -1)
+                samples.setdefault(name, [])
+
+        return counts, samples
diff --git a/observability/local/log-sentinel/sentry_client.py b/observability/local/log-sentinel/sentry_client.py
new file mode 100644
index 0000000..1fc98ed
--- /dev/null
+++ b/observability/local/log-sentinel/sentry_client.py
@@ -0,0 +1,230 @@
+"""Sentry SDK wrapper — create issues, read history, and capture behavioral findings.
+
+v3 additions:
+ - traces_sample_rate bumped to 1.0 (enable transactions)
+ - search_issues() — REST API read for T2/T3 history queries
+ - find_releases() — REST API read for T3 regression detection
+ - capture_behavioral_finding() — T2 writes behavioral patterns not captured by SDK
+"""
+
+import logging
+import requests
+
+logger = logging.getLogger("sentinel.sentry")
+
+_sdk_available = False
+try:
+ import sentry_sdk
+ _sdk_available = True
+except ImportError:
+ logger.warning("sentry-sdk not installed, Sentry integration disabled")
+
+
+class SentryClient:
+    """Thin wrapper over sentry_sdk (writes) and the Sentry REST API (reads).
+
+    Two independent enablement flags:
+      - self.enabled: SDK capture — requires a DSN AND an importable sentry_sdk.
+      - self._api_enabled: REST reads/writes — requires auth_token, org, project.
+    All public methods degrade to no-ops / empty results when disabled.
+    """
+
+    def __init__(
+        self,
+        dsn: str,
+        env: str = "local",
+        auth_token: str = "",
+        org: str = "",
+        project: str = "",
+    ):
+        # SDK capture needs both a DSN and the sentry-sdk package present.
+        self.enabled = bool(dsn) and _sdk_available
+        self._auth_token = auth_token
+        self._org = org
+        self._project = project
+        # REST API access needs all three credentials, independent of the SDK.
+        self._api_enabled = bool(auth_token and org and project)
+
+        if self.enabled:
+            sentry_sdk.init(
+                dsn=dsn,
+                environment=env,
+                traces_sample_rate=1.0,  # v3: sample every transaction
+                send_default_pii=False,
+            )
+            logger.info("Sentry initialized (env=%s)", env)
+        else:
+            # Distinguish "misconfigured" (DSN set, SDK missing) from
+            # "intentionally off" (no DSN) in the logs.
+            if dsn and not _sdk_available:
+                logger.warning("Sentry DSN set but sentry-sdk not installed")
+            elif not dsn:
+                logger.info("Sentry disabled (no DSN)")
+
+    def create_issue(self, finding) -> str | None:
+        """Create Sentry issue for a critical finding. Returns event_id or None.
+
+        Attaches detector/category/severity tags and a "finding" context on a
+        fresh scope; the scope fingerprint (detector, finding.fingerprint)
+        makes Sentry group repeat findings into one issue. Never raises:
+        failures are logged at warning level and return None.
+        """
+        if not self.enabled:
+            return None
+        try:
+            with sentry_sdk.new_scope() as scope:
+                scope.set_tag("detector", finding.detector)
+                scope.set_tag("category", finding.category)
+                scope.set_tag("severity", finding.severity)
+                # No classification at creation time; T2 paths set a real issue_type.
+                scope.set_tag("issue_type", "unknown")
+                scope.set_context("finding", {
+                    "finding_id": finding.finding_id,
+                    "fingerprint": finding.fingerprint,
+                    "summary": finding.summary,
+                    "logql_query": finding.logql_query,
+                    "evidence": finding.evidence,
+                })
+                scope.fingerprint = [finding.detector, finding.fingerprint]
+                event_id = sentry_sdk.capture_message(
+                    f"[CRITICAL] {finding.title}",
+                    level="error",
+                    scope=scope,
+                )
+            logger.info("Sentry issue created for finding %s: %s", finding.finding_id[:8], event_id)
+            return event_id
+        except Exception as e:
+            logger.warning("Sentry create_issue failed: %s", e)
+            return None
+
+    def capture_behavioral_finding(
+        self,
+        title: str,
+        issue_type: str,
+        recommendation: str,
+        confidence: str,
+        fingerprint: str,
+        context: dict,
+    ) -> str | None:
+        """Create Sentry issue for a T2 behavioral finding (not captured by SDK).
+
+        Only call this for patterns that wouldn't surface as clean exceptions:
+        e.g. 'WebSocket always drops after 20min replay', 'incident detection stalls
+        after session_num > 3'. Do NOT use for things already covered by SDK capture.
+
+        Level mapping: confidence "high" → "error", anything else → "warning".
+        Fingerprint is namespaced under "t2.behavioral" so behavioral issues
+        never collide with detector-fingerprinted findings. Returns event_id,
+        or None when disabled or on capture failure.
+        """
+        if not self.enabled:
+            return None
+        try:
+            level = "error" if confidence == "high" else "warning"
+            with sentry_sdk.new_scope() as scope:
+                scope.set_tag("issue_type", issue_type)
+                scope.set_tag("confidence", confidence)
+                scope.set_tag("source", "t2_behavioral")
+                # Caller-provided context is merged alongside the recommendation.
+                scope.set_context("finding", {
+                    "recommendation": recommendation,
+                    **context,
+                })
+                scope.fingerprint = ["t2.behavioral", fingerprint]
+                event_id = sentry_sdk.capture_message(
+                    f"[T2] {title}",
+                    level=level,
+                    scope=scope,
+                )
+            logger.info("Sentry behavioral finding created: %s", event_id)
+            return event_id
+        except Exception as e:
+            logger.warning("Sentry capture_behavioral_finding failed: %s", e)
+            return None
+
+ # ── REST API read methods ──────────────────────────────────────────────────
+
+    def search_issues(self, query: str = "is:unresolved", limit: int = 10) -> list[dict]:
+        """Search Sentry issues via REST API. Returns list of issue dicts.
+
+        Read path used by T2/T3 for history context. Returns [] when the REST
+        API is not configured, on any transport error, or on a non-200 reply
+        (logged at debug — this is best-effort context, not a hard dependency).
+        """
+        if not self._api_enabled:
+            return []
+        try:
+            resp = requests.get(
+                f"https://sentry.io/api/0/projects/{self._org}/{self._project}/issues/",
+                headers={"Authorization": f"Bearer {self._auth_token}"},
+                params={"query": query, "limit": limit},
+                timeout=10,
+            )
+            if resp.status_code == 200:
+                return resp.json()
+            # Non-200 is surfaced at debug only; callers treat [] as "no history".
+            logger.debug("Sentry search_issues HTTP %d: %s", resp.status_code, resp.text[:200])
+        except Exception as e:
+            logger.debug("Sentry search_issues failed: %s", e)
+        return []
+
+ def get_issue(self, issue_id: str) -> dict:
+ """Fetch a single Sentry issue by ID."""
+ if not self._api_enabled:
+ return {}
+ try:
+ resp = requests.get(
+ f"https://sentry.io/api/0/issues/{issue_id}/",
+ headers={"Authorization": f"Bearer {self._auth_token}"},
+ timeout=10,
+ )
+ if resp.status_code == 200:
+ return resp.json()
+ except Exception as e:
+ logger.debug("Sentry get_issue failed: %s", e)
+ return {}
+
+    def find_releases(self, limit: int = 5) -> list[dict]:
+        """Fetch recent releases for regression detection in T3.
+
+        Returns a list of release dicts, or [] when the REST API is not
+        configured or the request fails.
+        """
+        if not self._api_enabled:
+            return []
+        try:
+            resp = requests.get(
+                f"https://sentry.io/api/0/projects/{self._org}/{self._project}/releases/",
+                headers={"Authorization": f"Bearer {self._auth_token}"},
+                params={"limit": limit},
+                timeout=10,
+            )
+            if resp.status_code == 200:
+                return resp.json()
+            # NOTE(review): non-200 responses fall through silently here, unlike
+            # search_issues() which logs them — consider matching that behavior.
+        except Exception as e:
+            logger.debug("Sentry find_releases failed: %s", e)
+        return []
+
+    def create_release(self, version: str) -> dict:
+        """Create a Sentry release (called from deploy.ps1 via this client).
+
+        POSTs to the organization-level releases endpoint, associating the new
+        release with this client's project. Accepts both 200 and 201 as
+        success. Failures are logged at warning level (this is a deploy-time
+        write, so louder than the read paths) and return {}.
+        """
+        if not self._api_enabled:
+            return {}
+        try:
+            resp = requests.post(
+                f"https://sentry.io/api/0/organizations/{self._org}/releases/",
+                headers={"Authorization": f"Bearer {self._auth_token}"},
+                json={"version": version, "projects": [self._project]},
+                timeout=10,
+            )
+            if resp.status_code in (200, 201):
+                return resp.json()
+            logger.debug("Sentry create_release HTTP %d: %s", resp.status_code, resp.text[:200])
+        except Exception as e:
+            logger.warning("Sentry create_release failed: %s", e)
+        return {}
+
+    def create_investigation_issue(self, investigation) -> str | None:
+        """Create Sentry issue for a T2 investigation report. Returns event_id or None.
+
+        Level mapping: finding severity "critical" → "error", else "warning".
+        Tags carry both the original finding's metadata (detector/category/
+        severity) and the investigation's (model/confidence/issue_type/trigger);
+        two contexts separate investigation output from finding input.
+        Never raises: failures log at warning level and return None.
+        """
+        if not self.enabled:
+            return None
+        try:
+            finding = investigation.finding
+            level = "error" if finding.severity == "critical" else "warning"
+            with sentry_sdk.new_scope() as scope:
+                scope.set_tag("detector", finding.detector)
+                scope.set_tag("category", finding.category)
+                scope.set_tag("severity", finding.severity)
+                scope.set_tag("model", investigation.model)
+                scope.set_tag("confidence", investigation.confidence)
+                scope.set_tag("issue_type", investigation.issue_type)
+                scope.set_tag("trigger", investigation.trigger)
+                scope.set_context("investigation", {
+                    "investigation_id": investigation.investigation_id,
+                    "finding_id": finding.finding_id,
+                    "root_cause": investigation.root_cause,
+                    "correlation": investigation.correlation,
+                    "impact": investigation.impact,
+                    "recommendation": investigation.recommendation,
+                    "inference_duration_ms": investigation.inference_duration_ms,
+                })
+                scope.set_context("finding", {
+                    "title": finding.title,
+                    "summary": finding.summary,
+                    "evidence": finding.evidence,
+                })
+                # NOTE(review): fingerprinting on free-text root_cause[:50] may
+                # split near-identical investigations into separate issues —
+                # confirm this grouping is intended.
+                scope.fingerprint = [finding.detector, investigation.root_cause[:50]]
+                event_id = sentry_sdk.capture_message(
+                    f"[T2] {investigation.root_cause[:120]}",
+                    level=level,
+                    scope=scope,
+                )
+            logger.info("Sentry investigation issue for %s: %s", investigation.investigation_id[:8], event_id)
+            return event_id
+        except Exception as e:
+            logger.warning("Sentry create_investigation_issue failed: %s", e)
+            return None
diff --git a/observability/local/log-sentinel/t1_agent.py b/observability/local/log-sentinel/t1_agent.py
new file mode 100644
index 0000000..f6119f8
--- /dev/null
+++ b/observability/local/log-sentinel/t1_agent.py
@@ -0,0 +1,220 @@
+"""T1 — Fast triage agent.
+
+Replaces the T1 half of analyst.py for v3.
+Key changes over v2 Analyst.run_t1():
+ - Accepts pre-built FeatureInvocations from InvocationBuilder
+ - Injects BaselineManager context into anomaly prompt
+ - Accepts optional T0 alert context for event-driven runs
+ - Builds EvidencePackets for each anomaly via EvidenceBuilder
+ - Pushes sentinel_evidence_packet events to Loki
+ - T1Result carries invocations + evidence_packets + trigger metadata
+"""
+
+import logging
+from dataclasses import dataclass, field
+
+from analyst import _parse_json, _normalize_anomalies
+from baseline import BaselineManager
+from circuit_breaker import CircuitBreaker
+from config import Config
+from evidence import EvidenceBuilder, EvidencePacket
+from loki_client import LokiClient
+from ollama_client import OllamaClient
+from prompts import (
+ T1_SYSTEM, T1_SUMMARY_PROMPT, T1_ANOMALY_PROMPT_V3,
+ build_stream_guide, format_log_sample, format_invocations,
+)
+from trace import FeatureInvocation
+
+logger = logging.getLogger("sentinel.t1")
+
+
+@dataclass
+class T1Result:
+    """Outcome of a single T1 triage run (summary call + anomaly scan call)."""
+
+    summary: str  # human-readable window summary from call A
+    cycle_notes: str  # carry-forward notes from call A
+    anomalies: list[dict]  # normalized anomaly dicts (see _normalize_anomalies_v3)
+    invocations: list[FeatureInvocation]  # pre-built by InvocationBuilder, passed through
+    evidence_packets: list[EvidencePacket]  # one per anomaly, already pushed to Loki
+    model: str  # fast model used for both T1 calls
+    summary_duration_ms: int  # call A latency
+    anomaly_duration_ms: int  # call B latency
+    trigger_source: str  # "scheduled" | "grafana_alert"
+    alert_names: list[str]  # T0 alert names that triggered this run
+    # Raw LLM responses kept for debugging; excluded from repr to keep logs short.
+    raw_summary_response: str = field(repr=False, default="")
+    raw_anomaly_response: str = field(repr=False, default="")
+
+    @property
+    def needs_t2(self) -> bool:
+        # True when any anomaly was flagged for deep (T2) investigation.
+        return any(a.get("needs_t2") for a in self.anomalies)
+
+    @property
+    def total_duration_ms(self) -> int:
+        # Combined LLM time across both T1 calls.
+        return self.summary_duration_ms + self.anomaly_duration_ms
+
+
+class T1Agent:
+    """Fast triage agent: two LLM calls per cycle over fresh Loki samples."""
+
+    def __init__(
+        self,
+        ollama: OllamaClient,
+        loki: LokiClient,
+        breaker: CircuitBreaker,
+        config: Config,
+        baseline: BaselineManager,
+        evidence_builder: EvidenceBuilder,
+    ):
+        # All collaborators are injected; this class owns no I/O clients itself.
+        self.ollama = ollama
+        self.loki = loki
+        self.breaker = breaker  # guards the Ollama calls in run()
+        self.config = config
+        self.baseline = baseline
+        self.evidence_builder = evidence_builder
+        # Stream guide is static prompt text; build once, reuse every cycle.
+        self._stream_guide = build_stream_guide()
+
+    def run(
+        self,
+        start_ns: int,
+        end_ns: int,
+        counts: dict[str, int],
+        sim_steward_sample: list[dict],
+        claude_dev_sample: list[dict],
+        claude_token_sample: list[dict],
+        invocations: list[FeatureInvocation],
+        alert_context: str = "",
+        trigger_source: str = "scheduled",
+        alert_names: list[str] | None = None,
+    ) -> T1Result:
+        """Run one T1 triage cycle over the [start_ns, end_ns) window.
+
+        Two LLM calls: (A) a fast no-think summary of the window, then
+        (B) a thinking anomaly scan that additionally sees invocations and
+        baseline context. Each call is independently fault-tolerant — a
+        failure logs the error, records a breaker failure, and leaves that
+        part of the result empty rather than aborting the cycle. Evidence
+        packets are then built per anomaly and pushed to Loki best-effort.
+        Always returns a T1Result, possibly partially empty.
+        """
+        window_minutes = max(1, int((end_ns - start_ns) / 1e9 / 60))
+        counts_text = "\n".join(f" {k}: {v}" for k, v in counts.items())
+
+        # Shared format kwargs for both prompt templates.
+        samples = dict(
+            sim_steward_sample=format_log_sample(sim_steward_sample),
+            sim_steward_count=len(sim_steward_sample),
+            claude_dev_sample=format_log_sample(claude_dev_sample),
+            claude_dev_count=len(claude_dev_sample),
+            claude_token_sample=format_log_sample(claude_token_sample),
+            claude_token_count=len(claude_token_sample),
+        )
+
+        invocations_text = format_invocations(invocations)
+        baseline_context = self.baseline.get_prompt_context()
+        system = T1_SYSTEM.format(stream_guide=self._stream_guide)
+
+        # Optional T0 alert context prefix — injected into both calls
+        alert_prefix = ""
+        if alert_context:
+            alert_prefix = (
+                f"ALERT CONTEXT (from Grafana):\n{alert_context}\n"
+                "→ Focus investigation on this signal. Do not suppress even if recent history is quiet.\n\n"
+            )
+
+        # Call A: summary (/no_think — fast)
+        summary_prompt = alert_prefix + T1_SUMMARY_PROMPT.format(
+            window_minutes=window_minutes,
+            counts=counts_text,
+            **samples,
+        )
+        summary_text = ""
+        cycle_notes = ""
+        summary_ms = 0
+        raw_summary = ""
+        try:
+            raw_summary, summary_ms = self.ollama.generate(
+                self.config.ollama_model_fast,
+                system + "\n\n" + summary_prompt,
+                think=False,
+            )
+            self.breaker.record_success()
+            parsed = _parse_json(raw_summary)
+            summary_text = parsed.get("summary", "")
+            cycle_notes = parsed.get("cycle_notes", "")
+        except Exception as e:
+            self.breaker.record_failure()
+            logger.error("T1 summary call failed: %s", e)
+
+        # Call B: anomaly scan (/think) — invocations + baseline context included.
+        # If call A failed, the scan still runs with a placeholder summary.
+        anomaly_prompt = alert_prefix + T1_ANOMALY_PROMPT_V3.format(
+            summary=summary_text or "(summary unavailable)",
+            counts=counts_text,
+            invocations_text=invocations_text,
+            baseline_context=baseline_context,
+            **samples,
+        )
+        anomalies = []
+        anomaly_ms = 0
+        raw_anomaly = ""
+        try:
+            raw_anomaly, anomaly_ms = self.ollama.generate(
+                self.config.ollama_model_fast,
+                system + "\n\n" + anomaly_prompt,
+                think=True,
+            )
+            self.breaker.record_success()
+            parsed = _parse_json(raw_anomaly)
+            anomalies = _normalize_anomalies_v3(parsed.get("anomalies", []))
+        except Exception as e:
+            self.breaker.record_failure()
+            logger.error("T1 anomaly call failed: %s", e)
+
+        # Build evidence packets for each anomaly, push to Loki.
+        # Push failures are per-packet and non-fatal (packets are a best-effort
+        # state store for the T2 agent to read later).
+        evidence_packets = []
+        if anomalies:
+            evidence_packets = self.evidence_builder.build_many(
+                anomalies, invocations, start_ns, end_ns
+            )
+            for packet in evidence_packets:
+                try:
+                    self.loki.push_evidence_packet(packet, env=self.config.env_label)
+                except Exception as e:
+                    logger.warning("Failed to push evidence packet %s: %s", packet.anomaly_id, e)
+
+        logger.info(
+            "T1 [%s]: %d invocations, %d anomalies (%d→T2), %d evidence packets, summary=%dms anomaly=%dms",
+            trigger_source,
+            len(invocations),
+            len(anomalies),
+            sum(1 for a in anomalies if a.get("needs_t2")),
+            len(evidence_packets),
+            summary_ms,
+            anomaly_ms,
+        )
+
+        return T1Result(
+            summary=summary_text,
+            cycle_notes=cycle_notes,
+            anomalies=anomalies,
+            invocations=invocations,
+            evidence_packets=evidence_packets,
+            model=self.config.ollama_model_fast,
+            summary_duration_ms=summary_ms,
+            anomaly_duration_ms=anomaly_ms,
+            trigger_source=trigger_source,
+            alert_names=alert_names or [],
+            raw_summary_response=raw_summary,
+            raw_anomaly_response=raw_anomaly,
+        )
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def _normalize_anomalies_v3(raw: list) -> list[dict]:
+    """Normalize v3 anomaly dicts from T1 LLM output (superset of v2 fields).
+
+    Defensive against malformed LLM JSON:
+      - non-list input returns []; non-dict entries are dropped
+      - string fields are coerced via str() and length-capped
+        (id/event_type/trace_id 64, description 500, hypothesis 300,
+        suggested_logql 300)
+      - severity must be one of info/warn/critical, else falls back to "info"
+      - confidence must already be numeric, else defaults to 0.5
+    """
+    if not isinstance(raw, list):
+        return []
+    valid = []
+    for a in raw:
+        if not isinstance(a, dict):
+            continue
+        valid.append({
+            "id": str(a.get("id", "unknown"))[:64],
+            "stream": a.get("stream", "unknown"),
+            "event_type": str(a.get("event_type", ""))[:64],
+            "description": str(a.get("description", ""))[:500],
+            "severity": a.get("severity", "info") if a.get("severity") in ("info", "warn", "critical") else "info",
+            "needs_t2": bool(a.get("needs_t2", False)),
+            "hypothesis": str(a.get("hypothesis", ""))[:300],
+            "confidence": float(a.get("confidence", 0.5)) if isinstance(a.get("confidence"), (int, float)) else 0.5,
+            "trace_id": str(a.get("trace_id", ""))[:64],
+            "suggested_logql": str(a.get("suggested_logql", ""))[:300],
+        })
+    return valid
diff --git a/observability/local/log-sentinel/t2_agent.py b/observability/local/log-sentinel/t2_agent.py
new file mode 100644
index 0000000..a10ea91
--- /dev/null
+++ b/observability/local/log-sentinel/t2_agent.py
@@ -0,0 +1,318 @@
+"""T2 — Deep investigation agent.
+
+Replaces the T2 half of analyst.py for v3.
+Key changes over v2 Analyst.run_t2():
+ - Reads evidence packets from Loki (state store), not from T1Result directly
+ - Queries Sentry for existing issues before forming recommendations
+ - Produces sentinel_t2_investigation events to Loki
+ - Creates Grafana annotation per investigation
+ - Creates Sentry issue if sentry_worthy + high confidence + not already captured
+
+Input flow:
+ Loki {event="sentinel_evidence_packet"} (last 15 min)
+ → SentryClient.search_issues() for each anomaly signature
+ → qwen3:32b /think
+ → LokiClient.push_t2_investigation()
+ → GrafanaClient.annotate_raw()
+ → SentryClient.capture_message() if warranted
+"""
+
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+
+from analyst import _parse_json, _normalize_confidence, _normalize_issue_type, _valid_logql
+from circuit_breaker import CircuitBreaker
+from config import Config
+from grafana_client import GrafanaClient
+from loki_client import LokiClient
+from ollama_client import OllamaClient
+from prompts import (
+ T2_EVIDENCE_SYSTEM, T2_EVIDENCE_PROMPT,
+ build_stream_guide, format_evidence_packets_for_t2, format_logql_results,
+ LOGQL_GEN_SYSTEM, LOGQL_GEN_PROMPT,
+)
+from sentry_client import SentryClient
+
+logger = logging.getLogger("sentinel.t2")
+
+# How far back to pull evidence packets from Loki
+_EVIDENCE_LOOKBACK_SEC = 900 # 15 minutes
+
+
+@dataclass
+class T2Result:
+    """Outcome of one T2 deep investigation over recent evidence packets."""
+
+    root_cause: str  # LLM's root-cause statement (or a fallback message)
+    issue_type: str  # normalized via _normalize_issue_type
+    confidence: str  # normalized via _normalize_confidence
+    correlation: str
+    impact: str
+    recommendation: str
+    logql_queries_used: list[str]  # seeded + generated + LLM-reported queries
+    sentry_worthy: bool  # LLM verdict; Sentry issue created only if also high confidence
+    sentry_fingerprint: str  # capped at 100 chars
+    evidence_packet_count: int  # deduped packets fed into the investigation
+    sentry_event_id: str | None  # filled in after run() if an issue was created
+    model: str  # deep model used for inference
+    inference_duration_ms: int
+    logql_gather_duration_ms: int  # time spent generating + executing LogQL
+    # Raw LLM response kept for debugging; excluded from repr.
+    raw_response: str = field(repr=False, default="")
+
+    @property
+    def total_duration_ms(self) -> int:
+        # Inference plus LogQL evidence-gathering time.
+        return self.inference_duration_ms + self.logql_gather_duration_ms
+
+
+class T2Agent:
+    """Deep investigation agent: reads evidence packets from Loki, consults
+    Sentry history, runs the deep model, and publishes results to Loki,
+    Grafana, and (when warranted) Sentry."""
+
+    def __init__(
+        self,
+        ollama: OllamaClient,
+        loki: LokiClient,
+        grafana: GrafanaClient,
+        sentry: SentryClient,
+        breaker: CircuitBreaker,
+        config: Config,
+    ):
+        # Collaborators are injected; this class owns no I/O clients itself.
+        self.ollama = ollama
+        self.loki = loki
+        self.grafana = grafana
+        self.sentry = sentry
+        self.breaker = breaker  # guards the deep-model inference call
+        self.config = config
+        # Static prompt text; build once, reuse for every investigation.
+        self._stream_guide = build_stream_guide()
+
+    def run(
+        self,
+        end_ns: int | None = None,
+        lookback_sec: int = _EVIDENCE_LOOKBACK_SEC,
+        forced_packet_ids: list[str] | None = None,
+    ) -> T2Result | None:
+        """
+        Run T2 investigation over recent evidence packets.
+
+        forced_packet_ids: if set, only process these specific anomaly_ids
+        (used when T1 immediately escalates critical anomalies)
+
+        Returns None when the window contains no evidence packets; otherwise
+        always returns a T2Result. If the deep-model inference fails, the
+        result degrades to the fallback field values (root_cause "Unable to
+        determine...", confidence "low", etc.) rather than raising.
+        """
+        if end_ns is None:
+            end_ns = self.loki.now_ns()
+        start_ns = end_ns - lookback_sec * 1_000_000_000
+
+        # Step 1: load evidence packets from Loki (the T1→T2 state store)
+        packet_dicts = self._load_evidence_packets(start_ns, end_ns, forced_packet_ids)
+        if not packet_dicts:
+            logger.info("T2: no evidence packets in window, skipping")
+            return None
+
+        # Step 2: read Sentry history for context
+        sentry_context = self._fetch_sentry_context(packet_dicts)
+
+        # Step 3: generate + execute targeted LogQL for additional evidence
+        gather_start = time.time()
+        queries = self._generate_logql_queries(packet_dicts, lookback_sec // 60)
+        logql_results = self._execute_logql_queries(queries, start_ns, end_ns)
+        gather_ms = int((time.time() - gather_start) * 1000)
+
+        # Step 4: T2 inference with the deep model (/think)
+        system = T2_EVIDENCE_SYSTEM.format(stream_guide=self._stream_guide)
+        prompt = T2_EVIDENCE_PROMPT.format(
+            evidence_text=format_evidence_packets_for_t2(packet_dicts),
+            sentry_context=sentry_context,
+            logql_results=format_logql_results(logql_results),
+        )
+
+        raw = ""
+        infer_ms = 0
+        try:
+            raw, infer_ms = self.ollama.generate(
+                self.config.ollama_model_deep,
+                system + "\n\n" + prompt,
+                think=True,
+            )
+            self.breaker.record_success()
+        except Exception as e:
+            self.breaker.record_failure()
+            logger.error("T2 inference failed: %s", e)
+
+        # On inference failure raw stays "", so _parse_json yields {} and the
+        # result below is built entirely from default fallbacks.
+        parsed = _parse_json(raw)
+        all_queries = queries + list(parsed.get("logql_queries_used", []))
+
+        result = T2Result(
+            root_cause=parsed.get("root_cause", "Unable to determine root cause."),
+            issue_type=_normalize_issue_type(parsed.get("issue_type", "unknown")),
+            confidence=_normalize_confidence(parsed.get("confidence", "low")),
+            correlation=parsed.get("correlation", "No correlations identified."),
+            impact=parsed.get("impact", "Impact unknown."),
+            recommendation=parsed.get("recommendation", "Investigate manually."),
+            logql_queries_used=all_queries,
+            sentry_worthy=bool(parsed.get("sentry_worthy", False)),
+            sentry_fingerprint=str(parsed.get("sentry_fingerprint", ""))[:100],
+            evidence_packet_count=len(packet_dicts),
+            sentry_event_id=None,
+            model=self.config.ollama_model_deep,
+            inference_duration_ms=infer_ms,
+            logql_gather_duration_ms=gather_ms,
+            raw_response=raw,
+        )
+
+        # Step 5: push investigation to Loki + Grafana (both best-effort)
+        self._push_investigation(result, packet_dicts, end_ns)
+        self._annotate_grafana(result)
+
+        # Step 6: create Sentry issue only when the LLM says it is worthy AND
+        # confidence is high — keeps low-confidence noise out of Sentry.
+        if result.sentry_worthy and result.confidence == "high":
+            event_id = self._create_sentry_issue(result, packet_dicts)
+            result.sentry_event_id = event_id
+
+        logger.info(
+            "T2 complete: confidence=%s sentry=%s packets=%d gather=%dms infer=%dms queries=%d",
+            result.confidence, result.sentry_worthy,
+            len(packet_dicts), gather_ms, infer_ms, len(all_queries),
+        )
+        return result
+
+ # ── Private ───────────────────────────────────────────────────────────────
+
+    def _load_evidence_packets(
+        self,
+        start_ns: int,
+        end_ns: int,
+        forced_ids: list[str] | None,
+    ) -> list[dict]:
+        """Load T1 evidence packets from Loki for the window.
+
+        Optionally filters to forced_ids (immediate T1→T2 escalation), then
+        dedups by anomaly_id keeping the packet with the newest
+        assembled_at_ns. Returns a list of packet dicts.
+        """
+        logql = '{app="sim-steward", event="sentinel_evidence_packet"}'
+        packets = self.loki.query_lines(logql, start_ns, end_ns, limit=100)
+        if forced_ids:
+            packets = [p for p in packets if p.get("anomaly_id") in forced_ids]
+        # Dedup by anomaly_id, keep most recent
+        seen: dict[str, dict] = {}
+        for p in packets:
+            aid = p.get("anomaly_id", "")
+            if aid not in seen or p.get("assembled_at_ns", 0) > seen[aid].get("assembled_at_ns", 0):
+                seen[aid] = p
+        return list(seen.values())
+
+    def _fetch_sentry_context(self, packet_dicts: list[dict]) -> str:
+        """Build a prompt-ready text block of matching Sentry issue history.
+
+        Prefers critical packets when building the search query; takes at most
+        three packets and searches on their stream names plus truncated
+        descriptions. Always returns a string — a parenthesized placeholder
+        when there is nothing to show or Sentry is unreachable.
+        """
+        if not packet_dicts:
+            return "(no Sentry history available)"
+        # Build a query from the most severe anomaly descriptions
+        critical = [p for p in packet_dicts if p.get("severity") == "critical"]
+        sample = (critical or packet_dicts)[:3]
+        streams = list({p.get("detector_stream", "") for p in sample if p.get("detector_stream")})
+        query = " ".join(streams) + " " + " ".join(
+            p.get("anomaly_description", "")[:40] for p in sample
+        )
+        try:
+            issues = self.sentry.search_issues(query=query.strip()[:200], limit=5)
+            if not issues:
+                return "(no matching Sentry issues found)"
+            lines = []
+            for issue in issues:
+                lines.append(
+                    f" [{issue.get('level', '?').upper()}] {issue.get('title', '?')[:80]}"
+                    f" (status={issue.get('status', '?')}, times_seen={issue.get('count', '?')})"
+                )
+                if issue.get("lastSeen"):
+                    lines.append(f" last_seen: {issue['lastSeen']}")
+            return "\n".join(lines)
+        except Exception as e:
+            logger.debug("Sentry context fetch failed: %s", e)
+            return "(Sentry unavailable)"
+
+    def _generate_logql_queries(
+        self,
+        packet_dicts: list[dict],
+        window_minutes: int,
+    ) -> list[str]:
+        """Produce up to 5 validated LogQL queries for extra evidence.
+
+        Seeds from each packet's suggested_logql (validated via _valid_logql),
+        then asks the fast model (deterministic, temperature=0.0) to generate
+        more. On LLM failure, falls back to the validated seeds alone.
+        """
+        # Seed with suggested_logql from evidence packets
+        seeded = [
+            p["suggested_logql"] for p in packet_dicts
+            if p.get("suggested_logql") and _valid_logql(p["suggested_logql"])
+        ]
+
+        if not packet_dicts:
+            return seeded[:5]
+
+        anomaly_descriptions = "\n".join(
+            f"- {p.get('anomaly_id', '?')}: {p.get('anomaly_description', '')[:80]}"
+            for p in packet_dicts[:5]
+        )
+        prompt = LOGQL_GEN_SYSTEM + "\n\n" + LOGQL_GEN_PROMPT.format(
+            anomaly_descriptions=anomaly_descriptions,
+            window_minutes=window_minutes,
+        )
+        try:
+            raw, _ = self.ollama.generate(
+                self.config.ollama_model_fast,
+                prompt,
+                think=False,
+                temperature=0.0,
+            )
+            # Only accept output that is literally a JSON array; anything else
+            # (prose, fenced code) is treated as no generated queries.
+            generated = json.loads(raw) if raw.strip().startswith("[") else []
+            if isinstance(generated, list):
+                combined = seeded + [q for q in generated if isinstance(q, str)]
+                return [q.strip() for q in combined if _valid_logql(q)][:5]
+        except Exception as e:
+            logger.debug("T2 LogQL gen failed: %s", e)
+
+        return [q for q in seeded if _valid_logql(q)][:5]
+
+    def _execute_logql_queries(
+        self, queries: list[str], start_ns: int, end_ns: int
+    ) -> dict[str, list[dict]]:
+        """Run each LogQL query against Loki (limit 50 lines each).
+
+        Returns {query: lines}; a failing query maps to [] so the prompt
+        builder can still show which queries were attempted.
+        """
+        results = {}
+        for query in queries:
+            try:
+                lines = self.loki.query_lines(query, start_ns, end_ns, limit=50)
+                results[query] = lines
+            except Exception as e:
+                logger.debug("T2 LogQL execute failed (%s): %s", query[:60], e)
+                results[query] = []
+        return results
+
+    def _push_investigation(
+        self, result: T2Result, packet_dicts: list[dict], end_ns: int
+    ) -> None:
+        """Push the investigation record to Loki; failures warn but never raise."""
+        try:
+            self.loki.push_t2_investigation(result, packet_dicts, env=self.config.env_label)
+        except Exception as e:
+            logger.warning("Failed to push T2 investigation to Loki: %s", e)
+
+ def _annotate_grafana(self, result: T2Result) -> None:
+ try:
+ severity_tag = "critical" if result.confidence == "high" and result.sentry_worthy else "investigation"
+ self.grafana.annotate_raw(
+ title=f"T2 Investigation [{result.confidence}]: {result.root_cause[:80]}",
+ text=(
+ f"Root cause: {result.root_cause}
"
+ f"Recommendation: {result.recommendation}
"
+ f"Type: {result.issue_type} | Packets: {result.evidence_packet_count} | "
+ f"Model: {result.model}"
+ ),
+ tags=["t2", result.issue_type, result.confidence, severity_tag],
+ )
+ except Exception as e:
+ logger.debug("T2 Grafana annotation failed: %s", e)
+
+    def _create_sentry_issue(
+        self, result: T2Result, packet_dicts: list[dict]
+    ) -> str | None:
+        """Create a Sentry behavioral issue for this investigation.
+
+        Uses the LLM-provided fingerprint when present, otherwise a synthetic
+        one built from issue_type and the first detector stream. Returns the
+        Sentry event_id, or None on failure (logged at warning, never raises).
+        """
+        try:
+            streams = list({p.get("detector_stream", "") for p in packet_dicts if p.get("detector_stream")})
+            fingerprint = result.sentry_fingerprint or f"t2.{result.issue_type}.{streams[0] if streams else 'unknown'}"
+            return self.sentry.capture_behavioral_finding(
+                title=result.root_cause[:120],
+                issue_type=result.issue_type,
+                recommendation=result.recommendation,
+                confidence=result.confidence,
+                fingerprint=fingerprint,
+                context={
+                    "root_cause": result.root_cause,
+                    "correlation": result.correlation,
+                    "impact": result.impact,
+                    "evidence_packet_count": result.evidence_packet_count,
+                    "model": result.model,
+                },
+            )
+        except Exception as e:
+            logger.warning("T2 Sentry issue creation failed: %s", e)
+            return None
diff --git a/observability/local/log-sentinel/t3_agent.py b/observability/local/log-sentinel/t3_agent.py
new file mode 100644
index 0000000..3cfbf09
--- /dev/null
+++ b/observability/local/log-sentinel/t3_agent.py
@@ -0,0 +1,329 @@
+"""T3 — Synthesis agent.
+
+Runs on a mode-dependent schedule (dev: 2h, prod: 4h) or on T2 critical escalation.
+Answers: "What was the user trying to do, and did it work?"
+
+What T3 does:
+ 1. Query Loki for T1 evidence packets + T2 investigations for the synthesis window
+ 2. Query Sentry for open issues + recent releases
+ 3. Build session narratives via NarrativeBuilder
+ 4. Run qwen3:32b /think for 7 synthesis passes (single LLM call)
+ 5. Update baselines.json via BaselineManager
+ 6. Emit sentinel_threshold_recommendation per drifted T0 threshold
+ 7. Push sentinel_synthesis + sentinel_narrative events to Loki
+
+Mode differences:
+ dev — 2h cadence, focus: Claude sessions, tool usage, code activity
+ prod — 4h cadence, focus: iRacing sessions, feature stability, user-facing errors
+"""
+
+import logging
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+
+from analyst import _parse_json
+from baseline import BaselineManager
+from circuit_breaker import CircuitBreaker
+from config import Config
+from grafana_client import GrafanaClient
+from loki_client import LokiClient
+from narrative import NarrativeBuilder
+from ollama_client import OllamaClient
+from prompts import T3_SYSTEM, T3_SYNTHESIS_PROMPT, build_stream_guide
+from sentry_client import SentryClient
+from trace import FeatureInvocation
+
+logger = logging.getLogger("sentinel.t3")
+
+# Lookbacks per mode for pulling Loki evidence
+_MODE_LOOKBACK = {
+ "dev": 2 * 3600,
+ "prod": 4 * 3600,
+}
+
+
+@dataclass
+class T3Result:
+    """Outcome of one T3 synthesis run over the mode-dependent window."""
+
+    period_summary: str  # LLM narrative of the whole window
+    sessions_analyzed: int  # falls back to len(session_narratives) if the LLM omits it
+    features_worked: list[str]
+    features_failed: list[str]
+    recurring_patterns: list[dict]
+    cost_summary: dict
+    regression_detected: bool
+    regression_detail: str
+    action_items: list[str]
+    baselines_updated: bool  # True when BaselineManager.compute_and_save succeeded
+    threshold_recommendations: list[dict]  # drifted T0 thresholds from BaselineManager
+    session_narratives: list[dict]  # list of {session_id, narrative_text, ...}
+    model: str  # deep model used for synthesis
+    inference_duration_ms: int
+    # Raw LLM response kept for debugging; excluded from repr.
+    raw_response: str = field(repr=False, default="")
+
+
+class T3Agent:
+    """Synthesis agent: periodically answers "what was the user trying to do,
+    and did it work?" from Loki evidence, Sentry history, and session
+    narratives, then updates baselines and publishes results."""
+
+    def __init__(
+        self,
+        ollama: OllamaClient,
+        loki: LokiClient,
+        grafana: GrafanaClient,
+        sentry: SentryClient,
+        baseline: BaselineManager,
+        breaker: CircuitBreaker,
+        config: Config,
+    ):
+        # Collaborators injected; only the NarrativeBuilder is owned here.
+        self.ollama = ollama
+        self.loki = loki
+        self.grafana = grafana
+        self.sentry = sentry
+        self.baseline = baseline
+        self.breaker = breaker  # guards the deep-model synthesis call
+        self.config = config
+        self.narrative_builder = NarrativeBuilder()
+        # Static prompt text; build once, reuse for every synthesis.
+        self._stream_guide = build_stream_guide()
+
+    def run(
+        self,
+        end_ns: int | None = None,
+        invocations: list[FeatureInvocation] | None = None,
+        lookback_sec: int | None = None,
+        trigger: str = "scheduled",
+    ) -> T3Result:
+        """
+        Run T3 synthesis.
+
+        invocations: if provided (e.g. from same-cycle T1 run), used for narratives.
+                     Otherwise T3 uses only Loki-stored invocation summaries.
+        trigger: "scheduled" | "t2_escalation"
+
+        Lookback defaults from the sentinel mode (_MODE_LOOKBACK; 7200s when
+        the mode is unknown). Always returns a T3Result; an inference failure
+        degrades the LLM-derived fields to empty defaults rather than raising.
+        """
+        if end_ns is None:
+            end_ns = self.loki.now_ns()
+        if lookback_sec is None:
+            lookback_sec = _MODE_LOOKBACK.get(self.config.sentinel_mode, 7200)
+        start_ns = end_ns - lookback_sec * 1_000_000_000
+
+        mode = self.config.sentinel_mode
+        window_description = _format_window(start_ns, end_ns, mode)
+
+        # Step 1: load evidence from Loki (T1 packets + T2 investigations)
+        evidence_packets = self._load_evidence_packets(start_ns, end_ns)
+        investigations = self._load_investigations(start_ns, end_ns)
+
+        # Step 2: Sentry context (open issues + recent releases, best-effort)
+        sentry_issues_text, sentry_releases_text = self._fetch_sentry_context()
+
+        # Step 3: build session narratives — only possible when the caller
+        # supplied live invocations (e.g. from the same-cycle T1 run).
+        session_narratives = []
+        if invocations:
+            all_anomalies = [ep for ep in evidence_packets]
+            session_narratives = self.narrative_builder.build_all(
+                invocations=invocations,
+                anomaly_dicts=all_anomalies,
+                t2_investigation_dicts=investigations,
+            )
+
+        narratives_text = _format_narratives_for_prompt(session_narratives)
+
+        # Step 4: T3 LLM synthesis with the deep model (/think)
+        system = T3_SYSTEM.format(stream_guide=self._stream_guide)
+        prompt = T3_SYNTHESIS_PROMPT.format(
+            window_description=window_description,
+            mode=mode,
+            evidence_summary=_format_evidence_summary(evidence_packets),
+            investigation_summary=_format_investigation_summary(investigations),
+            sentry_issues=sentry_issues_text,
+            recent_releases=sentry_releases_text,
+            session_narratives=narratives_text,
+        )
+
+        raw = ""
+        infer_ms = 0
+        try:
+            raw, infer_ms = self.ollama.generate(
+                self.config.ollama_model_deep,
+                system + "\n\n" + prompt,
+                think=True,
+            )
+            self.breaker.record_success()
+        except Exception as e:
+            self.breaker.record_failure()
+            logger.error("T3 inference failed: %s", e)
+
+        # On failure raw stays "" → parsed is {} → defaults below apply.
+        parsed = _parse_json(raw)
+
+        # Step 5: update baselines — independent of LLM success so baseline
+        # drift tracking continues even when inference is down.
+        baselines_updated = False
+        threshold_recs = []
+        try:
+            self.baseline.compute_and_save(lookback_sec=lookback_sec)
+            threshold_recs = self.baseline.get_threshold_recommendations()
+            baselines_updated = True
+            logger.info("T3: baselines updated, %d threshold recommendations", len(threshold_recs))
+        except Exception as e:
+            logger.warning("T3 baseline update failed: %s", e)
+
+        result = T3Result(
+            period_summary=parsed.get("period_summary", ""),
+            sessions_analyzed=int(parsed.get("sessions_analyzed", len(session_narratives))),
+            features_worked=parsed.get("features_worked", []),
+            features_failed=parsed.get("features_failed", []),
+            recurring_patterns=parsed.get("recurring_patterns", []),
+            cost_summary=parsed.get("cost_summary", {}),
+            regression_detected=bool(parsed.get("regression_detected", False)),
+            regression_detail=parsed.get("regression_detail", ""),
+            action_items=parsed.get("action_items", []),
+            baselines_updated=baselines_updated,
+            threshold_recommendations=threshold_recs,
+            session_narratives=session_narratives,
+            model=self.config.ollama_model_deep,
+            inference_duration_ms=infer_ms,
+            raw_response=raw,
+        )
+
+        # Step 6: push all outputs (Loki events + Grafana annotation)
+        self._push_outputs(result, end_ns, trigger)
+        self._annotate_grafana(result, trigger)
+
+        logger.info(
+            "T3 [%s/%s]: %d sessions, %d patterns, regression=%s, baselines=%s, %dms",
+            mode, trigger,
+            result.sessions_analyzed,
+            len(result.recurring_patterns),
+            result.regression_detected,
+            result.baselines_updated,
+            infer_ms,
+        )
+        return result
+
+ # ── Private ───────────────────────────────────────────────────────────────
+
+    def _load_evidence_packets(self, start_ns: int, end_ns: int) -> list[dict]:
+        """Load T1 evidence packets from Loki (limit 200); [] on failure."""
+        logql = '{app="sim-steward", event="sentinel_evidence_packet"}'
+        try:
+            return self.loki.query_lines(logql, start_ns, end_ns, limit=200)
+        except Exception as e:
+            logger.warning("T3 evidence packet load failed: %s", e)
+            return []
+
+ def _load_investigations(self, start_ns: int, end_ns: int) -> list[dict]:
+ logql = '{app="sim-steward", event="sentinel_t2_investigation"}'
+ try:
+ return self.loki.query_lines(logql, start_ns, end_ns, limit=50)
+ except Exception as e:
+ logger.warning("T3 investigation load failed: %s", e)
+ return []
+
+ def _fetch_sentry_context(self) -> tuple[str, str]:
+ issues_text = "(Sentry unavailable)"
+ releases_text = "(no release data)"
+ try:
+ issues = self.sentry.search_issues(query="is:unresolved", limit=20)
+ if issues:
+ lines = [
+ f" [{i.get('level', '?').upper()}] {i.get('title', '?')[:80]}"
+ f" (times_seen={i.get('count', '?')}, last={i.get('lastSeen', '?')[:10]})"
+ for i in issues
+ ]
+ issues_text = "\n".join(lines)
+ else:
+ issues_text = "(no open Sentry issues)"
+ except Exception as e:
+ logger.debug("T3 Sentry issues fetch failed: %s", e)
+
+ try:
+ releases = self.sentry.find_releases(limit=5)
+ if releases:
+ lines = [
+ f" {r.get('version', '?')} released {r.get('dateCreated', '?')[:10]}"
+ for r in releases
+ ]
+ releases_text = "\n".join(lines)
+ else:
+ releases_text = "(no releases found)"
+ except Exception as e:
+ logger.debug("T3 Sentry releases fetch failed: %s", e)
+
+ return issues_text, releases_text
+
+ def _push_outputs(self, result: T3Result, end_ns: int, trigger: str) -> None:
+ # Push synthesis summary
+ try:
+ self.loki.push_synthesis(result, trigger=trigger, env=self.config.env_label)
+ except Exception as e:
+ logger.warning("T3: failed to push synthesis to Loki: %s", e)
+
+ # Push per-session narratives
+ for narrative in result.session_narratives:
+ try:
+ self.loki.push_narrative(narrative, env=self.config.env_label)
+ except Exception as e:
+ logger.debug("T3: failed to push narrative for %s: %s", narrative.get("session_id"), e)
+
+ # Push threshold recommendations
+ for rec in result.threshold_recommendations:
+ try:
+ self.loki.push_threshold_recommendation(rec, env=self.config.env_label)
+ except Exception as e:
+ logger.debug("T3: failed to push threshold rec: %s", e)
+
+ def _annotate_grafana(self, result: T3Result, trigger: str) -> None:
+ try:
+ regression_note = f" ⚠️ Regression: {result.regression_detail[:60]}" if result.regression_detected else ""
+ self.grafana.annotate_raw(
+ title=f"T3 Synthesis [{self.config.sentinel_mode}]: {result.sessions_analyzed} sessions",
+ text=(
+                    f"{result.period_summary[:200]}{regression_note}\n"
+ f"Patterns: {len(result.recurring_patterns)} | "
+ f"Baselines updated: {result.baselines_updated} | "
+ f"Trigger: {trigger}"
+ ),
+ tags=["t3", "synthesis", self.config.sentinel_mode, trigger],
+ )
+ except Exception as e:
+ logger.debug("T3 Grafana annotation failed: %s", e)
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def _format_window(start_ns: int, end_ns: int, mode: str) -> str:
+ start_dt = datetime.fromtimestamp(start_ns / 1e9, tz=timezone.utc)
+ end_dt = datetime.fromtimestamp(end_ns / 1e9, tz=timezone.utc)
+    return (
+        f"{start_dt.strftime('%Y-%m-%d %H:%M')} – {end_dt.strftime('%H:%M')} UTC "
+        f"({int((end_ns - start_ns) / 3.6e12)}h window, mode={mode})"
+    )
+
+
+def _format_evidence_summary(packets: list[dict]) -> str:
+ if not packets:
+ return " (none)"
+ lines = []
+ for p in packets[:20]:
+ lines.append(
+ f" [{p.get('severity', '?').upper()}] {p.get('anomaly_description', '')[:80]}"
+ )
+ if len(packets) > 20:
+ lines.append(f" [... {len(packets) - 20} more]")
+ return "\n".join(lines)
+
+
+def _format_investigation_summary(investigations: list[dict]) -> str:
+ if not investigations:
+ return " (none)"
+ lines = []
+ for inv in investigations[:10]:
+ lines.append(
+ f" [{inv.get('confidence', '?')}] {inv.get('root_cause', '')[:80]}"
+ f" (type={inv.get('issue_type', '?')})"
+ )
+ return "\n".join(lines)
+
+
+def _format_narratives_for_prompt(session_narratives: list[dict]) -> str:
+ if not session_narratives:
+ return " (no session narratives available — no invocations this window)"
+ parts = []
+ for n in session_narratives[:10]:
+ parts.append(n.get("narrative_text", "")[:600])
+ return "\n\n".join(parts)
diff --git a/observability/local/log-sentinel/tests/__init__.py b/observability/local/log-sentinel/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/observability/local/log-sentinel/timeline.py b/observability/local/log-sentinel/timeline.py
new file mode 100644
index 0000000..649961e
--- /dev/null
+++ b/observability/local/log-sentinel/timeline.py
@@ -0,0 +1,200 @@
+"""Cross-stream timeline builder — correlates events from all Loki streams."""
+
+import logging
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+
+from loki_client import LokiClient
+from circuit_breaker import CircuitBreaker
+
+logger = logging.getLogger("sentinel.timeline")
+
+# Streams to query and their display names
+STREAMS = [
+ ("sim-steward", '{app="sim-steward"} | json'),
+ ("claude-dev-logging", '{app="claude-dev-logging"} | json'),
+ ("claude-token-metrics", '{app="claude-token-metrics"} | json'),
+]
+
+# Events to exclude from the timeline (too noisy)
+_SKIP_EVENTS = {"sentinel_log", "sentinel_cycle", "sentinel_analyst_run", "sentinel_timeline_built"}
+
+# Temporal correlation window (nanoseconds)
+# NOTE(review): unused in this module — temporal grouping lives in trace.py,
+# which defines its own (150ms) window. Consider removing to avoid confusion.
+_TEMPORAL_WINDOW_NS = 30 * 1_000_000_000
+
+
+@dataclass
+class TimelineEvent:
+ ts_ns: int
+ ts_iso: str
+ stream: str
+ event_type: str
+ domain: str
+ component: str
+ message: str
+ session_id: str | None
+ subsession_id: str | None
+ raw: dict = field(repr=False)
+
+
+class TimelineBuilder:
+ def __init__(self, loki: LokiClient, breaker: CircuitBreaker):
+ self.loki = loki
+ self.breaker = breaker
+
+ def build(
+ self,
+ start_ns: int,
+ end_ns: int,
+ limit_per_stream: int = 200,
+ ) -> list[TimelineEvent]:
+ """Query all streams, merge and sort chronologically."""
+ if not self.breaker.allow_request():
+ logger.warning("Timeline build skipped: Loki circuit open")
+ return []
+
+ all_events: list[TimelineEvent] = []
+ try:
+ for stream_name, logql in STREAMS:
+ lines = self.loki.query_lines(logql, start_ns, end_ns, limit=limit_per_stream)
+ self.breaker.record_success()
+ for line in lines:
+ ev = self._parse_event(stream_name, line)
+ if ev:
+ all_events.append(ev)
+        except Exception as e:
+            self.breaker.record_failure()
+            logger.error("Timeline build error: %s", e)
+            # Still honor the "sorted chronologically" contract for partial results.
+            all_events.sort(key=lambda ev: ev.ts_ns)
+            return all_events
+
+ all_events.sort(key=lambda e: e.ts_ns)
+ return all_events
+
+ def _parse_event(self, stream: str, line: dict) -> TimelineEvent | None:
+ event_type = line.get("event", "")
+ if event_type in _SKIP_EVENTS:
+ return None
+
+ # Parse timestamp — prefer the log's own timestamp field, fallback to now
+ ts_ns = 0
+ ts_iso = line.get("timestamp", "")
+ if ts_iso:
+ try:
+ dt = datetime.fromisoformat(ts_iso.replace("Z", "+00:00"))
+ ts_ns = int(dt.timestamp() * 1e9)
+ except (ValueError, TypeError):
+ pass
+ if not ts_ns:
+ ts_ns = self.loki.now_ns()
+ ts_iso = datetime.now(timezone.utc).isoformat()
+
+ return TimelineEvent(
+ ts_ns=ts_ns,
+ ts_iso=ts_iso,
+ stream=stream,
+ event_type=event_type or "unknown",
+ domain=line.get("domain", ""),
+ component=line.get("component", ""),
+ message=line.get("message", ""),
+ session_id=line.get("session_id") or None,
+ subsession_id=line.get("subsession_id") or None,
+ raw=line,
+ )
+
+ def get_active_sessions(self, events: list[TimelineEvent]) -> list[str]:
+ """Return distinct session_ids seen in the event list."""
+ seen = []
+ for ev in events:
+ if ev.session_id and ev.session_id not in seen:
+ seen.append(ev.session_id)
+ return seen
+
+ def to_prompt_text(self, events: list[TimelineEvent], max_events: int = 60) -> str:
+ """Format timeline as human-readable numbered lines for LLM consumption."""
+ if not events:
+ return "(no events in this window)"
+
+ truncated = len(events) > max_events
+ shown = events[-max_events:] if truncated else events
+
+ # Group by session_id
+ sessions: dict[str, list[TimelineEvent]] = {}
+ no_session: list[TimelineEvent] = []
+
+ for ev in shown:
+ if ev.session_id:
+ sessions.setdefault(ev.session_id, []).append(ev)
+ else:
+ no_session.append(ev)
+
+ lines = []
+ counter = 1
+
+ for sid, evts in sessions.items():
+ # Find subsession if present
+ sub = next((e.subsession_id for e in evts if e.subsession_id), None)
+ header = f"SESSION {sid[:8]}"
+ if sub:
+ header += f" [subsession {sub}]"
+ lines.append(header)
+ for ev in evts:
+ lines.append(_format_event_line(counter, ev))
+ counter += 1
+ lines.append("")
+
+ if no_session:
+ lines.append("CO-OCCURRING (no session correlation)")
+ for ev in no_session:
+ lines.append(_format_event_line(counter, ev))
+ counter += 1
+
+ if truncated:
+ lines.append(
+ f"\n[NOTE: {len(events) - max_events} earlier events not shown. "
+ f"Earliest: {events[0].ts_iso}, Latest: {events[-1].ts_iso}]"
+ )
+
+ return "\n".join(lines)
+
+ def get_stats(self, events: list[TimelineEvent]) -> dict:
+ sessions = self.get_active_sessions(events)
+ streams = list({e.stream for e in events})
+ return {
+ "event_count": len(events),
+ "session_count": len(sessions),
+ "streams_queried": streams,
+ }
+
+
+def _format_event_line(idx: int, ev: TimelineEvent) -> str:
+ # Extract time portion only (HH:MM:SS)
+ try:
+ t = ev.ts_iso[11:19]
+ except (IndexError, TypeError):
+ t = "??:??:??"
+
+ # Pick the most informative extra field from raw
+ extra = _pick_extra(ev)
+ extra_str = f" {extra}" if extra else ""
+
+ return (
+ f" [{idx:03d}] {t} {ev.stream:<25} {ev.event_type:<30}{extra_str}"
+ )
+
+
+def _pick_extra(ev: TimelineEvent) -> str:
+ """Extract a short key=value summary from the raw event for the timeline."""
+ raw = ev.raw
+ candidates = [
+ ("action", raw.get("action")),
+ ("tool", raw.get("tool_name")),
+ ("event_type", raw.get("hook_type")),
+ ("track", raw.get("track_display_name")),
+ ("driver", raw.get("display_name")),
+ ("cost_usd", raw.get("cost_usd")),
+ ("tokens", raw.get("total_tokens")),
+ ("error", raw.get("error")),
+ ("duration_ms", raw.get("duration_ms")),
+ ]
+ parts = [f"{k}={v}" for k, v in candidates if v is not None and v != ""]
+ return " ".join(parts[:3])
diff --git a/observability/local/log-sentinel/trace.py b/observability/local/log-sentinel/trace.py
new file mode 100644
index 0000000..5d336a4
--- /dev/null
+++ b/observability/local/log-sentinel/trace.py
@@ -0,0 +1,225 @@
+"""Feature invocation model — groups timeline events into traceable user actions.
+
+Three correlation strategies (applied in order):
+ 1. trace_id exact — events share a trace_id field (plugin + dashboard instrumented)
+    2. temporal   — events cluster within 150ms, splitting at anchor events
+ 3. inferred — fallback: group by session_id + 1-minute time bucket
+"""
+
+import logging
+import uuid
+from dataclasses import dataclass, field
+
+from timeline import TimelineEvent
+
+logger = logging.getLogger("sentinel.trace")
+
+# Temporal grouping window (nanoseconds)
+_TEMPORAL_WINDOW_NS = 150_000_000 # 150ms
+
+# Events that anchor the start of a new invocation in temporal mode
+_ANCHOR_EVENTS = {
+ "dashboard_ui_event",
+ "action_dispatched",
+ "iracing_session_start",
+ "iracing_replay_seek",
+}
+
+# Events that signal the end of an invocation
+_TERMINAL_EVENTS = {
+ "action_result",
+ "iracing_session_end",
+}
+
+# Inferred grouping bucket (nanoseconds)
+_BUCKET_NS = 60 * 1_000_000_000 # 1 minute
+
+
+@dataclass
+class FeatureInvocation:
+ invocation_id: str # trace_id if available, else generated UUID
+ correlation_method: str # "trace_id" | "temporal" | "inferred"
+ start_ts_ns: int
+ end_ts_ns: int
+ action_type: str # "replay_seek" | "incident_review" | "session_start" | etc.
+ trigger_event: TimelineEvent # first event in this invocation
+ events: list[TimelineEvent] # all events belonging to this invocation
+ success: bool | None # did the feature complete? None = unknown
+ error: str | None # error message if failed
+ duration_ms: int
+ streams_involved: list[str] # which Loki streams contributed events
+
+ def to_summary_dict(self) -> dict:
+ """Compact serializable summary for Loki push and LLM context."""
+ return {
+ "invocation_id": self.invocation_id,
+ "correlation_method": self.correlation_method,
+ "action_type": self.action_type,
+ "success": self.success,
+ "error": self.error,
+ "duration_ms": self.duration_ms,
+ "event_count": len(self.events),
+ "streams": self.streams_involved,
+ "start_ts_ns": self.start_ts_ns,
+ "end_ts_ns": self.end_ts_ns,
+ }
+
+
+class InvocationBuilder:
+ """Groups a flat list of TimelineEvents into FeatureInvocation objects."""
+
+ def build(self, events: list[TimelineEvent]) -> list[FeatureInvocation]:
+ """
+ Returns invocations built from the event list.
+ Events are consumed across three passes; any event can only belong to one invocation.
+ """
+ remaining = list(events)
+ invocations: list[FeatureInvocation] = []
+
+ # Pass 1 — exact trace_id grouping
+ trace_invocations, remaining = self._group_by_trace_id(remaining)
+ invocations.extend(trace_invocations)
+
+ # Pass 2 — temporal window grouping
+ temporal_invocations, remaining = self._group_temporal(remaining)
+ invocations.extend(temporal_invocations)
+
+ # Pass 3 — inferred (session + time bucket)
+ inferred_invocations = self._group_inferred(remaining)
+ invocations.extend(inferred_invocations)
+
+ logger.debug(
+ "InvocationBuilder: %d events → %d invocations (%d trace_id, %d temporal, %d inferred)",
+ len(events),
+ len(invocations),
+ len(trace_invocations),
+ len(temporal_invocations),
+ len(inferred_invocations),
+ )
+ return sorted(invocations, key=lambda i: i.start_ts_ns)
+
+ # ── Pass 1: exact trace_id ─────────────────────────────────────────────
+
+ def _group_by_trace_id(
+ self, events: list[TimelineEvent]
+ ) -> tuple[list[FeatureInvocation], list[TimelineEvent]]:
+ groups: dict[str, list[TimelineEvent]] = {}
+ leftover: list[TimelineEvent] = []
+
+ for ev in events:
+ tid = ev.raw.get("trace_id")
+ if tid:
+ groups.setdefault(tid, []).append(ev)
+ else:
+ leftover.append(ev)
+
+ invocations = [
+ _build_invocation(group, "trace_id", trace_id=tid)
+ for tid, group in groups.items()
+ ]
+ return invocations, leftover
+
+ # ── Pass 2: temporal window ────────────────────────────────────────────
+
+ def _group_temporal(
+ self, events: list[TimelineEvent]
+ ) -> tuple[list[FeatureInvocation], list[TimelineEvent]]:
+ if not events:
+ return [], []
+
+ sorted_events = sorted(events, key=lambda e: e.ts_ns)
+ groups: list[list[TimelineEvent]] = []
+ current: list[TimelineEvent] = []
+
+ for ev in sorted_events:
+ if not current:
+ current = [ev]
+ continue
+
+ gap = ev.ts_ns - current[-1].ts_ns
+ is_anchor = ev.event_type in _ANCHOR_EVENTS
+
+ if is_anchor or gap > _TEMPORAL_WINDOW_NS:
+ if current:
+ groups.append(current)
+ current = [ev]
+ else:
+ current.append(ev)
+
+ if current:
+ groups.append(current)
+
+        # Drop single-event groups with no action signal — too noisy.
+        # Membership is tracked by object identity: list-equality (`in`) would be
+        # O(n²) and could conflate two value-equal groups.
+        meaningful = [g for g in groups if len(g) > 1 or g[0].event_type in _ANCHOR_EVENTS]
+        kept_ids = {id(g) for g in meaningful}
+        leftover = [ev for g in groups if id(g) not in kept_ids for ev in g]
+
+ invocations = [_build_invocation(g, "temporal") for g in meaningful]
+ return invocations, leftover
+
+ # ── Pass 3: inferred (session + time bucket) ───────────────────────────
+
+ def _group_inferred(self, events: list[TimelineEvent]) -> list[FeatureInvocation]:
+ if not events:
+ return []
+
+ buckets: dict[str, list[TimelineEvent]] = {}
+ for ev in events:
+ sid = ev.session_id or "no_session"
+ bucket = ev.ts_ns // _BUCKET_NS
+ key = f"{sid}:{bucket}"
+ buckets.setdefault(key, []).append(ev)
+
+ return [_build_invocation(group, "inferred") for group in buckets.values()]
+
+
+# ── Helpers ────────────────────────────────────────────────────────────────
+
+def _build_invocation(
+ events: list[TimelineEvent],
+ method: str,
+ trace_id: str | None = None,
+) -> FeatureInvocation:
+ sorted_events = sorted(events, key=lambda e: e.ts_ns)
+ start_ns = sorted_events[0].ts_ns
+ end_ns = sorted_events[-1].ts_ns
+ duration_ms = max(0, (end_ns - start_ns) // 1_000_000)
+
+ # action_type: prefer action_dispatched.raw["action"], else trigger event_type
+ action_type = "unknown"
+ for ev in sorted_events:
+ if ev.event_type == "action_dispatched":
+ action_type = ev.raw.get("action") or ev.event_type
+ break
+ if action_type == "unknown":
+ action_type = sorted_events[0].event_type or "unknown"
+
+ # success / error: look for terminal events
+ success: bool | None = None
+ error: str | None = None
+ for ev in sorted_events:
+ if ev.event_type in _TERMINAL_EVENTS or ev.event_type.endswith("_result"):
+ raw_success = ev.raw.get("success")
+ raw_error = ev.raw.get("error")
+ if raw_error:
+ success = False
+ error = str(raw_error)[:200]
+ break
+ if raw_success is not None:
+ success = bool(raw_success)
+ break
+
+ streams = list({ev.stream for ev in sorted_events})
+
+ return FeatureInvocation(
+ invocation_id=trace_id or str(uuid.uuid4()),
+ correlation_method=method,
+ start_ts_ns=start_ns,
+ end_ts_ns=end_ns,
+ action_type=action_type,
+ trigger_event=sorted_events[0],
+ events=sorted_events,
+ success=success,
+ error=error,
+ duration_ms=duration_ms,
+ streams_involved=streams,
+ )
diff --git a/observability/local/logs/claude-session-metrics.jsonl b/observability/local/logs/claude-session-metrics.jsonl
new file mode 100644
index 0000000..031585b
--- /dev/null
+++ b/observability/local/logs/claude-session-metrics.jsonl
@@ -0,0 +1 @@
+{"event":"claude_session_metrics","session_id":"94406f61-7d81-49e0-8b78-cea2011dff2e","project":"local","machine":"WIN-PC","env":"local","timestamp":"2026-03-26T21:22:07.242Z","total_input_tokens":669,"total_output_tokens":142861,"total_cache_creation_tokens":1867125,"total_cache_read_tokens":86930799,"total_tokens":143530,"assistant_turns":422,"tool_use_count":0,"model":"claude-opus-4-6","effort":"med","thinking":true,"cost_usd":176.1294}
diff --git a/src/SimSteward.Plugin/SimStewardPlugin.DataCaptureSuite.cs b/src/SimSteward.Plugin/SimStewardPlugin.DataCaptureSuite.cs
index 1175f9c..5255eaf 100644
--- a/src/SimSteward.Plugin/SimStewardPlugin.DataCaptureSuite.cs
+++ b/src/SimSteward.Plugin/SimStewardPlugin.DataCaptureSuite.cs
@@ -419,12 +419,13 @@ private void TickPreflight()
return;
}
- // Seek to near-end of replay for L2
- int seekTarget = Math.Max(0, _replayFrameTotal - 10);
+ // Seek to end of replay using ReplaySearch(ToEnd) — more reliable than
+ // frame-based seek (ReplayFrameNumEnd can be 0 or stale, which would
+ // seek to frame 0 and read SessionState at replay start instead of end).
+ _preflightSettleTicks = 0;
try
{
- _irsdk.ReplaySetPlaySpeed(1, false);
- _irsdk.ReplaySetPlayPosition(IRacingSdkEnum.RpyPosMode.Begin, seekTarget);
+ _irsdk.ReplaySearch(IRacingSdkEnum.RpySrchMode.ToEnd);
}
catch (Exception ex)
{
@@ -444,8 +445,10 @@ private void TickPreflight()
{
_preflightSettleTicks++;
int frame = SafeGetInt("ReplayFrameNum");
- int seekTarget = Math.Max(0, _replayFrameTotal - 10);
- if (Math.Abs(frame - seekTarget) <= 30 || _preflightSettleTicks > 300)
+            // ReplaySearch(ToEnd) is fire-and-forget; we don't have an exact target frame.
+            // Settle when: near ReplayFrameNumEnd (if valid) OR after 60 ticks (1s min wait).
+            // (The ">= 60" fallback always fires by tick 60, so no separate timeout clause is needed.)
+            bool nearEnd = _replayFrameTotal > 0 && frame >= _replayFrameTotal - 60;
+            if (nearEnd || _preflightSettleTicks >= 60)
{
int sessionState = 0;
try { sessionState = _irsdk.Data.GetInt("SessionState"); } catch { }
diff --git a/token-cost-dashboard.png b/token-cost-dashboard.png
new file mode 100644
index 0000000..742ae82
Binary files /dev/null and b/token-cost-dashboard.png differ