Skip to content

Commit c856279

Browse files
committed
feat: phase 6 — prometheus metrics, grafana dashboard, alert rules
1 parent c65f832 commit c856279

10 files changed

Lines changed: 280 additions & 0 deletions

backend/app/main.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from fastapi.middleware.cors import CORSMiddleware
33
from contextlib import asynccontextmanager
44
import httpx
5+
from prometheus_fastapi_instrumentator import Instrumentator
56

67
from app.routers import prices, coins
78
from app.services.coingecko import CoinGeckoService
@@ -33,6 +34,9 @@ async def lifespan(app: FastAPI):
3334
app.include_router(prices.router, prefix="/api/prices", tags=["prices"])
3435
app.include_router(coins.router, prefix="/api/coins", tags=["coins"])
3536

37+
# Expose /metrics endpoint for Prometheus scraping
38+
Instrumentator().instrument(app).expose(app)
39+
3640

3741
@app.get("/health")
3842
async def health_check():

backend/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ uvicorn[standard]==0.30.1
33
httpx==0.27.0
44
pydantic==2.7.1
55
python-dotenv==1.0.1
6+
prometheus-fastapi-instrumentator==6.1.0
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cryptoscope-alerts
  namespace: cryptoscope
  labels:
    # Must match the ruleSelector of the kube-prometheus-stack release
    # or Prometheus will ignore this rule file.
    release: kube-prometheus-stack
spec:
  groups:
    - name: cryptoscope.availability
      interval: 30s
      rules:
        # Fire if no healthy backend pods for 2 minutes
        - alert: BackendDown
          expr: |
            count(kube_pod_status_ready{namespace="cryptoscope",pod=~"backend-.*",condition="true"}) == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "CryptoScope backend has no healthy pods"
            description: "All backend pods have been unavailable for more than 2 minutes."

        # Fire if error rate exceeds 5%
        # (/, * are left-associative in PromQL: (errors / total) * 100 > 5)
        - alert: HighErrorRate
          expr: |
            sum(rate(http_requests_total{namespace="cryptoscope",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{namespace="cryptoscope"}[5m])) * 100 > 5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High error rate on CryptoScope API"
            description: "Error rate is {{ $value | printf \"%.2f\" }}% over the last 5 minutes."

        # Fire if p95 latency exceeds 2 seconds
        - alert: HighLatency
          expr: |
            histogram_quantile(0.95,
              sum(rate(http_request_duration_seconds_bucket{namespace="cryptoscope"}[5m])) by (le)
            ) > 2
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High API latency on CryptoScope"
            description: "p95 latency is {{ $value | printf \"%.2f\" }}s — CoinGecko may be slow."

    - name: cryptoscope.resources
      interval: 60s
      rules:
        # Fire if a pod is restarting frequently
        - alert: PodCrashLooping
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="cryptoscope"}[15m]) > 3
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.pod }} is crash looping"
            # increase() extrapolates over the range and routinely yields
            # fractional values; round so the notification reads cleanly
            # (consistent with the printf formatting used by the other alerts).
            description: "Pod has restarted {{ $value | printf \"%.0f\" }} times in the last 15 minutes."

        # Fire if HPA is at maximum replicas (capacity pressure)
        - alert: HPAAtMaxReplicas
          expr: |
            kube_horizontalpodautoscaler_status_current_replicas{namespace="cryptoscope"}
            ==
            kube_horizontalpodautoscaler_spec_max_replicas{namespace="cryptoscope"}
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "HPA {{ $labels.horizontalpodautoscaler }} is at max replicas"
            description: "Consider increasing max replicas or node capacity."
96 Bytes
Binary file not shown.
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: cryptoscope-dashboard
  # Must live in the namespace where Grafana runs so the chart's
  # dashboardsConfigMaps reference can mount it.
  namespace: monitoring
  labels:
    # Marker label used by Grafana dashboard discovery.
    grafana_dashboard: "1"
data:
  # Dashboard JSON embedded via a literal block scalar; panels use the
  # default datasource. Queries assume the backend's /metrics endpoint is
  # scraped into Prometheus (see the ServiceMonitor) — confirm metric names
  # against the instrumentator's exported metrics.
  cryptoscope.json: |
    {
      "title": "CryptoScope",
      "uid": "cryptoscope-main",
      "timezone": "browser",
      "schemaVersion": 38,
      "refresh": "30s",
      "panels": [
        {
          "id": 1,
          "title": "Request rate (req/s)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "targets": [{
            "expr": "sum(rate(http_requests_total{namespace=\"cryptoscope\"}[1m]))",
            "legendFormat": "requests/sec"
          }]
        },
        {
          "id": 2,
          "title": "Request latency p95 (ms)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "targets": [{
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{namespace=\"cryptoscope\"}[5m])) by (le)) * 1000",
            "legendFormat": "p95 latency"
          }]
        },
        {
          "id": 3,
          "title": "Error rate (%)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
          "targets": [{
            "expr": "sum(rate(http_requests_total{namespace=\"cryptoscope\",status=~\"5..\"}[1m])) / sum(rate(http_requests_total{namespace=\"cryptoscope\"}[1m])) * 100",
            "legendFormat": "error %"
          }]
        },
        {
          "id": 4,
          "title": "Pod count",
          "type": "stat",
          "gridPos": {"h": 8, "w": 6, "x": 12, "y": 8},
          "targets": [{
            "expr": "count(kube_pod_status_ready{namespace=\"cryptoscope\",condition=\"true\"})",
            "legendFormat": "ready pods"
          }]
        },
        {
          "id": 5,
          "title": "Memory usage (MB)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 6, "x": 18, "y": 8},
          "targets": [{
            "expr": "sum(container_memory_working_set_bytes{namespace=\"cryptoscope\",container!=\"\"}) by (pod) / 1024 / 1024",
            "legendFormat": "{{pod}}"
          }]
        },
        {
          "id": 6,
          "title": "CPU usage (cores)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
          "targets": [{
            "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"cryptoscope\",container!=\"\"}[1m])) by (pod)",
            "legendFormat": "{{pod}}"
          }]
        },
        {
          "id": 7,
          "title": "HPA replica count",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
          "targets": [
            {
              "expr": "kube_horizontalpodautoscaler_status_current_replicas{namespace=\"cryptoscope\"}",
              "legendFormat": "{{horizontalpodautoscaler}} current"
            },
            {
              "expr": "kube_horizontalpodautoscaler_spec_max_replicas{namespace=\"cryptoscope\"}",
              "legendFormat": "{{horizontalpodautoscaler}} max"
            }
          ]
        }
      ]
    }
96 Bytes
Binary file not shown.

monitoring/prometheus-values.yaml

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
---
# monitoring/prometheus-values.yaml
# Helm values for kube-prometheus-stack
# Tuned for t3.small nodes — minimal resource usage

# NOTE: 'prometheusOperator' must appear exactly once at the top level.
# The original file declared it twice (webhooks here, resources near the
# bottom); duplicate keys are invalid YAML and last-wins parsers would
# silently drop the webhook settings. Both halves are merged here.
prometheusOperator:
  admissionWebhooks:
    enabled: false
  tls:
    enabled: false
  # Reduce resource usage on small nodes
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

grafana:
  adminPassword: "cryptoscope-admin" # change this in production
  persistence:
    enabled: false # no PVC needed for learning
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 300m
      memory: 256Mi

  # Auto-load our custom dashboard
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: cryptoscope
          orgId: 1
          folder: CryptoScope
          type: file
          disableDeletion: false
          options:
            path: /var/lib/grafana/dashboards/cryptoscope

  dashboardsConfigMaps:
    cryptoscope: "cryptoscope-dashboard"

  grafana.ini:
    analytics:
      check_for_updates: false

prometheus:
  prometheusSpec:
    retention: 7d
    resources:
      requests:
        cpu: 200m
        memory: 512Mi
      limits:
        cpu: 500m
        memory: 1Gi

    # Scrape ALL namespaces — picks up our FastAPI /metrics endpoint.
    # The *NilUsesHelmValues flags are required: the chart treats an empty
    # ({}) selector as unset and would otherwise substitute a
    # release-label selector, so "{}" alone does not mean "match all".
    podMonitorSelectorNilUsesHelmValues: false
    podMonitorNamespaceSelector: {}
    podMonitorSelector: {}
    serviceMonitorSelectorNilUsesHelmValues: false
    serviceMonitorNamespaceSelector: {}
    serviceMonitorSelector: {}
    ruleSelectorNilUsesHelmValues: false
    ruleNamespaceSelector: {}

alertmanager:
  alertmanagerSpec:
    resources:
      requests:
        cpu: 50m
        memory: 64Mi
      limits:
        cpu: 100m
        memory: 128Mi

kube-state-metrics:
  resources:
    requests:
      cpu: 50m
      memory: 64Mi

nodeExporter:
  resources:
    requests:
      cpu: 50m
      memory: 32Mi
96 Bytes
Binary file not shown.

monitoring/service-monitor.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: backend-monitor
  # Same namespace as the backend Service it selects.
  namespace: cryptoscope
  labels:
    release: kube-prometheus-stack # must match the Helm release name
spec:
  # Selects the backend Service by its labels (not the pods directly).
  selector:
    matchLabels:
      app: backend
  endpoints:
    # NOTE(review): "port" refers to a *named* port on the Service —
    # the backend Service must declare a port named "http"; confirm
    # against the Service manifest.
    - port: http
      path: /metrics
      interval: 15s
96 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)