Skip to content

Commit c856279

Browse files
committed
feat: phase 6 — prometheus metrics, grafana dashboard, alert rules
1 parent c65f832 commit c856279

10 files changed

Lines changed: 280 additions & 0 deletions

backend/app/main.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from fastapi.middleware.cors import CORSMiddleware
33
from contextlib import asynccontextmanager
44
import httpx
5+
from prometheus_fastapi_instrumentator import Instrumentator
56

67
from app.routers import prices, coins
78
from app.services.coingecko import CoinGeckoService
@@ -33,6 +34,9 @@ async def lifespan(app: FastAPI):
3334
app.include_router(prices.router, prefix="/api/prices", tags=["prices"])
3435
app.include_router(coins.router, prefix="/api/coins", tags=["coins"])
3536

37+
# Expose /metrics endpoint for Prometheus scraping
38+
Instrumentator().instrument(app).expose(app)
39+
3640

3741
@app.get("/health")
3842
async def health_check():

backend/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ uvicorn[standard]==0.30.1
33
httpx==0.27.0
44
pydantic==2.7.1
55
python-dotenv==1.0.1
6+
prometheus-fastapi-instrumentator==6.1.0
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cryptoscope-alerts
  namespace: cryptoscope
  labels:
    # Must match the ruleSelector of the kube-prometheus-stack release
    # or Prometheus will ignore this rule file.
    release: kube-prometheus-stack
spec:
  groups:
    - name: cryptoscope.availability
      interval: 30s
      rules:
        # Fire if no healthy backend pods for 2 minutes
        - alert: BackendDown
          expr: |
            count(kube_pod_status_ready{namespace="cryptoscope",pod=~"backend-.*",condition="true"}) == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "CryptoScope backend has no healthy pods"
            description: "All backend pods have been unavailable for more than 2 minutes."

        # Fire if error rate exceeds 5%
        # (/, * are left-associative in PromQL: (errors / total) * 100 > 5)
        - alert: HighErrorRate
          expr: |
            sum(rate(http_requests_total{namespace="cryptoscope",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{namespace="cryptoscope"}[5m])) * 100 > 5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High error rate on CryptoScope API"
            description: "Error rate is {{ $value | printf \"%.2f\" }}% over the last 5 minutes."

        # Fire if p95 latency exceeds 2 seconds
        - alert: HighLatency
          expr: |
            histogram_quantile(0.95,
              sum(rate(http_request_duration_seconds_bucket{namespace="cryptoscope"}[5m])) by (le)
            ) > 2
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High API latency on CryptoScope"
            description: "p95 latency is {{ $value | printf \"%.2f\" }}s — CoinGecko may be slow."

    - name: cryptoscope.resources
      interval: 60s
      rules:
        # Fire if a pod is restarting frequently
        - alert: PodCrashLooping
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="cryptoscope"}[15m]) > 3
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.pod }} is crash looping"
            # increase() extrapolates over the range and routinely yields
            # fractional values; round so the notification reads cleanly
            # (consistent with the printf formatting used by the other alerts).
            description: "Pod has restarted {{ $value | printf \"%.0f\" }} times in the last 15 minutes."

        # Fire if HPA is at maximum replicas (capacity pressure)
        - alert: HPAAtMaxReplicas
          expr: |
            kube_horizontalpodautoscaler_status_current_replicas{namespace="cryptoscope"}
            ==
            kube_horizontalpodautoscaler_spec_max_replicas{namespace="cryptoscope"}
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "HPA {{ $labels.horizontalpodautoscaler }} is at max replicas"
            description: "Consider increasing max replicas or node capacity."
96 Bytes
Binary file not shown.
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: cryptoscope-dashboard
  # Must live in the namespace where Grafana runs so the chart's
  # dashboardsConfigMaps reference can mount it.
  namespace: monitoring
  labels:
    # Marker label used by Grafana dashboard discovery.
    grafana_dashboard: "1"
data:
  # Dashboard JSON embedded via a literal block scalar; panels use the
  # default datasource. Queries assume the backend's /metrics endpoint is
  # scraped into Prometheus (see the ServiceMonitor) — confirm metric names
  # against the instrumentator's exported metrics.
  cryptoscope.json: |
    {
      "title": "CryptoScope",
      "uid": "cryptoscope-main",
      "timezone": "browser",
      "schemaVersion": 38,
      "refresh": "30s",
      "panels": [
        {
          "id": 1,
          "title": "Request rate (req/s)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "targets": [{
            "expr": "sum(rate(http_requests_total{namespace=\"cryptoscope\"}[1m]))",
            "legendFormat": "requests/sec"
          }]
        },
        {
          "id": 2,
          "title": "Request latency p95 (ms)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "targets": [{
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{namespace=\"cryptoscope\"}[5m])) by (le)) * 1000",
            "legendFormat": "p95 latency"
          }]
        },
        {
          "id": 3,
          "title": "Error rate (%)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
          "targets": [{
            "expr": "sum(rate(http_requests_total{namespace=\"cryptoscope\",status=~\"5..\"}[1m])) / sum(rate(http_requests_total{namespace=\"cryptoscope\"}[1m])) * 100",
            "legendFormat": "error %"
          }]
        },
        {
          "id": 4,
          "title": "Pod count",
          "type": "stat",
          "gridPos": {"h": 8, "w": 6, "x": 12, "y": 8},
          "targets": [{
            "expr": "count(kube_pod_status_ready{namespace=\"cryptoscope\",condition=\"true\"})",
            "legendFormat": "ready pods"
          }]
        },
        {
          "id": 5,
          "title": "Memory usage (MB)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 6, "x": 18, "y": 8},
          "targets": [{
            "expr": "sum(container_memory_working_set_bytes{namespace=\"cryptoscope\",container!=\"\"}) by (pod) / 1024 / 1024",
            "legendFormat": "{{pod}}"
          }]
        },
        {
          "id": 6,
          "title": "CPU usage (cores)",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
          "targets": [{
            "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"cryptoscope\",container!=\"\"}[1m])) by (pod)",
            "legendFormat": "{{pod}}"
          }]
        },
        {
          "id": 7,
          "title": "HPA replica count",
          "type": "timeseries",
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
          "targets": [
            {
              "expr": "kube_horizontalpodautoscaler_status_current_replicas{namespace=\"cryptoscope\"}",
              "legendFormat": "{{horizontalpodautoscaler}} current"
            },
            {
              "expr": "kube_horizontalpodautoscaler_spec_max_replicas{namespace=\"cryptoscope\"}",
              "legendFormat": "{{horizontalpodautoscaler}} max"
            }
          ]
        }
      ]
    }
96 Bytes
Binary file not shown.

monitoring/prometheus-values.yaml

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
---
# monitoring/prometheus-values.yaml
# Helm values for kube-prometheus-stack
# Tuned for t3.small nodes — minimal resource usage

# NOTE: 'prometheusOperator' must appear exactly once at the top level.
# The original file declared it twice (webhooks here, resources near the
# bottom); duplicate keys are invalid YAML and last-wins parsers would
# silently drop the webhook settings. Both halves are merged here.
prometheusOperator:
  admissionWebhooks:
    enabled: false
  tls:
    enabled: false
  # Reduce resource usage on small nodes
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

grafana:
  adminPassword: "cryptoscope-admin" # change this in production
  persistence:
    enabled: false # no PVC needed for learning
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 300m
      memory: 256Mi

  # Auto-load our custom dashboard
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: cryptoscope
          orgId: 1
          folder: CryptoScope
          type: file
          disableDeletion: false
          options:
            path: /var/lib/grafana/dashboards/cryptoscope

  dashboardsConfigMaps:
    cryptoscope: "cryptoscope-dashboard"

  grafana.ini:
    analytics:
      check_for_updates: false

prometheus:
  prometheusSpec:
    retention: 7d
    resources:
      requests:
        cpu: 200m
        memory: 512Mi
      limits:
        cpu: 500m
        memory: 1Gi

    # Scrape ALL namespaces — picks up our FastAPI /metrics endpoint.
    # The *NilUsesHelmValues flags are required: the chart treats an empty
    # ({}) selector as unset and would otherwise substitute a
    # release-label selector, so "{}" alone does not mean "match all".
    podMonitorSelectorNilUsesHelmValues: false
    podMonitorNamespaceSelector: {}
    podMonitorSelector: {}
    serviceMonitorSelectorNilUsesHelmValues: false
    serviceMonitorNamespaceSelector: {}
    serviceMonitorSelector: {}
    ruleSelectorNilUsesHelmValues: false
    ruleNamespaceSelector: {}

alertmanager:
  alertmanagerSpec:
    resources:
      requests:
        cpu: 50m
        memory: 64Mi
      limits:
        cpu: 100m
        memory: 128Mi

kube-state-metrics:
  resources:
    requests:
      cpu: 50m
      memory: 64Mi

nodeExporter:
  resources:
    requests:
      cpu: 50m
      memory: 32Mi
96 Bytes
Binary file not shown.

monitoring/service-monitor.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: backend-monitor
  # Same namespace as the backend Service it selects.
  namespace: cryptoscope
  labels:
    release: kube-prometheus-stack # must match the Helm release name
spec:
  # Selects the backend Service by its labels (not the pods directly).
  selector:
    matchLabels:
      app: backend
  endpoints:
    # NOTE(review): "port" refers to a *named* port on the Service —
    # the backend Service must declare a port named "http"; confirm
    # against the Service manifest.
    - port: http
      path: /metrics
      interval: 15s
96 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)