diff --git a/README.md b/README.md index 3c90f83..5f929a6 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,13 @@ Explore the documentation to understand how **BrainBytes** is architected, deplo - **[Monitoring System Documentation](docs/monitoring-documentation.md)** Understand how BrainBytes ensures reliability and visibility through robust monitoring and alerting. -- **[Simulation Documentation](docs/simulation-documentation.md)** + - **[Alerts Documentation](docs/monitoring-documentation.md#4-alert-rules-documentation)** + Clear procedures for each Prometheus alert, including what it means, common causes, how to troubleshoot, and how to resolve it. + + - **[Dashboard Catalog](docs/monitoring-documentation.md#5-dashboard-catalog)** + Overview of key Grafana dashboards, including their purpose, key metrics tracked, and operational use cases. + +- **[Simulation/Data Generator Documentation](docs/simulation-documentation.md)** A detailed guide for running the API simulation script used for testing and traffic generation. - **[Docker Development Setup](docs/docker-dev-setup.md)** @@ -64,6 +70,14 @@ These files define and support the core automation and infrastructure setup of t - **[Screenshot of Cloud Dashboard and Testing Results (Validation Report)](https://docs.google.com/document/d/1gfU2dtmo8PnKXEZZlr5iMl9UzHSvCOctWRax_l4ybCU/edit?usp=sharing)** Contains visual evidence of successful deployment and testing. +- **[Monitoring System Demonstration Script](./docs/monitoring-demo-script.md)** + A step-by-step guide for delivering a 10–15 minute live demo of BrainBytes’ monitoring capabilities. 
+ +- **Dashboard JSON Exports** + - [main-dashboard.json](./docker/dashboards/main-dashboard.json) + - [resource-dashboard.json](./docker/dashboards/resource-optimization.json) + - [error-dashboard.json](./docker/dashboards/error-analysis.json) + ## Team Members - Kristopher Santos - Team Lead - lr.ksantos@mmdc.mcl.edu.ph diff --git a/docker/dashboards/error-analysis.json b/docker/dashboards/error-analysis.json new file mode 100644 index 0000000..25b0b3b --- /dev/null +++ b/docker/dashboards/error-analysis.json @@ -0,0 +1,1527 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Official dashboard for Standalone Traefik", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 9, + "panels": [], + "title": "General", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 0, + "y": 1 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "count(traefik_config_reloads_total)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Traefik Instances", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 5, + "y": 1 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\"}[$interval])) by (entrypoint)", + "legendFormat": "{{entrypoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests per Entrypoint", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "https://medium.com/@tristan_96324/prometheus-apdex-alerting-d17a065e39d0", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 6, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "(sum(rate(traefik_entrypoint_request_duration_seconds_bucket{le=\"0.3\",code=\"200\",entrypoint=~\"$entrypoint\"}[$interval])) by (method) + \n sum(rate(traefik_entrypoint_request_duration_seconds_bucket{le=\"1.2\",code=\"200\",entrypoint=~\"$entrypoint\"}[$interval])) by (method)) / 2 / \n sum(rate(traefik_entrypoint_request_duration_seconds_count{code=\"200\",entrypoint=~\"$entrypoint\"}[$interval])) by (method)\n", + "legendFormat": 
"{{method}}", + "range": true, + "refId": "A" + } + ], + "title": "Apdex score", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Mean Distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": ["percent"] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(rate(traefik_service_requests_total{service=~\"$service.*\",protocol=\"http\"}[$interval])) by (method, code)", + "legendFormat": "{{method}}[{{code}}]", + "range": true, + "refId": "A" + } + ], + "title": "Http Code ", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" 
+ }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 23, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n traefik_service_request_duration_seconds_sum{service=~\"$service.*\",protocol=\"http\"} / \n traefik_service_request_duration_seconds_count{service=~\"$service.*\",protocol=\"http\"},\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)\n\n", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Top slow services", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { 
+ "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 5, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Most requested services", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 11, + "panels": [], + "title": "SLO", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { 
+ "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "label_replace(\n 1 - (sum by (service)\n (rate(traefik_service_request_duration_seconds_bucket{le=\"1.2\",service=~\"$service.*\"}[$interval])) / sum by (service) \n (rate(traefik_service_request_duration_seconds_count{service=~\"$service.*\"}[$interval]))\n ) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\"\n)", + "legendFormat": "{{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Services failing SLO of 1200ms", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": 
"percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "label_replace(\n 1 - (sum by (service)\n (rate(traefik_service_request_duration_seconds_bucket{le=\"0.3\",service=~\"$service.*\"}[$interval])) / sum by (service) \n (rate(traefik_service_request_duration_seconds_count{service=~\"$service.*\"}[$interval]))\n ) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\"\n)", + "legendFormat": "{{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Services failing SLO of 300ms", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 16, + "panels": [], + "title": "HTTP Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { 
+ "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 0, + "y": 27 + }, + "id": 17, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code=~\"2..\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "2xx over $interval", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + 
} + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 8, + "y": 27 + }, + "id": 18, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code=~\"5..\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "5xx over $interval", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 
16, + "y": 27 + }, + "id": 19, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code!~\"2..|5..\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Other codes over $interval", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 20, + "options": { + "legend": { + "calcs": ["mean", "max"], 
+ "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method) \n (rate(traefik_service_requests_bytes_total{service=~\"$service.*\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}} on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 24, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": 
{ + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method) \n (rate(traefik_service_responses_bytes_total{service=~\"$service.*\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}} on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Responses Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 21, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(traefik_open_connections{entrypoint=~\"$entrypoint\"}) by (entrypoint)\n", + "legendFormat": "{{entrypoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Connections per Entrypoint", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "5s", + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "aeor9nztfrzlsa" + }, + "includeAll": false, + "label": "datasource", + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 30, + "auto_min": "1m", + "current": { + "text": "$__auto", + "value": "$__auto" + }, + "name": "interval", + "options": [ + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "2h", + "value": "2h" + }, + { + "selected": false, + "text": "4h", + "value": "4h" + }, + { + "selected": false, + "text": "8h", + "value": "8h" + } + ], + "query": "1m,5m,10m,30m,1h,2h,4h,8h", + "refresh": 2, + "type": "interval" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "label_values(traefik_open_connections, entrypoint)", + "includeAll": true, + "name": "entrypoint", + "options": [], + "query": { + "query": "label_values(traefik_open_connections, entrypoint)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + 
"definition": "label_values(traefik_service_requests_total, service)", + "includeAll": true, + "name": "service", + "options": [], + "query": { + "query": "label_values(traefik_service_requests_total, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Traefik Dashboard", + "uid": "n5bu_kv45", + "version": 2 +} diff --git a/docker/dashboards/main-dashboard.json b/docker/dashboards/main-dashboard.json new file mode 100644 index 0000000..30cb920 --- /dev/null +++ b/docker/dashboards/main-dashboard.json @@ -0,0 +1,2374 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 4, + "links": [ + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "GitHub", + "type": "link", + "url": "https://github.com/Morfusee/MO-IT122-DevOps" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 261, + "panels": [], + "title": "Quick CPU / Mem / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Resource pressure via PSI", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "dark-yellow", + "value": 70 + }, + { + "color": "dark-red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 323, 
+ "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "CPU", + "range": false, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "Mem", + "range": false, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "I/O", + "range": false, + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_irq_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + 
"legendFormat": "Irq", + "range": false, + "refId": "D", + "step": 240 + } + ], + "title": "Pressure", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Overall CPU busy percentage (averaged across all cores)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 20, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[$__rate_interval])))", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "CPU Busy", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "System load over all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 155, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "scalar(node_load1{instance=\"$node\",job=\"$job\"}) * 100 / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Sys Load", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Real RAM usage excluding cache and reclaimable memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + 
"pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(1 - (node_memory_MemAvailable_bytes{instance=\"$node\", job=\"$job\"} / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"})) * 100", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "B", + "step": 240 + } + ], + "title": "RAM Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Used Root FS", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 154, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(\n (node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n - node_filesystem_avail_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"})\n / node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n) * 100\n", + "format": "time_series", + 
"instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Root FS Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 14, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 
2, + "w": 4, + "x": 18, + "y": 1 + }, + "id": 23, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RootFS Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 22, + "y": 1 + }, + "id": 328, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_reboot_required{instance=\"$node\",job=\"$job\"}", + "instant": true, + 
"intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Reboot Required", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 15, + "y": 3 + }, + "id": 75, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RAM Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 18, + "y": 3 + }, + "id": 15, + "maxDataPoints": 
100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 331, + "panels": [], + "title": "BrainBytes Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Total HTTP Requests from the beginning until now", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 6 + }, + "id": 332, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "vertical", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "editorMode": "code", + "expr": "sum(increase(_http_requests_total[$__range]))", + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Total HTTP Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"aeor9nztfrzlsa" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 6 + }, + "id": 333, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "editorMode": "code", + "expr": "nodejs_eventloop_lag_seconds", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Max Event Loop Lag", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 335, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": 
"table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(15,\n sum by (route, method, status) (rate(_http_requests_total[$__rate_interval])) > 0\n)", + "legendFormat": "{{method}}[{{status}}] on {{route}}", + "range": true, + "refId": "A" + } + ], + "title": "Most requested endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 11 + }, + "id": 339, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(_http_requests_total{ok=\"true\"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Overall Success Rate of Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red" + }, + { + "color": "dark-red", + "value": 20 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, 
+ "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 11 + }, + "id": 338, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(_http_requests_total{ok=\"false\"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Overall Error Rate of Requests", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 330, + "panels": [], + "title": "Traefik", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": ["[200] on 
adonisjs-docker@docker"], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 329, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Most requested services", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + 
"value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 340, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(traefik_open_connections{entrypoint=~\"$entrypoint\"}) by (entrypoint)\n", + "legendFormat": "{{entrypoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Connections per Entrypoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 336, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": 
"Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code=~\"2..\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "2xx over $interval", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 337, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": 
"desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code!~\"2..|5..\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Other codes over $interval", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 341, + "panels": [], + "title": "Prometheus", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Whether Prometheus startup was fully completed and the server is ready for normal operation.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 0, + "text": "FAILED" + }, + "1": { + "color": "green", + "index": 1, + "text": "OK" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 35 + }, + "id": 342, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "prometheus_ready{instance=\"$instance\"}", + "legendFormat": "{{ instance }} {{job}}", + 
"range": true, + "refId": "A" + } + ], + "title": "Prometheus Ready", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Number of scrape pool targets.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red" + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Maximum" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 35 + }, + "id": 343, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(prometheus_target_scrape_pool_targets{instance=\"$instance\"})", + "legendFormat": "Total Targets", + "range": true, + "refId": "A" + } + ], + "title": "Targets", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Total user and system CPU time of Prometheus process.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 16, + "x": 8, + "y": 35 + }, + "id": 346, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(process_cpu_seconds_total{instance=\"$instance\"}[$__rate_interval])", + "legendFormat": "{{ instance }}", + "range": true, + "refId": "A" + } + ], + "title": "Prometheus CPU Usage %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "The number of alertmanagers discovered and active.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red" + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "green", + "value": 2 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 40 + }, + "id": 344, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + 
"fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "prometheus_notifications_alertmanagers_discovered{instance=\"$instance\"}", + "refId": "A" + } + ], + "title": "Active Alertmanagers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Total number of alerts sent.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 40 + }, + "id": 345, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(increase(prometheus_notifications_sent_total{instance=\"$instance\"}[10m]))", + "legendFormat": "{{alertmanager}}", + "range": true, + "refId": "A" + } + ], + "title": "Sent Alerts [10m]", + "type": "stat" + } + ], + "preload": false, + "refresh": "auto", + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "aeor9nztfrzlsa" + }, + "includeAll": false, + "label": "Datasource", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "node_exporter", + "value": "node_exporter" + }, + "datasource": { + "type": 
"prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "", + "includeAll": false, + "label": "Job", + "name": "job", + "options": [], + "query": { + "query": "label_values(node_uname_info, job)", + "refId": "Prometheus-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "7686b6dfc965", + "value": "7686b6dfc965" + }, + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "includeAll": false, + "label": "Nodename", + "name": "nodename", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "refId": "Prometheus-nodename-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "node_exporter:9100", + "value": "node_exporter:9100" + }, + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "includeAll": false, + "label": "Instance", + "name": "node", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "refId": "Prometheus-node-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + }, + "hide": 2, + "includeAll": false, + "name": "diskdevices", + "options": [ + { + "selected": true, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + } + ], + "query": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "type": "custom" + }, + { + "auto": true, + "auto_count": 30, + "auto_min": "1m", + "current": { + "text": "$__auto", + "value": "$__auto" + }, + "name": "interval", + "options": [ + { + "selected": false, + "text": "1m", + 
"value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "2h", + "value": "2h" + }, + { + "selected": false, + "text": "4h", + "value": "4h" + }, + { + "selected": false, + "text": "8h", + "value": "8h" + } + ], + "query": "1m,5m,10m,30m,1h,2h,4h,8h", + "refresh": 2, + "type": "interval" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "label_values(traefik_open_connections, entrypoint)", + "includeAll": true, + "name": "entrypoint", + "options": [], + "query": { + "query": "label_values(traefik_open_connections, entrypoint)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "label_values(traefik_service_requests_total, service)", + "includeAll": true, + "name": "service", + "options": [], + "query": { + "query": "label_values(traefik_service_requests_total, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "localhost:9090", + "value": "localhost:9090" + }, + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "label_values(prometheus_build_info, instance)", + "includeAll": false, + "name": "instance", + "options": [], + "query": { + "query": "label_values(prometheus_build_info, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": 
"browser", + "title": "DevOps Dashboard", + "uid": "9ebb8eed-53cf-48bb-b439-83445685a612", + "version": 50 +} diff --git a/docker/dashboards/resource-optimization.json b/docker/dashboards/resource-optimization.json new file mode 100644 index 0000000..68e98ca --- /dev/null +++ b/docker/dashboards/resource-optimization.json @@ -0,0 +1,1801 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 2, + "links": [ + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "GitHub", + "type": "link", + "url": "https://github.com/rfmoz/grafana-dashboards" + }, + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "Grafana", + "type": "link", + "url": "https://grafana.com/grafana/dashboards/1860" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 261, + "panels": [], + "title": "Quick CPU / Mem / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Resource pressure via PSI", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "dark-yellow", + "value": 70 + }, + { + "color": "dark-red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 323, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + 
"placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "CPU", + "range": false, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "Mem", + "range": false, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "I/O", + "range": false, + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_irq_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "Irq", + "range": false, + "refId": "D", + "step": 240 + } + ], + "title": "Pressure", + 
"type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Overall CPU busy percentage (averaged across all cores)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 20, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[$__rate_interval])))", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "CPU Busy", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "System load over all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 
129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 155, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "scalar(node_load1{instance=\"$node\",job=\"$job\"}) * 100 / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Sys Load", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Real RAM usage excluding cache and reclaimable memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(1 - (node_memory_MemAvailable_bytes{instance=\"$node\", job=\"$job\"} / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"})) * 100", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "B", + "step": 240 + } + ], + "title": "RAM Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Used Root FS", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 154, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(\n (node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n - node_filesystem_avail_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"})\n / node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n) * 100\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } 
+ ], + "title": "Root FS Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 14, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 18, + "y": 1 + }, + "id": 15, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": 
"horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 22, + "y": 1 + }, + "id": 328, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_reboot_required{instance=\"$node\",job=\"$job\"}", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Reboot Required", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + 
"description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 15, + "y": 3 + }, + "id": 75, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RAM Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 18, + "y": 3 + }, + "id": 23, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + 
"orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RootFS Total", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 263, + "panels": [], + "title": "Basic CPU / Mem / Net / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "CPU time spent busy vs idle, split by activity type", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Busy Iowait" + }, + 
"properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy System" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy User" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy Other" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 77, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "width": 250 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Busy System", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", 
+ "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Busy User", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy Iowait", + "range": true, + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=~\".*irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy IRQs", + "range": true, + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq'}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy Other", + "range": true, + "refId": "E", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Idle", + "range": true, + "refId": "F", + "step": 240 + } + ], + "title": "CPU Basic", + 
"type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "RAM and swap usage overview, including caches", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Swap used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache + Buffer" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 78, + 
"options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Total", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Used", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cache + Buffer", + "range": true, + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Free", + "range": true, + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": 
"(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Swap used", + "range": true, + "refId": "E", + "step": 240 + } + ], + "title": "Memory Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Per-interface network traffic (receive and transmit) in bits per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Tx.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 74, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": 
"rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Rx {{device}}", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Tx {{device}} ", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "description": "Percentage of filesystem space used for each mounted device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 152, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "editorMode": "code", + "expr": "((node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"} - node_filesystem_avail_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) / node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Disk Space Used Basic", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "5s", + "schemaVersion": 41, + "tags": [ + "linux" + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "aeor9nztfrzlsa" + }, + "includeAll": false, + "label": "Datasource", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "node_exporter", + "value": "node_exporter" + }, + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "", + "includeAll": false, + "label": "Job", + "name": "job", + "options": [], + "query": { + "query": "label_values(node_uname_info, job)", + "refId": "Prometheus-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "7686b6dfc965", + "value": "7686b6dfc965" + }, + "datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "includeAll": false, + "label": "Nodename", + "name": "nodename", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "refId": "Prometheus-nodename-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "node_exporter:9100", + "value": "node_exporter:9100" + }, + 
"datasource": { + "type": "prometheus", + "uid": "aeor9nztfrzlsa" + }, + "definition": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "includeAll": false, + "label": "Instance", + "name": "node", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "refId": "Prometheus-node-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + }, + "hide": 2, + "includeAll": false, + "name": "diskdevices", + "options": [ + { + "selected": true, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + } + ], + "query": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "type": "custom" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "System Stats", + "uid": "rYdddlPWk", + "version": 4 +} \ No newline at end of file diff --git a/docs/alerts-documentation.md b/docs/alerts-documentation.md new file mode 100644 index 0000000..d828a5c --- /dev/null +++ b/docs/alerts-documentation.md @@ -0,0 +1,287 @@ +# Alerts Dcumentation + +This guide provides clear and actionable procedures for each alert defined in the Prometheus `alerting-rules.yml`. Each section includes what the alert means, possible causes, troubleshooting steps, and resolution procedures. + +--- + +## Alert: `NodeDown` + +**Severity:** Critical +**Trigger:** No metrics received from `node_exporter` for 1 minute +**Expression:** `up{job="node_exporter"} == 0` + +### Meaning + +This alert indicates that a monitored node is unreachable. It could be offline, experiencing network issues, or the `node_exporter` service is down. 
+ +### Possible Causes + +- Node is powered off or crashed +- Network issues between Prometheus and the node +- `node_exporter` is stopped or crashed +- Firewall is blocking port 9100 +- Incorrect `scrape_config` + +### Troubleshooting + +1. Ping the instance: `ping {{ $labels.instance }}` +2. SSH into the node and check service: + `sudo systemctl status node_exporter` +3. Test metrics endpoint: + `curl http://{{ $labels.instance }}:9100/metrics` +4. Check in Prometheus UI: `Status -> Targets` +5. Inspect firewall and port accessibility + +### Resolution + +- Restart `node_exporter` +- Reboot or power on the node +- Fix network or firewall issues +- Correct Prometheus `scrape_config` + +--- + +## Alert: `HighMemoryUsage` + +**Severity:** Warning +**Trigger:** Memory usage > 80% for 10s +**Expression:** +`(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80` + +### Meaning + +Indicates high memory consumption on a node, which may lead to swapping or crashes. + +### Possible Causes + +- Memory leaks in applications +- Unexpected workload spikes +- Poorly configured applications +- Insufficient RAM +- Inefficient garbage collection + +### Troubleshooting + +1. Check memory usage: `htop`, `top`, or `ps aux --sort=-%mem` +2. Review app logs for errors +3. Examine recent deployments/config changes +4. Analyze memory trends on dashboards + +### Resolution + +- Restart high-memory applications +- Tune app memory settings +- Add more RAM or scale services +- Fix memory leaks +- Clean temp files and caches + +--- + +## Alert: `HighCPUUsage` + +**Severity:** Warning +**Trigger:** CPU usage > 85% for 2m +**Expression:** +`100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 85` + +### Meaning + +High CPU utilization is affecting system performance. 
+ +### Possible Causes + +- Heavy computational loads +- Inefficient or stuck applications +- DoS attacks +- Background jobs misbehaving +- Insufficient CPU cores + +### Troubleshooting + +1. Inspect processes: `top`, `htop`, or `ps aux --sort=-%cpu` +2. Check logs of high-CPU apps +3. Review recent changes or cron jobs +4. Analyze CPU graphs on dashboard + +### Resolution + +- Optimize apps using excessive CPU +- Reschedule or throttle background tasks +- Scale out or upgrade CPU +- Address potential DoS issues +- Restart problematic services (temp fix) + +--- + +## Alert: `LowDiskSpace` + +**Severity:** Warning +**Trigger:** Disk usage > 90% for 3m +**Expression:** +`(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90` + +### Meaning + +Disk usage is critically high and may affect system stability. + +### Possible Causes + +- Log bloat without rotation +- Orphaned temporary files +- Large data uploads or backups +- Snapshots consuming space +- Users storing large files + +### Troubleshooting + +1. Check usage: `df -h` +2. Locate large files: `du -sh /*`, `/var/*`, `/opt/*` +3. Inspect log rotation (`logrotate`) +4. Check cron cleanup jobs +5. Analyze disk trend graphs + +### Resolution + +- Remove temp or unused files +- Rotate/compress/delete logs +- Delete outdated backups +- Expand disk or add volumes + +--- + +## Alert: `NodeExporterMissing` + +**Severity:** Critical +**Trigger:** No `node_exporter` targets found for 1m +**Expression:** `absent(up{job="node_exporter"})` + +### Meaning + +Prometheus cannot find or scrape any `node_exporter` targets. + +### Possible Causes + +- `node_exporter` not installed +- `scrape_config` missing or incorrect +- Global firewall issue +- Broken service discovery + +### Troubleshooting + +1. Confirm `node_exporter` is installed and running +2. Validate `prometheus.yml` config +3. Check Prometheus UI: `Status -> Targets` +4. 
Test metrics endpoint from Prometheus host +5. Review Prometheus logs + +### Resolution + +- Install and start `node_exporter` +- Fix or define `scrape_config` +- Resolve network/firewall issues +- Repair service discovery setup + +--- + +## Alert: `HighTraefikErrorRate` + +**Severity:** Warning +**Trigger:** 5xx error rate > 5% over 2m +**Expression:** +`(sum(rate(traefik_service_requests_total{code=~"5..",protocol="http"}[2m])) / sum(rate(traefik_service_requests_total{protocol="http"}[2m]))) > 0.05` + +### Meaning + +Backend services behind Traefik are failing to serve requests successfully. + +### Possible Causes + +- Unhealthy backend services +- High load or timeouts +- Bad deployments +- Misconfigured routing +- External dependencies failing + +### Troubleshooting + +1. Check Traefik dashboards for erroring services +2. Review logs of affected services +3. Inspect backend host resource usage +4. Bypass Traefik and test services directly +5. Check recent changes + +### Resolution + +- Restart affected services +- Scale backend services +- Roll back recent deployments +- Fix application issues +- Resolve database or dependency issues + +--- + +## Alert: `AppEndpointDown` + +**Severity:** Critical +**Trigger:** AdonisJS app metrics endpoint unreachable for 1m +**Expression:** `up{job="adonisjs-app"} == 0` + +### Meaning + +AdonisJS app is down or its metrics endpoint is inaccessible. + +### Possible Causes + +- App crashed or stopped +- App overloaded +- Misconfigured Prometheus scrape target +- Network/firewall issues + +### Troubleshooting + +1. Check app status: `pm2`, `systemctl`, or container status +2. Review application logs +3. `curl` the `/metrics` endpoint +4. Check Prometheus UI: `Status -> Targets` +5. 
Inspect firewall rules + +### Resolution + +- Restart app +- Fix configuration or dependency issues +- Scale resources +- Correct `scrape_config` + +--- + +## Alert: `PrometheusSelfScrapeFailing` + +**Severity:** Warning +**Trigger:** Prometheus can't scrape itself for 1m +**Expression:** `up{job="prometheus"} == 0` + +### Meaning + +Prometheus is unable to scrape its own metrics, which may indicate a systemic issue. + +### Possible Causes + +- Prometheus is unhealthy or overloaded +- Firewall blocking `localhost:9090` +- Prometheus config is incorrect +- Disk full or out of memory + +### Troubleshooting + +1. Check service: `systemctl status prometheus` or Docker status +2. Test endpoint: `curl http://localhost:9090/metrics` +3. Check Prometheus logs +4. Inspect server resources: `top`, `df -h`, etc. +5. Review `scrape_config` + +### Resolution + +- Restart Prometheus +- Free up system resources +- Fix local firewall config +- Correct the self-scrape configuration diff --git a/docs/images/error-dashboard-2.png b/docs/images/error-dashboard-2.png new file mode 100644 index 0000000..bd35a0b Binary files /dev/null and b/docs/images/error-dashboard-2.png differ diff --git a/docs/images/error-dashboard.png b/docs/images/error-dashboard.png new file mode 100644 index 0000000..50007df Binary files /dev/null and b/docs/images/error-dashboard.png differ diff --git a/docs/images/main-dashboard-2.png b/docs/images/main-dashboard-2.png new file mode 100644 index 0000000..5b827ea Binary files /dev/null and b/docs/images/main-dashboard-2.png differ diff --git a/docs/images/main-dashboard.png b/docs/images/main-dashboard.png new file mode 100644 index 0000000..8a148a6 Binary files /dev/null and b/docs/images/main-dashboard.png differ diff --git a/docs/images/resource-dashboard.png b/docs/images/resource-dashboard.png new file mode 100644 index 0000000..5f2df17 Binary files /dev/null and b/docs/images/resource-dashboard.png differ diff --git a/docs/monitoring-demo-script.md 
b/docs/monitoring-demo-script.md new file mode 100644 index 0000000..c9fa37f --- /dev/null +++ b/docs/monitoring-demo-script.md @@ -0,0 +1,89 @@ +# **Demonstration Script: Monitoring System Overview** + +**Objective:** Showcase the capabilities of our monitoring system in providing real-time insights into system health, application performance, and efficient problem resolution. + +**Target Audience:** Stakeholders, Developers + +**Time Allotment:** 10-15 minutes + +--- + +**(Start - 0:00)** + +**Presenter:** "Good morning/afternoon everyone, and welcome to this demonstration of our monitoring system. Today, we'll walk through how our system provides comprehensive visibility into our infrastructure and applications, enabling us to proactively identify issues and ensure smooth operations." + +**1. Introduction & Overview (2 minutes)** + +- **Presenter:** "Our monitoring system is built around Grafana dashboards, pulling data from Prometheus, which collects metrics from various exporters like Node Exporter for system resources and Traefik for our API gateway and service performance. This integrated approach gives us a holistic view of our environment." +- **Presenter:** "We'll be focusing on two main areas today: **Resource Utilization** and **Application Performance**." + +**(Transition to Resource Dashboard - `resource-dashboard.png`)** + +**2. Resource Utilization (4 minutes)** + +- **Presenter:** "Let's start with our **System Stats Dashboard** (referencing `resource-dashboard.png`). This dashboard provides a high-level overview of our server's health." +- **Presenter:** "On the top left, we have a quick summary of **CPU, Memory, and I/O pressure**. As you can see, our CPU Busy is currently at 2.7%, Sys Load at 0.0%, and RAM Used at 55.4%. These are all within healthy limits." +- **Presenter:** "Below that, we have more detailed graphs for **CPU Basic**, **Memory Basic**, **Network Traffic Basic**, and **Disk Space Used Basic**. 
Notice the consistent CPU usage and memory consumption over time. The disk space usage is also stable at 18.1% for the root filesystem." +- **Presenter:** "This dashboard allows us to quickly identify any resource bottlenecks or anomalies that might impact our applications. For example, a sudden spike in CPU or RAM usage would immediately alert us to potential issues." + +**(Transition to Main Dashboard - `main-dashboard-2.png`)** + +**3. Application Performance - Traefik Overview (4 minutes)** + +- **Presenter:** "Now, let's switch gears to our **DevOps Dashboard** (referencing `main-dashboard-2.png`). This dashboard focuses on the performance of our applications, specifically through Traefik, our API gateway." +- **Presenter:** "Here, we can see **Most Requested Services**. Currently, `webscorecom` and `web` are our most active services, showing their request rates." +- **Presenter:** "The **Connections per Entrypoint** graph is crucial for understanding user traffic. We can see a steady number of connections to `webscorecom`, `userdb`, and `web`. Notice the slight increase in connections around 17:42:00 for `webscorecom`, indicating increased activity." +- **Presenter:** "Below that, we have **2xx over 1m** and **Other codes over 1m**. 2xx responses indicate successful requests, and as you can see, we have a healthy rate of successful interactions. The 'Other codes' panel tracks non-2xx responses. We want to keep an eye on this to ensure minimal errors." + +**(Transition to Error Dashboard - `error-dashboard-2.png`)** + +**4. Deep Dive into Errors & Troubleshooting (3 minutes)** + +- **Presenter:** "To delve deeper into potential issues, we use our **Traefik Dashboard** (referencing `error-dashboard-2.png`). This dashboard is designed to pinpoint and troubleshoot errors." +- **Presenter:** "Let's focus on the **HTTP Details** section. We have a clear separation of **2xx over 1m** and **5xx over 1m**. 
Ideally, we want to see very low or no 5xx errors, which represent server-side issues." +- **Presenter:** "Currently, we have 'No data' for 5xx errors, which is excellent! This indicates no major server-side failures at this moment. If there were errors, this panel would show trends, allowing us to quickly identify when the errors started and their frequency." +- **Presenter:** "The **Requests Size** and **Responses Size** graphs provide insights into the data transfer for our services. This can be helpful in identifying large requests or responses that might be impacting performance." + +**5. Alerting Demonstration (1-2 minutes)** + +- **Presenter:** "Now, let's briefly discuss how our system helps us react to issues. While we don't have a live alert to trigger right now, I can show you our current alert status." + +**(Transition to the Alerts dashboard)** + +- **Presenter:** "This is our Alerts Dashboard, specifically showing our `node-health-alerts`. As you can see, all 8 of our defined alerts — such as `NodeDown`, `HighMemoryUsage`, `HighCPUUsage`, and `HighTraefikErrorRate` — are currently in a 'Normal' state and their 'Health' is 'ok'. This indicates that all our systems are running smoothly and within expected parameters right now." + +- **Presenter:** "However, if any of these conditions were to be met — for example, if a node went down, or if CPU usage consistently exceeded a threshold — the 'State' here would change from 'Normal' to 'Firing' (or 'Pending', depending on the alert configuration). This would trigger an alert." + +- **Presenter:** "These triggered alerts are then sent to our Alertmanager, which handles routing these notifications to the appropriate on-call teams via various channels like Slack, email, or PagerDuty. This ensures that our team is immediately aware of any critical issues and can respond swiftly to minimize any impact on our users." 
+ +- **Presenter:** "Once the underlying issue is resolved, the alert would automatically resolve itself, and we would see its state revert to 'Normal' here on this dashboard." + +- **Presenter:** "This proactive alerting mechanism ensures that we are immediately aware of critical issues and can respond swiftly to minimize any impact on our users." + +**(Back to Main Dashboard - `main-dashboard-2.png` or `resource-dashboard.png` for a concluding shot)** + +**6. Conclusion & Q&A (1 minute)** + +- **Presenter:** "In summary, our monitoring system provides us with real-time visibility into both our infrastructure and application performance. From high-level resource utilization to granular HTTP error details, we have the tools to ensure the stability and reliability of our services." +- **Presenter:** "This comprehensive approach allows us to: + - **Proactively identify and address potential issues.** + - **Minimize downtime and service disruptions.** + - **Optimize resource allocation.** + - **Gain deeper insights into application behavior.**" +- **Presenter:** "Thank you for your time. Are there any questions?" + +--- + +**Key Dashboards to Highlight:** + +- **`resource-dashboard.png` (System Stats):** For overall system health, CPU, RAM, Disk, Network. +- **`main-dashboard-2.png` (DevOps Dashboard):** For Traefik overview, most requested services, connections, and overall request success/error rates. +- **`error-dashboard-2.png` (Traefik Dashboard - Error Focus):** For detailed HTTP response codes (especially 5xx), request/response sizes. + +**Explanation for Important Visualizations:** + +- **Gauge Charts (CPU Busy, RAM Used, etc.):** Show current state and quick health checks. +- **Time-Series Graphs (2xx over 1m, Connections per Entrypoint, CPU Basic):** Show trends over time, crucial for identifying patterns, spikes, and drops. Emphasize the X-axis (time) and Y-axis (metric value). 
+- **"No data" panels:** Explain that this is a _good_ thing when it comes to errors (like 5xx over 1m), as it means no matching samples were reported for that metric in the selected time range, indicating no errors (assuming the Traefik scrape target itself is up). +- **Mean/Max values in tables:** Explain how these provide quick numerical summaries of the data presented in the graphs. +- **Prometheus Ready/Targets/Alertmanagers:** Briefly explain their role in data collection and alert routing. \ No newline at end of file diff --git a/docs/monitoring-documentation.md b/docs/monitoring-documentation.md index d8eef6b..7abcb63 100644 --- a/docs/monitoring-documentation.md +++ b/docs/monitoring-documentation.md @@ -10,114 +10,116 @@ This documentation outlines the setup, configuration, and usage of the Prometheu ### Components Overview -| Component | Description | -| ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| **Prometheus** | Time-series database and metrics scraper. Configured with rules and alerting logic. | -| **Grafana** | Visualization layer that reads from Prometheus to display metrics dashboards. | -| **Node Exporter** | Gathers system-level metrics from host OS (CPU, memory, disk, etc.). | -| **AdonisJS App (Custom Exporter)** | Exposes application metrics via `/metrics` endpoint, including request counters, durations, and internal Node.js stats. | -| **Traefik (Reverse Proxy)** | Exposes its own HTTP metrics at `/metrics` endpoint for monitoring HTTP traffic. | -| **Alertmanager** | Manages alert routing and dispatching (e.g., to Discord via webhook relay). | -| **alertmanager-discord-relay** | Bridges Alertmanager webhook alerts to Discord. | -| | +| Component | Description | +| :------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------- | +| **Prometheus** | Time-series database and metrics scraper. 
Configured with rules and alerting logic. | +| **Grafana** | Visualization layer that reads from Prometheus to display metrics dashboards. | +| **Node Exporter** | Gathers system-level metrics from host OS (CPU, memory, disk, etc.). | +| **AdonisJS App (Custom Exporter)** | Exposes application metrics via `/metrics` endpoint, including request counters, durations, and internal Node.js stats. | +| **Traefik (Reverse Proxy)** | Exposes its own HTTP metrics at `/metrics` endpoint for monitoring HTTP traffic. | +| **Alertmanager** | Manages alert routing and dispatching (e.g., to Discord via webhook relay). | +| **alertmanager-discord-relay** | Bridges Alertmanager webhook alerts to Discord. | ### Data Flow ![Monitoring Architecture Data Flow](./images/monitoring-architecture.png) -1. **Exporters (Node Exporter, AdonisJS, Traefik)** expose metrics via HTTP endpoints. -2. **Prometheus** scrapes each target at defined intervals (e.g., every 15s). -3. Prometheus **stores** raw time-series data in local TSDB. -4. **Recording rules** generate pre-aggregated metrics for fast querying. -5. **Alerting rules** check metrics and trigger alerts based on thresholds. -6. **Alertmanager** groups and routes alerts to the configured Discord webhook. -7. **Grafana** pulls metrics from Prometheus and renders them into dashboards for visualization. +1. **Exporters (Node Exporter, AdonisJS, Traefik)** expose metrics via HTTP endpoints. +2. **Prometheus** scrapes each target at defined intervals (e.g., every 15s). +3. Prometheus **stores** raw time-series data in local TSDB. +4. **Recording rules** generate pre-aggregated metrics for fast querying. +5. **Alerting rules** check metrics and trigger alerts based on thresholds. +6. **Alertmanager** groups and routes alerts to the configured Discord webhook. +7. **Grafana** pulls metrics from Prometheus and renders them into dashboards for visualization. --- ## 2. 
Metrics Catalog +> **Note:** The following metrics are **specific to the DevOps Dashboard (Application Performance Overview)** outlined in [Section 5.2](#52-devops-dashboard-application-performance-overview). These metrics are handpicked to provide visibility into the application and system's performance and availability. + + ### Custom Application Metrics (AdonisJS) -| Metric | Description | Example Query | -| --------------------------------------------------------------------------------------------------------- | ----------------------------------------------- | ----------------------------------- | -| `sum(increase(_http_requests_total[$__range]))` | Total number of HTTP requests over time window. | Total requests since $\_\_range. | -| `sum(rate(_http_requests_total{ok="true"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100` | Percentage of successful HTTP requests. | Success rate %. | -| `sum(rate(_http_requests_total{ok="false"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100` | Percentage of failed HTTP requests. | Error rate %. | -| `topk(15, sum by (route, method, status) (rate(_http_requests_total[$__rate_interval])) > 0)` | Top 15 routes by traffic. | Route activity insight. | -| `nodejs_eventloop_lag_seconds` | Current Node.js event loop lag. | Measures responsiveness under load. | +| Metric | Description | Example Query | +| :-------------------------------------------------------------------------- | :---------------------------------------------- | :------------------------------ | +| `sum(increase(_http_requests_total[$__range]))` | Total number of HTTP requests over time window. | Total requests since $\_\_range. | +| `sum(rate(_http_requests_total{ok="true"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100` | Percentage of successful HTTP requests. | Success rate %. 
| +| `sum(rate(_http_requests_total{ok="false"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100` | Percentage of failed HTTP requests. | Error rate %. | +| `topk(15, sum by (route, method, status) (rate(_http_requests_total[$__rate_interval])) > 0)` | Top 15 routes by traffic. | Route activity insight. | +| `nodejs_eventloop_lag_seconds` | Current Node.js event loop lag. | Measures responsiveness under load. | --- ### System Metrics (Node Exporter) -| Metric | Description | Example Query | -| ------------------------------------------------------------------------------ | -------------------------------- | --------------------------- | -| `irate(node_pressure_cpu_waiting_seconds_total{...})` | Instant rate of CPU pressure. | CPU wait pressure trend. | -| `irate(node_pressure_memory_waiting_seconds_total{...})` | Instant rate of memory pressure. | Memory wait conditions. | -| `irate(node_pressure_io_waiting_seconds_total{...})` | Instant rate of I/O pressure. | Disk I/O bottleneck signal. | -| `irate(node_pressure_irq_stalled_seconds_total{...})` | Instant rate of IRQ stalls. | Interrupt handling delay. | -| `100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[$__rate_interval])))` | System-wide CPU usage. | Overall CPU load %. | -| `scalar(node_load1) * 100 / count(count(node_cpu_seconds_total) by (cpu))` | Load average per core. | CPU saturation % per core. | -| `(1 - (MemAvailable / MemTotal)) * 100` | Memory usage %. | Tracks available memory. | -| `(size - avail) / size * 100` | Disk usage % (root FS) | Filesystem saturation. | -| `count(count(node_cpu_seconds_total) by (cpu))` | CPU core count. | Total logical CPUs. | -| `node_memory_MemTotal_bytes` | Total memory in bytes. | Base capacity reference. | -| `node_filesystem_size_bytes{...}` | Root filesystem total size. | Base FS storage value. | -| `node_time_seconds - node_boot_time_seconds` | System uptime in seconds. | Duration since last boot. 
| -| `node_reboot_required` | 1 if reboot required. | Security/patch awareness. | +| Metric | Description | Example Query | +| :------------------------------------------------------------------- | :------------------------- | :------------------------ | +| `irate(node_pressure_cpu_waiting_seconds_total{...})` | Instant rate of CPU pressure. | CPU wait pressure trend. | +| `irate(node_pressure_memory_waiting_seconds_total{...})` | Instant rate of memory pressure. | Memory wait conditions. | +| `irate(node_pressure_io_waiting_seconds_total{...})` | Instant rate of I/O pressure. | Disk I/O bottleneck signal. | +| `irate(node_pressure_irq_stalled_seconds_total{...})` | Instant rate of IRQ stalls. | Interrupt handling delay. | +| `100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[$__rate_interval])))` | System-wide CPU usage. | Overall CPU load %. | +| `scalar(node_load1) * 100 / count(count(node_cpu_seconds_total) by (cpu))` | Load average per core. | CPU saturation % per core. | +| `(1 - (MemAvailable / MemTotal)) * 100` | Memory usage %. | Tracks available memory. | +| `(size - avail) / size * 100` | Disk usage % (root FS) | Filesystem saturation. | +| `count(count(node_cpu_seconds_total) by (cpu))` | CPU core count. | Total logical CPUs. | +| `node_memory_MemTotal_bytes` | Total memory in bytes. | Base capacity reference. | +| `node_filesystem_size_bytes{...}` | Root filesystem total size. | Base FS storage value. | +| `node_time_seconds - node_boot_time_seconds` | System uptime in seconds. | Duration since last boot. | +| `node_reboot_required` | 1 if reboot required. | Security/patch awareness. 
| --- ### Reverse Proxy Metrics (Traefik) -| Metric | Description | Example Query | | -| --------------------------------------------------------------------------------------------------------------------- | --------------------------------------- | ---------------------------------- | ----------------------------------------- | -| `topk(15, label_replace(sum by (service, code)(rate(traefik_service_requests_total{...})) > 0, ...))` | Top HTTP services by status code. | See request volume per service. | | -| `sum(traefik_open_connections{entrypoint=~"$entrypoint"}) by (entrypoint)` | Active open connections per entrypoint. | Tracks concurrent load. | | -| `topk(15, label_replace(sum by (service, method, code)(rate(traefik_service_requests_total{code=~"2.."})) > 0, ...))` | Top successful (2xx) requests. | Most actively responding services. | | -| `topk(15, label\_replace(sum by (service, method, code)(rate(traefik\_service\_requests\_total{code!\~"2..5.."}) > 0, ...))`| Top 3xx/4xx requests. | Indicates possible routing/client issues. | +| Metric | Description | Example Query | +| :--------------------------------------------------------------------------------------------------- | :-------------------------------------- | :--------------------------------- | +| `topk(15, label_replace(sum by (service, code)(rate(traefik_service_requests_total{...})) > 0, ...))` | Top HTTP services by status code. | See request volume per service. | +| `sum(traefik_open_connections{entrypoint=~"$entrypoint"}) by (entrypoint)` | Active open connections per entrypoint. | Tracks concurrent load. | +| `topk(15, label_replace(sum by (service, method, code)(rate(traefik_service_requests_total{code=~"2.."})) > 0, ...))` | Top successful (2xx) requests. | Most actively responding services. | +| `topk(15, label_replace(sum by (service, method, code)(rate(traefik_service_requests_total{code!~"2..\|5.."})) > 0, ...))` | Top 3xx/4xx requests. | Indicates possible routing/client issues. 
| --- ### Prometheus Self-Metrics -| Metric | Description | Example Query | -| --------------------------------------------------------- | --------------------------------------------- | ----------------------------- | -| `prometheus_ready` | Prometheus self-readiness. | Health check metric. | -| `sum(prometheus_target_scrape_pool_targets)` | Total number of scrape targets. | Target discovery validation. | -| `prometheus_notifications_alertmanagers_discovered` | Number of discovered Alertmanager instances. | Alert routing topology check. | +| Metric | Description | Example Query | +| :-------------------------------------------------------- | :-------------------------------------- | :---------------------------- | +| `prometheus_ready` | Prometheus self-readiness. | Health check metric. | +| `sum(prometheus_target_scrape_pool_targets)` | Total number of scrape targets. | Target discovery validation. | +| `prometheus_notifications_alertmanagers_discovered` | Number of discovered Alertmanager instances. | Alert routing topology check. | | `sum(increase(prometheus_notifications_sent_total[10m]))` | Number of alerts sent in the last 10 minutes. | Active alert volume. | -| `rate(process_cpu_seconds_total[$__rate_interval])` | Prometheus process CPU usage. | Resource usage profiling. | +| `rate(process_cpu_seconds_total[$__rate_interval])` | Prometheus process CPU usage. | Resource usage profiling. | --- ## 3. PromQL Query Reference Guide -| Query | Purpose | -| --------------------------------------------------------------------------------------------------------- | ---------------------------------------- | -| `sum(increase(_http_requests_total[$__range]))` | Total requests over time. | -| `sum(rate(_http_requests_total{ok="true"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100` | % of successful HTTP requests. | -| `sum(rate(_http_requests_total{ok="false"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100` | % of failed HTTP requests. 
| -| `topk(15, sum by (route, method, status)(rate(_http_requests_total[$__rate_interval]))) > 0` | Top 15 routes by traffic. | -| `nodejs_eventloop_lag_seconds` | Event loop lag (Node.js responsiveness). | -| `irate(node_pressure_cpu_waiting_seconds_total{...})` | CPU pressure. | -| `irate(node_pressure_memory_waiting_seconds_total{...})` | Memory pressure. | -| `irate(node_pressure_io_waiting_seconds_total{...})` | I/O pressure. | -| `irate(node_pressure_irq_stalled_seconds_total{...})` | IRQ pressure. | -| `100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[$__rate_interval])))` | CPU usage %. | -| `scalar(node_load1) * 100 / count(count(node_cpu_seconds_total) by (cpu))` | Load per CPU. | -| `(1 - (MemAvailable / MemTotal)) * 100` | Memory usage %. | -| `(node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100` | Root filesystem usage %. | -| `count(count(node_cpu_seconds_total) by (cpu))` | Total CPU cores. | -| `node_time_seconds - node_boot_time_seconds` | Uptime. | -| `node_reboot_required` | Check for pending reboots. | -| `sum(traefik_open_connections{entrypoint=~"$entrypoint"}) by (entrypoint)` | Active Traefik connections. | -| `topk(15, label_replace(sum by (service, code)(rate(traefik_service_requests_total{...})), ...))` | Top Traefik services by requests. | -| `prometheus_ready` | Prometheus health. | -| `sum(prometheus_target_scrape_pool_targets)` | Target count. | -| `sum(increase(prometheus_notifications_sent_total[10m]))` | Alerts sent. | -| `rate(process_cpu_seconds_total[$__rate_interval])` | Prometheus CPU usage. | +| Query | Purpose | +| :---------------------------------------------------------------------------------------------------------- | :--------------------------------------- | +| `sum(increase(_http_requests_total[$__range]))` | Total requests over time. 
| +| `sum(rate(_http_requests_total{ok="true"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100` | % of successful HTTP requests. | +| `sum(rate(_http_requests_total{ok="false"}[$__range])) / sum(rate(_http_requests_total[$__range])) * 100` | % of failed HTTP requests. | +| `topk(15, sum by (route, method, status)(rate(_http_requests_total[$__rate_interval]))) > 0` | Top 15 routes by traffic. | +| `nodejs_eventloop_lag_seconds` | Event loop lag (Node.js responsiveness). | +| `irate(node_pressure_cpu_waiting_seconds_total{...})` | CPU pressure. | +| `irate(node_pressure_memory_waiting_seconds_total{...})` | Memory pressure. | +| `irate(node_pressure_io_waiting_seconds_total{...})` | I/O pressure. | +| `irate(node_pressure_irq_stalled_seconds_total{...})` | IRQ pressure. | +| `100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[$__rate_interval])))` | CPU usage %. | +| `scalar(node_load1) * 100 / count(count(node_cpu_seconds_total) by (cpu))` | Load per CPU. | +| `(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100` | Memory usage %. | +| `(node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100` | Root filesystem usage %. | +| `count(count(node_cpu_seconds_total) by (cpu))` | Total CPU cores. | +| `node_time_seconds - node_boot_time_seconds` | Uptime. | +| `node_reboot_required` | Check for pending reboots. | +| `sum(traefik_open_connections{entrypoint=~"$entrypoint"}) by (entrypoint)` | Active Traefik connections. | +| `topk(15, label_replace(sum by (service, code)(rate(traefik_service_requests_total{...})), ...))` | Top Traefik services by requests. | +| `prometheus_ready` | Prometheus health. | +| `sum(prometheus_target_scrape_pool_targets)` | Target count. | +| `sum(increase(prometheus_notifications_sent_total[10m]))` | Alerts sent. | +| `rate(process_cpu_seconds_total[$__rate_interval])` | Prometheus CPU usage. 
| --- @@ -125,70 +127,359 @@ This documentation outlines the setup, configuration, and usage of the Prometheu ### Defined Alerts -| Alert Name | Expression | Threshold | Duration | Severity | Description | -| --------------------------- | -------------------------------------------- | --------- | -------- | -------- | -------------------------------------- | -| NodeDown | `up{job="node_exporter"} == 0` | No data | 1m | critical | Node Exporter is down. | -| HighMemoryUsage | `(1 - (MemAvailable / MemTotal)) * 100 > 80` | >80% | 10s | warning | Memory usage exceeds threshold. | -| HighCPUUsage | `100 - avg(rate(idle_cpu[2m])) * 100 > 85` | >85% | 2m | warning | CPU usage is high. | -| LowDiskSpace | `UsedDisk > 90%` | >90% | 3m | warning | Disk space is running low. | -| NodeExporterMissing | `absent(up{job="node_exporter"})` | none | 1m | critical | Node Exporter is not reporting at all. | -| HighTraefikErrorRate | `5xx errors > 5%` | >5% | 2m | warning | High HTTP error rate in Traefik. | -| AppEndpointDown | `up{job="adonisjs-app"} == 0` | No data | 1m | critical | AdonisJS app is unreachable. | -| PrometheusSelfScrapeFailing | `up{job="prometheus"} == 0` | No data | 1m | warning | Prometheus cannot scrape itself. | +| Alert Name | Expression | Threshold | Duration | Severity | Description | +| :-------------------------- | :------------------------------------------------------------------------------- | :-------------------- | :------- | :------- | :-------------------------------------------------------------------------------------------- | +| `NodeDown` | `up{job="node_exporter"} == 0` | No heartbeat | 1m | `critical` | Node Exporter is down. | +| `HighMemoryUsage` | `(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80` | `>80%` memory used | 10s | `warning` | Memory usage on instance exceeded `80%`. 
| +| `HighCPUUsage` | `100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 85` | `>85%` CPU usage | 2m | `warning` | CPU usage on instance exceeded `85%` for 2 minutes. | +| `LowDiskSpace` | `(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs\|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs\|overlay"})) * 100 > 90` | `>90%` disk used | 3m | `warning` | Disk usage on instance exceeded `90%`. | +| `NodeExporterMissing` | `absent(up{job="node_exporter"})` | No `node_exporter` targets | 1m | `critical` | Prometheus is not receiving metrics from any node_exporter targets. | +| `HighTraefikErrorRate` | `(sum(rate(traefik_service_requests_total{code=~"5..",protocol="http"}[2m])) / sum(rate(traefik_service_requests_total{protocol="http"}[2m]))) > 0.05` | `>5%` 5xx errors | 2m | `warning` | More than `5%` of HTTP requests through Traefik resulted in `5xx` errors. | +| `AppEndpointDown` | `up{job="adonisjs-app"} == 0` | No `adonisjs-app` heartbeat | 1m | `critical` | No response from AdonisJS app. | +| `PrometheusSelfScrapeFailing` | `up{job="prometheus"} == 0` | No self-scrape | 1m | `warning` | Prometheus is not scraping itself. | + +--- ### Response Procedures -| Alert | Standard Procedure | -| --------------------------- | --------------------------------------------------------- | -| NodeDown | Verify server uptime and Prometheus target config. | -| HighMemoryUsage | Check app/container memory usage, restart if needed. | -| HighCPUUsage | Check for app overload or infinite loops. | -| LowDiskSpace | Clear unused files or extend volume. | -| NodeExporterMissing | Ensure service is running; check Docker logs. | -| HighTraefikErrorRate | Inspect service logs for 5xx error causes. | -| AppEndpointDown | Confirm AdonisJS is running and reachable. | -| PrometheusSelfScrapeFailing | Ensure Prometheus instance is healthy and not overloaded. 
| +| **Alert** | **Description** | **Condition** | **Duration** | **Severity** | **Action Message** | +| ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------- | ------------ | ------------ | ---------------------------------------------------------------------------------------------------------------- | +| `NodeDown` | The `node_exporter` instance has not reported metrics for **at least 1 minute**. This indicates the server is likely offline or its monitoring agent has failed. | No heartbeat | 1m | `critical` | No heartbeat from **node\_exporter** at **{{ \$labels.instance }}** for over 1 minute. Check server and network. | +| `HighMemoryUsage` | The node's memory usage has exceeded **80%** for **at least 10 seconds**. This indicates memory pressure that could lead to degraded performance or OOM errors. | `>80%` memory used | 10s | `warning` | Memory usage on **{{ \$labels.instance }}** exceeded **80%**. Consider freeing memory. | +| `HighCPUUsage` | The node's CPU usage has exceeded **85%** for a continuous period of **2 minutes**. This can cause slow application response times and general system sluggishness. | `>85%` CPU usage | 2m | `warning` | CPU usage on **{{ \$labels.instance }}** exceeded **85%** for 2 minutes. Investigate. | +| `LowDiskSpace` | Disk usage on a non-temporary filesystem has exceeded **90%** for **at least 3 minutes**. This can lead to application failures and system instability. | `>90%` disk used | 3m | `warning` | Disk usage on **{{ \$labels.instance }}** exceeded **90%**. Clean files or increase storage. | +| `NodeExporterMissing` | Prometheus is **not receiving metrics from any `node_exporter` targets** for **at least 1 minute**. This may indicate a wider issue with exporters or discovery. 
| No `node_exporter` targets | 1m | `critical` | Prometheus is **not receiving metrics** from any node\_exporter targets. Check deployment. | +| `HighTraefikErrorRate` | More than **5% of HTTP requests** through Traefik resulted in **5xx (server error) status codes** for a continuous period of **2 minutes**. Backend issues are likely. | `>5%` 5xx errors | 2m | `warning` | More than **5%** of HTTP requests through Traefik resulted in **5xx errors**. | +| `AppEndpointDown` | The **AdonisJS application's Prometheus metrics endpoint is not reachable (`up` is 0)** for **at least 1 minute**. This suggests the app is down or unresponsive. | No `adonisjs-app` heartbeat | 1m | `critical` | No response from AdonisJS app at **{{ \$labels.instance }}**. | +| `PrometheusSelfScrapeFailing` | **Prometheus is unable to scrape its own metrics endpoint** (`up` is 0) for **at least 1 minute**. This is a critical check of Prometheus's own health. | No self-scrape | 1m | `warning` | Prometheus is not reporting its own metrics. | + +--- + +### Response Procedures - Detailed Guide + +#### **Alert Name: `NodeDown`** + +* **What the alert means:** This `critical` alert triggers when Prometheus has not received any metrics from a `node_exporter` instance for **at least 1 minute**. It specifically means that the monitored node is either entirely offline, has network connectivity issues preventing Prometheus from reaching it, or the `node_exporter` service itself has stopped. +* **Possible Causes:** + * The physical server or virtual machine is powered off or crashed. + * Network partitioning or connectivity loss between the Prometheus server and the target node. + * The `node_exporter` service on the target node is stopped, crashed, or not responding. + * Firewall rules (on the node or network path) are blocking Prometheus's access to the `node_exporter` port (default 9100). + * Incorrect IP address or port configured for the node in Prometheus's `scrape_config`. 
+* **Recommended Troubleshooting Steps:** + 1. **Check basic network connectivity:** From the Prometheus server, ping the IP address or hostname of the affected node (`ping {{ $labels.instance }}`). + 2. **Verify `node_exporter` service status on the target node:** SSH into the problematic node (if reachable) and execute `sudo systemctl status node_exporter` (for systemd) or `sudo service node_exporter status` to check if the service is running. If not, inspect logs with `journalctl -u node_exporter`. + 3. **Test `node_exporter` endpoint:** From the Prometheus server (or any machine with `curl`), try to access the `node_exporter` metrics endpoint: `curl http://{{ $labels.instance }}:9100/metrics`. This checks if the exporter is serving metrics. + 4. **Review Prometheus targets status:** In the Prometheus UI, navigate to `Status -> Targets` and search for the instance `{{ $labels.instance }}`. Look for any specific scrape errors or reasons for being down. + 5. **Check Firewall rules:** Ensure no firewalls (e.g., `ufw`, `firewalld`, cloud security groups, network ACLs) are blocking port 9100 (or your custom `node_exporter` port) between Prometheus and the node. +* **Resolution Procedures:** + 1. **Restart `node_exporter`:** If the service is stopped, start it (`sudo systemctl start node_exporter`). + 2. **Power on/Reboot server:** If the node itself is down, investigate the underlying hardware or virtualization issue and power it back on. + 3. **Resolve network issues:** Fix any identified network connectivity problems (e.g., misconfigured network interfaces, switch issues, routing problems). + 4. **Adjust firewall rules:** Open the necessary port for `node_exporter` in your firewall configurations. + 5. **Correct Prometheus configuration:** If the `scrape_config` for the node is incorrect in `prometheus.yml`, update it and reload/restart Prometheus. 
+ +--- + +#### **Alert Name: `HighMemoryUsage`** + +* **What the alert means:** This `warning` alert triggers when the available memory on a node falls to a point where **less than 20% of the total memory is free** (meaning memory usage exceeds 80%) for **at least 10 seconds**. This indicates that the node is under significant memory pressure, which can lead to performance degradation, excessive swapping (if enabled), and potential out-of-memory errors for applications. +* **Possible Causes:** + * Memory leak within one or more running applications. + * Sudden increase in workload demanding more memory than available. + * Incorrectly configured applications or services consuming more memory than expected (e.g., large cache sizes, unoptimized processes). + * Insufficient physical RAM allocated to the node for its current and anticipated workload. + * Inefficient garbage collection or resource management by applications. +* **Recommended Troubleshooting Steps:** + 1. **Identify top memory consumers:** SSH into the node and use tools like `top`, `htop`, or `ps aux --sort=-%mem | head -n 10` to list processes by memory consumption. + 2. **Review application logs:** Check the logs of the applications identified as high memory consumers for any errors, warnings, or indications of memory-related issues. + 3. **Correlate with recent changes:** Determine if any recent deployments, code changes, or configuration updates coincided with the memory increase. + 4. **Analyze memory trends:** Use the "Memory Basic" graph on the **System Stats Dashboard** (`resource-dashboard.png`) to observe the trend of memory usage over a longer period. Is it a gradual climb (leak) or a sudden spike (workload)? +* **Resolution Procedures:** + 1. **Restart memory-intensive applications:** If a specific application is identified as problematic, try restarting it to free up its allocated memory. (Note: This is a temporary fix; investigate the root cause). + 2. 
**Optimize application configuration:** Adjust memory limits for services (e.g., JVM heap sizes for Java applications, Docker container memory limits). + 3. **Scale resources:** If the consistent workload genuinely requires more memory, consider adding more RAM to the node or scaling out the application to additional nodes. + 4. **Implement memory leak detection/prevention:** For long-term solutions, work with development teams to identify and fix memory leaks in application code. + 5. **Clean up temporary files/caches:** In some cases, large temporary files or caches can consume significant memory or disk space, indirectly impacting available RAM. + +--- + +#### **Alert Name: `HighCPUUsage`** + +* **What the alert means:** This `warning` alert triggers when a node's average CPU idle time drops significantly, indicating that **CPU usage has exceeded 85% for a continuous period of 2 minutes**. High CPU usage can lead to delayed processing, slow application response times, and overall system sluggishness. +* **Possible Causes:** + * Intensive computational tasks or batch jobs running. + * Application bug leading to an infinite loop, busy-waiting, or highly inefficient code. + * Unusually high legitimate user traffic or a potential Denial of Service (DoS) attack. + * Misconfigured cron jobs or background processes consuming excessive CPU. + * Insufficient CPU cores allocated to the node for its current workload. +* **Recommended Troubleshooting Steps:** + 1. **Identify top CPU consumers:** SSH into the node and use `top`, `htop`, or `ps aux --sort=-%cpu | head -n 10` to list processes by CPU consumption. Look for processes using consistently high percentages. + 2. **Review application logs:** Check the logs of the applications identified as CPU-intensive consumers for any errors, warnings, or abnormal processing patterns. + 3. 
**Correlate with recent changes:** Investigate any recent code deployments, configuration changes, or new scheduled tasks that might have increased CPU demand. + 4. **Analyze CPU trends:** Use the "CPU Basic" graph on the **System Stats Dashboard** (`resource-dashboard.png`) to observe the trend of CPU usage over a longer period. Is it a sustained high usage or sudden spikes? +* **Resolution Procedures:** + 1. **Optimize CPU-intensive applications:** Work with development teams to identify and optimize inefficient code or algorithms. + 2. **Throttle or reschedule tasks:** If background tasks are causing CPU spikes, consider scheduling them during off-peak hours or limiting their concurrency. + 3. **Scale resources:** If the consistent workload genuinely requires more processing power, consider adding more CPU cores to the node or scaling out the application to additional nodes. + 4. **Mitigate DoS attacks:** If a DoS attack is suspected, implement appropriate security measures (e.g., rate limiting, WAF). + 5. **Restart problematic applications:** As a temporary measure, restarting an application might alleviate a CPU spike, but the root cause should be identified. + +--- + +#### **Alert Name: `LowDiskSpace`** + +* **What the alert means:** This `warning` alert triggers when the disk usage on any non-temporary filesystem (excluding `tmpfs` and `overlay` filesystems, which are typically in-memory or ephemeral) exceeds **90%** for a duration of **3 minutes**. Running out of disk space can lead to application failures, inability to write logs, system instability, and potential data corruption. +* **Possible Causes:** + * Application logs growing excessively without proper rotation or retention policies. + * Temporary files not being cleaned up by applications or the system. + * Large data writes by applications (e.g., database backups, new data imports). + * Creation of large snapshots or backups consuming significant space. 
+ * User errors such as inadvertently storing large files in critical directories. + * `/tmp` or `/var/tmp` filling up. +* **Recommended Troubleshooting Steps:** + 1. **Check overall disk usage:** SSH into the node and use `df -h` to see the percentage usage for all mounted filesystems. Identify the specific filesystem that is full. + 2. **Identify large directories/files:** Use `du -sh /*` to get a summary of top-level directories, then drill down into the largest ones (e.g., `du -sh /var/*`, `du -sh /opt/*`). Pay particular attention to `/var/log`, `/tmp`, and any application data directories. + 3. **Review log rotation settings:** Verify that `logrotate` (or equivalent) is correctly configured and working for all major services that generate logs. + 4. **Check temporary file cleanup:** Ensure that cron jobs or application configurations are set up to periodically clean old temporary files. + 5. **Analyze disk space trends:** Use the "Disk Space Used Basic" graph on the **System Stats Dashboard** (`resource-dashboard.png`) to observe how quickly disk space is being consumed. +* **Resolution Procedures:** + 1. **Clear temporary files:** Delete old or unneeded files from `/tmp`, `/var/tmp`, or application-specific temporary directories. + 2. **Rotate/Compress/Delete logs:** Manually rotate, compress, or delete old log files. Adjust `logrotate` configurations for automatic management. + 3. **Remove old backups/snapshots:** If large backups or snapshots are consuming space, delete outdated ones. + 4. **Identify and remove unnecessary large files:** If large, non-essential files are found, delete them. + 5. **Expand disk size:** If the current disk size is legitimately insufficient for the workload, expand the disk volume or add new storage. 
+ +--- + +#### **Alert Name: `NodeExporterMissing`** + +* **What the alert means:** This `critical` alert triggers when **no `up` series with the `job="node_exporter"` label exists for at least 1 minute**, i.e. Prometheus has no `node_exporter` targets at all (a target that is configured but failing to scrape still reports `up == 0` and triggers `NodeDown` instead). This typically means the `node_exporter` service itself is not deployed, or Prometheus is misconfigured and cannot discover it. It's a more general alert than `NodeDown`, indicating a systemic issue with `node_exporter` monitoring. +* **Possible Causes:** + * `node_exporter` is not installed or running on any of your intended nodes. + * Prometheus `scrape_config` for `node_exporter` job is incorrect or missing. + * Prometheus has lost network connectivity to all intended `node_exporter` targets. + * A global firewall rule is blocking all `node_exporter` traffic. + * Service discovery mechanisms (e.g., DNS, file_sd) used by Prometheus for `node_exporter` targets are failing. +* **Recommended Troubleshooting Steps:** + 1. **Verify `node_exporter` deployment:** Confirm that `node_exporter` is installed and running on at least one target node you expect to monitor. + 2. **Check Prometheus `scrape_config`:** Review the `prometheus.yml` file to ensure the `job_name: "node_exporter"` section is correctly defined and includes valid `static_configs` or service discovery configurations. + 3. **Check Prometheus targets status:** In the Prometheus UI, navigate to `Status -> Targets`. Check if any `node_exporter` targets are listed and, if so, what their scrape status is. If no targets are listed for `node_exporter`, the issue is likely in Prometheus's configuration or service discovery. + 4. **Test `node_exporter` endpoint connectivity (generic):** From the Prometheus server, try to `curl` a known `node_exporter` instance's metrics endpoint. + 5. **Review Prometheus logs:** Check Prometheus's own logs for any errors related to scraping or target discovery for the `node_exporter` job. 
+* **Resolution Procedures:** + 1. **Deploy/Start `node_exporter`:** Ensure `node_exporter` is properly installed, configured, and running on your target nodes. + 2. **Correct Prometheus `scrape_config`:** Fix any errors in the `node_exporter` job configuration within `prometheus.yml` and reload/restart Prometheus. + 3. **Resolve network connectivity:** Address any widespread network issues preventing Prometheus from reaching your `node_exporter` instances. + 4. **Adjust global firewall rules:** Ensure no overarching firewall rules are blocking `node_exporter` traffic. + 5. **Fix service discovery:** If using a service discovery mechanism, troubleshoot its configuration and ensure it's correctly providing target information to Prometheus. + +--- + +#### **Alert Name: `HighTraefikErrorRate`** + +* **What the alert means:** This `warning` alert triggers when the percentage of HTTP requests through Traefik that result in **5xx (server error) status codes exceeds 5% of all HTTP requests for a continuous period of 2 minutes**. This indicates that your backend services are experiencing significant issues and are unable to process requests successfully. +* **Possible Causes:** + * Backend services are crashing, experiencing internal errors, or are generally unhealthy. + * Backend services are overloaded and cannot handle the current request volume, leading to timeouts or internal server errors. + * Database connectivity issues or problems with other critical external dependencies of your backend services. + * Misconfiguration in Traefik's routing rules or load balancing, directing traffic to unhealthy backends. + * Resource exhaustion (CPU, memory, disk I/O) on the hosts running the backend services. +* **Recommended Troubleshooting Steps:** + 1. 
**Identify affected services/endpoints:** Use the "HTTP Details" and "5xx over 1m" graphs on the **Traefik Dashboard** (`error-dashboard.png` and `error-dashboard-2.png`) to identify which specific services or endpoints are returning 5xx errors. + 2. **Check backend service logs:** Access the logs of the services identified as problematic. Look for stack traces, database connection errors, unhandled exceptions, or any other error messages indicating the root cause. + 3. **Verify backend service health and resources:** Check the health status and resource utilization (CPU, memory, network, disk) of the servers or containers hosting the affected backend services (refer to `resource-dashboard.png` if necessary). + 4. **Test backend services directly:** If possible, bypass Traefik and try to access the problematic backend services directly to isolate whether the issue lies with Traefik or the backend itself. + 5. **Review recent deployments:** Check for any recent code deployments or configuration changes to the backend services or Traefik that might have introduced regressions. +* **Resolution Procedures:** + 1. **Restart problematic backend services:** As a first step, restart the affected backend services. Monitor if the error rate drops. + 2. **Scale backend services:** If the services are overloaded, scale them horizontally (add more instances) or vertically (add more resources to existing instances). + 3. **Rollback deployments:** If a recent deployment is suspected, roll back the code to a known stable version. + 4. **Fix application bugs:** Work with development teams to identify and fix bugs in the application code that are causing internal server errors. + 5. **Check external dependencies:** Ensure databases, message queues, external APIs, and other critical dependencies are healthy and accessible to your backend services. 
+ +--- + +#### **Alert Name: `AppEndpointDown`** + +* **What the alert means:** This `critical` alert triggers specifically when your **AdonisJS application's Prometheus metrics endpoint is not reachable (`up` is 0) for at least 1 minute**. This directly signifies that your AdonisJS application service is either down, crashed, or its metrics endpoint is inaccessible. +* **Possible Causes:** + * The AdonisJS application process has crashed or been stopped. + * Network connectivity issues between Prometheus and the host/port where the AdonisJS app exposes its metrics. + * The AdonisJS application is overloaded and has become unresponsive, including its metrics endpoint. + * A firewall (on the host or network) is blocking access to the AdonisJS application's port. + * Incorrect `scrape_config` for the `adonisjs-app` job in `prometheus.yml`. +* **Recommended Troubleshooting Steps:** + 1. **Verify AdonisJS application process status:** SSH into the server hosting the AdonisJS app and check its process status (e.g., `pm2 status`, `sudo systemctl status adonisjs-app`, or check Docker container status). + 2. **Check AdonisJS application logs:** Examine the logs of your AdonisJS application for any crash reports, startup errors, or unhandled exceptions. + 3. **Test AdonisJS metrics endpoint:** From the Prometheus server (or a diagnostic machine), attempt to `curl` the AdonisJS application's metrics endpoint directly (e.g., `curl http://{{ $labels.instance }}:<port>/metrics`). + 4. **Review Prometheus targets status:** In the Prometheus UI, navigate to `Status -> Targets` and search for the `adonisjs-app` job and its instance `{{ $labels.instance }}`. Look for specific scrape errors. + 5. **Check Firewall rules:** Ensure no firewalls are blocking the port on which your AdonisJS application is listening for HTTP requests and/or metrics. +* **Resolution Procedures:** + 1. **Restart AdonisJS application:** If the process is stopped or crashed, try restarting it. + 2. 
**Troubleshoot application startup issues:** If the app fails to start, investigate its configuration, environment variables, and dependencies. + 3. **Address network connectivity:** Resolve any network issues preventing access to the application's port. + 4. **Scale application resources:** If the app is unresponsive due to overload, consider scaling it horizontally (add more instances) or vertically (add more resources to its host). + 5. **Correct Prometheus configuration:** If the `scrape_config` for the `adonisjs-app` job is incorrect in `prometheus.yml`, update it and reload/restart Prometheus. + +--- + +#### **Alert Name: `PrometheusSelfScrapeFailing`** + +* **What the alert means:** This `warning` alert triggers when **Prometheus itself is unable to scrape its own metrics endpoint for at least 1 minute**. This is a critical internal health check, as it indicates that Prometheus, the core of your monitoring system, is experiencing issues and may not be reliably collecting or alerting on *any* other metrics. +* **Possible Causes:** + * The Prometheus service has stopped, crashed, or is in an unhealthy state. + * Prometheus's own HTTP server (which exposes its metrics) is unresponsive due to overload or an internal error. + * A local firewall on the Prometheus server is blocking Prometheus from connecting to its own `localhost:9090` (or its configured port) endpoint. + * Resource exhaustion on the Prometheus server itself (e.g., high CPU, out of memory, full disk preventing writes to TSDB). + * Incorrect `scrape_config` for its `localhost` target within `prometheus.yml`. +* **Recommended Troubleshooting Steps:** + 1. **Check Prometheus service status:** On the Prometheus server, execute `sudo systemctl status prometheus` (for systemd) or check its Docker container status (`docker ps -a`) if running in a container. + 2. 
**Test Prometheus's own metrics endpoint:** From the Prometheus server itself, try to `curl http://localhost:9090/metrics` to see if it responds with Prometheus's internal metrics. + 3. **Review Prometheus logs:** Examine Prometheus's own logs for any errors, warnings, or indications of internal problems (e.g., `journalctl -u prometheus` or check container logs). + 4. **Check Prometheus server resources:** Use `top`, `htop`, `df -h` on the Prometheus server to ensure it has sufficient CPU, memory, and disk space. High I/O wait can also cause issues. + 5. **Review Prometheus configuration for self-scrape:** Ensure the `scrape_config` section for `job_name: 'prometheus'` in `prometheus.yml` is correctly defined (it typically scrapes `localhost:9090`). +* **Resolution Procedures:** + 1. **Restart Prometheus service:** `sudo systemctl restart prometheus`. This often resolves transient issues. + 2. **Address resource exhaustion:** If Prometheus is running out of resources, allocate more CPU/memory to its host, or optimize its storage and retention settings. + 3. **Correct configuration:** If the `scrape_config` for Prometheus's own metrics is incorrect, fix it in `prometheus.yml` and reload/restart Prometheus. + 4. **Adjust local firewall:** Ensure the local firewall on the Prometheus server is not blocking Prometheus from accessing its own port. + 5. **Clear disk space (if applicable):** If the disk where Prometheus stores its Time-Series Database (TSDB) is full, free up space. ### Alert Grouping and Routing -- Alerts are grouped by `alertname` for clarity. -- Discord integration via: +* Alerts are grouped by `alertname` for clarity. +* Discord integration via: + * `alertmanager-discord-relay` + * Receives all alert payloads + * Uses Discord webhook from environment variable + +--- + +## 5. 
Dashboard Catalog + +This section provides an overview of the key Grafana dashboards used to visualize system health and application performance, along with their purpose, key metrics, and use cases. + +### 5.1. System Stats Dashboard (Resource Utilization) + +![Resource Dashboard](./images/resource-dashboard.png) + +* **Purpose:** Provides a high-level overview of server resource health (CPU, memory, disk, network). Essential for quick checks of infrastructure well-being. +* **Key Metrics & Visualizations:** + * **Quick CPU / Mem / Disk:** Gauges for immediate visibility into CPU Busy, Sys Load, RAM Used, and Root FS Used percentages. + * **Basic CPU / Mem / Net / Disk:** Time-series graphs showing trends for CPU utilization, memory usage (including available, total, cached), network traffic (Tx/Rx), and detailed disk space usage by mount point. + * **CPU Cores, RAM Total, RootFS Total, Uptime:** Static indicators providing context about the system's capacity and duration of operation. +* **Target Audience:** Operations Team, System Administrators. +* **Use Cases:** + * Quickly identify resource bottlenecks. + * Monitor system stability after deployments or configuration changes. + * Diagnose performance issues related to underlying infrastructure. + * Proactively identify low disk space or high memory pressure. + +### 5.2. DevOps Dashboard (Application Performance Overview) + +![Main Dashboard](./images/main-dashboard.png) +![Main Dashboard 2](./images/main-dashboard-2.png) + +* **Purpose:** Offers a comprehensive view of application performance, primarily focusing on metrics from Traefik (HTTP traffic) and the AdonisJS application (custom metrics). +* **Key Metrics & Visualizations:** + * **Total HTTP Requests:** Overall request count to identify traffic volume. + * **Max Event Loop Lag:** Measures the responsiveness of the Node.js event loop in the AdonisJS application, indicating potential bottlenecks under load. 
+ * **Overall Success Rate of Requests / Overall Error Rate of Requests:** High-level indicators of application health from an HTTP perspective. + * **Most Requested Endpoint / Most Requested Services:** Identify popular or heavily used API endpoints and services. + * **Connections per Entrypoint:** Tracks concurrent connections handled by Traefik entrypoints, showing current load. + * **2xx over 1m / Other codes over 1m:** Monitor rates of successful (2xx) and non-successful (e.g., 3xx, 4xx) HTTP responses, highlighting potential client-side issues or redirects. + * **Prometheus Ready, Targets, Active Alertmanagers, Sent Alerts (10m):** Provides direct insight into the health and activity of the monitoring system itself. +* **Target Audience:** DevOps Engineers, Developers, Operations Team. +* **Use Cases:** + * Monitor overall application health and user traffic. + * Quickly spot performance degradations or error spikes. + * Understand which services/endpoints are most active. + * Verify the health of the monitoring stack. + +### 5.3. Traefik Dashboard (HTTP Details & Troubleshooting) + +![Error Dashboard](./images/error-dashboard.png) +![Error Dashboard 2](./images/error-dashboard-2.png) + +* **Purpose:** Provides granular details about HTTP traffic handled by Traefik, primarily for deep-diving into request patterns, error rates, and service performance. Essential for troubleshooting application and routing issues. +* **Key Metrics & Visualizations:** + * **HTTP Code Pie Chart:** Visual breakdown of HTTP status codes (2xx, 3xx, 4xx, 5xx) to quickly see the distribution of responses. + * **Requests Per Entrypoint:** Details the volume of requests coming into Traefik via different entrypoints. + * **Apdex score:** Measures user satisfaction with the response time of your application. + * **Top Slow Services:** Identifies services with the highest latency, pointing to performance bottlenecks in specific backends. 
+ * **Services failing SLO of 300ms/1200ms:** Tracks services violating predefined Service Level Objectives (SLOs) based on response times. + * **HTTP Details (2xx, 5xx, Other codes over 1m):** Specific time-series graphs breaking down success rates and various error types, enabling precise identification of when errors occurred. + * **Request Sizes / Response Sizes:** Analyze the size of requests and responses to identify potential bandwidth issues or unusually large data transfers. + * **Connections per Entrypoint:** Detailed view of active connections for specific Traefik entrypoints. +* **Target Audience:** DevOps Engineers, Network Engineers, Developers, Operations Team. +* **Use Cases:** + * Pinpoint the exact services and timeframes of HTTP errors. + * Troubleshoot routing issues or misconfigurations within Traefik. + * Identify slow-performing services and analyze latency trends. + * Monitor compliance with SLOs for application responsiveness. + * Deep-dive into network traffic patterns. + +--- - - `alertmanager-discord-relay` - - Receives all alert payloads - - Uses Discord webhook from environment variable +## 6. Security Measures and Considerations + +Ensuring the security of your monitoring system is paramount as it often contains sensitive data about your infrastructure and applications. + +* **Authentication & Authorization:** + * **Grafana:** Configure strong authentication (e.g., OAuth, LDAP, or robust username/password with 2FA). Implement role-based access control (RBAC) to restrict dashboard and data source access based on user roles (e.g., read-only for developers, admin for ops). + * **Prometheus:** While Prometheus itself doesn't have built-in user authentication, its web UI and API endpoints should be secured, ideally by placing it behind a reverse proxy (like Traefik) that handles authentication and authorization. + * **Alertmanager:** Secure Alertmanager's UI and API endpoints, preferably also behind an authenticated reverse proxy. 
+* **Network Segmentation & Firewalling:** + * **Isolate Monitoring Stack:** Deploy Prometheus, Grafana, and Alertmanager in a dedicated, isolated network segment or VLAN. + * **Strict Firewall Rules:** Implement strict firewall rules to allow only necessary traffic: + * Prometheus can only scrape exporters (e.g., Node Exporter on port 9100, custom app metrics on specific ports). + * Grafana can only query Prometheus. + * Alertmanager can only receive alerts from Prometheus and send notifications to allowed endpoints (e.g., Discord webhook relay). + * Limit inbound access to Grafana/Prometheus/Alertmanager UIs to specific IPs or VPNs. +* **Data in Transit (Encryption):** + * **HTTPS:** All web UIs (Grafana, Prometheus, Alertmanager) should be served over HTTPS to encrypt data in transit. + * **Exporter Scrapes:** While Prometheus's default scraping is HTTP, consider configuring `tls_config` in Prometheus for secure (HTTPS) scraping of exporters, especially for sensitive environments. +* **Data at Rest (Disk Encryption):** + * Ensure the underlying storage for Prometheus's TSDB (Time-Series Database) and Grafana's database (if applicable) is encrypted at rest using OS-level or volume-level encryption. +* **Access Control for Configuration Files:** + * Protect sensitive configuration files (`prometheus.yml`, `alertmanager.yml`, Grafana configs, `.env` files with webhooks/credentials) using strict file permissions (e.g., `chmod 600`) and restrict access to authorized users only. +* **Regular Updates:** + * Keep all components (Prometheus, Grafana, Alertmanager, exporters, Docker, OS) updated to their latest stable versions to benefit from security patches. +* **Audit Logging:** + * Enable and review audit logs for Grafana and the underlying operating system to track access and configuration changes. --- -## How to Run the Monitoring Stack +## 7. How to Run the Monitoring Stack -1. Create external network `main-network` if not already present: +1. 
Create external network `main-network` if not already present: -```bash -docker network create main-network -``` + ```bash + docker network create main-network + ``` -2. Navigate to your project root and launch the monitoring stack: +2. Navigate to your project root and launch the monitoring stack: -```bash -docker compose -f compose.monitor.yml up -d -``` + ```bash + docker compose -f compose.monitor.yml up -d + ``` -3. Ensure you have a valid `.env` file containing: +3. Ensure you have a valid `.env` file containing: - - `DISCORD_WEBHOOK_URL` - - `GRAFANA_URL_FQDN` - - `DOCKER_USER_GROUP` + * `DISCORD_WEBHOOK_URL` + * `GRAFANA_URL_FQDN` + * `DOCKER_USER_GROUP` -4. Access Grafana: +4. Access Grafana: -``` -https:// -``` + ``` + https://<GRAFANA_URL_FQDN> + ``` --- -## Screenshot Evidence of Working Prometheus & Grafana Installation +## 8. Screenshot Evidence of Working Prometheus & Grafana Installation ### Prometheus Connection on Grafana diff --git a/docs/simulation-documentation.md b/docs/simulation-documentation.md index b4fdec1..e3c962c 100644 --- a/docs/simulation-documentation.md +++ b/docs/simulation-documentation.md @@ -2,15 +2,13 @@ ## Overview -This script simulates realistic user interactions with an API-based chat application for the purpose of testing and generating traffic. It performs the following: +This script simulates realistic user interactions with an API-based chat application to generate traffic and test endpoint behavior. It is suitable for: -- User registration and login -- Chat creation -- Retrieval and update of chat data -- Sending and retrieving messages -- Cleanup by deleting the chat +- Load testing +- Development environment traffic simulation +- Integration and stability checks -This script is ideal for load testing, development environment traffic simulation, and basic integration checks. +The script loops indefinitely and performs varied API operations with randomized timing to mimic organic usage patterns.
--- @@ -24,16 +22,14 @@ simulation/index.js ## Requirements -- [Node.js](https://nodejs.org/) installed -- [pnpm](https://pnpm.io/) as the package manager -- `.env` file with the following environment variables: - -``` +- [Node.js](https://nodejs.org/) +- [pnpm](https://pnpm.io/) (or your preferred Node.js package manager) +- A `.env` file with the following environment variables: +```env API_BASE_URL=https://api.brainbytes.mcube.uk TEST_EMAIL=test@email.com TEST_PASSWORD=yourpassword - ``` --- @@ -46,7 +42,7 @@ TEST_PASSWORD=yourpassword cd simulation ``` -2. **Install dependencies (if you haven’t already):** +2. **Install dependencies (if needed):** ```bash pnpm install @@ -58,50 +54,75 @@ TEST_PASSWORD=yourpassword pnpm run dev ``` + > ⚠️ The script runs indefinitely in a loop. Use `CTRL+C` to stop it manually. + --- -## Script Breakdown +## Script Flow + +1. **Register user** + Attempts to register using credentials from `.env`. Silent fail if already exists. + +2. **Login user** + Sends a login request and retrieves an `accessToken`. + +3. **Get or create a chat** + Checks for existing chats. If none exist, it creates one using a default prompt. + +4. **Simulation loop** + In each cycle: -### 1. `.env` Usage + - Randomly shuffles and performs API requests: -The script reads these values from your `.env` file: + - `GET /me` + - `GET /chats` + - `GET /chats/:id` + - `PATCH /chats/:id` (renames the chat) + - `GET /chats/:id/messages` -- `API_BASE_URL`: Base URL of the backend API -- `TEST_EMAIL` & `TEST_PASSWORD`: Credentials for simulated user + - Introduces randomized delays between requests to mimic human usage. -### 2. User Flow +--- + +## Code Features + +### Randomization + +- **Delays**: Each API call is followed by a randomized delay (`20ms–100ms`). +- **Shuffling**: Task execution order is randomized per cycle. -The simulation follows this sequence: +### Error Handling -1. **Register** the user (silently fails if already exists) -2. 
**Login** and retrieve an `accessToken` -3. **Create a chat** -4. Perform the following actions using the `accessToken`: - - Fetch `/me` - - Get all chats - - Get specific chat by ID - - Update chat name - - Send a message - - Retrieve messages - - Delete the chat +- Errors thrown inside a cycle (e.g., network failures) are logged but don't interrupt the loop; HTTP error statuses returned by the cycle's requests are not checked. +- Login errors and missing `accessToken`s throw descriptive messages; they occur before the loop starts, so they stop the script. --- -## Output +## Console Output + +You will see output like: + +``` +No chat found. Creating one... +Bombarding chat ID: abc123 +[2025-06-30T10:45:12.654Z] Cycle complete. +``` -You will see console logs like: +If a request fails during a cycle: ``` -Create chat response: { chat: { id: "abc123", ... } } -Simulated run finished. +Error during bombardment: fetch failed ``` --- ## Troubleshooting -- Ensure `.env` is correctly configured in the `simulation` directory. -- Make sure the API server is running and accessible at `API_BASE_URL`. -- If you see `Login failed: 401`, check the credentials in `.env`. +| Issue | Possible Cause | Solution | +| --------------------------- | --------------------------------------- | -------------------------------------- | +| `Login failed: 401` | Wrong credentials or backend issue | Double-check `.env` credentials | +| `Failed to create chat` | Invalid request body or API error | Confirm backend supports POST `/chats` | +| Script does nothing | Missing `.env` or wrong path | Ensure `.env` is in the `simulation` directory | +| Too many requests / blocked | Server rate-limiting or DDoS protection | Add longer delays or limit iterations | --- diff --git a/simulation/index.js b/simulation/index.js index 61758fe..e7eb12a 100644 --- a/simulation/index.js +++ b/simulation/index.js @@ -9,8 +9,20 @@ const userCreds = { lastName: "Ngo", }; -const dummyChat1 = { prompt: "Can you teach me about biology?" }; -const dummyChat2 = { prompt: "How does cells work?" }; +const dummyChat = { prompt: "Can you teach me about biology?"
}; + +function randomDelay(min = 20, max = 100) { + return new Promise((res) => + setTimeout(res, Math.floor(Math.random() * (max - min + 1)) + min) + ); +} + +function shuffleArray(array) { + for (let i = array.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + [array[i], array[j]] = [array[j], array[i]]; + } +} async function registerUser() { try { @@ -44,20 +56,20 @@ async function login() { return data.accessToken; } -async function simulate() { - await registerUser(); - const token = await login(); +async function getOrCreateChat(headers) { + const getRes = await fetch(`${BASE_URL}/chats`, { headers }); + const chats = await getRes.json(); - const headers = { - "Content-Type": "application/json", - cookie: `accessToken=${token}`, - }; + if (Array.isArray(chats) && chats.length > 0) { + console.log(`Found existing chat: ${chats[0].id}`); + return chats[0].id; + } - // Create chat first + console.log("No chat found. Creating one..."); const createRes = await fetch(`${BASE_URL}/chats`, { method: "POST", headers, - body: JSON.stringify(dummyChat1), + body: JSON.stringify(dummyChat), }); if (!createRes.ok) { @@ -68,47 +80,53 @@ async function simulate() { } const data = await createRes.json(); - console.log("Create chat response:", data); - - const { chat } = data; - if (!chat || !chat.id) { + if (!data.chat || !data.chat.id) { throw new Error("Chat creation failed. No chat object returned."); } - // Proceed only after chat is confirmed - // 1. Get /me - await fetch(`${BASE_URL}/me`, { headers }); - - // 2. Get all chats - await fetch(`${BASE_URL}/chats`, { headers }); - - // 3. Get chat by ID - await fetch(`${BASE_URL}/chats/${chat.id}`, { headers }); - - // 4. Update chat name - await fetch(`${BASE_URL}/chats/${chat.id}`, { - method: "PATCH", - headers, - body: JSON.stringify({ name: "Updated from traffic sim" }), - }); - - // 5. 
Send message - await fetch(`${BASE_URL}/chats/${chat.id}/messages`, { - method: "POST", - headers, - body: JSON.stringify(dummyChat2), - }); + return data.chat.id; +} - // 6. Get messages - await fetch(`${BASE_URL}/chats/${chat.id}/messages`, { headers }); +async function simulateLoop() { + await registerUser(); + const token = await login(); - // 7. Delete chat - await fetch(`${BASE_URL}/chats/${chat.id}`, { - method: "DELETE", - headers, - }); + const headers = { + "Content-Type": "application/json", + cookie: `accessToken=${token}`, + }; - console.log(`Simulated run finished.`); + const chatId = await getOrCreateChat(headers); + console.log(`Bombarding chat ID: ${chatId}`); + + const tasks = [ + async () => await fetch(`${BASE_URL}/me`, { headers }), + async () => await fetch(`${BASE_URL}/chats`, { headers }), + async () => await fetch(`${BASE_URL}/chats/${chatId}`, { headers }), + async () => + await fetch(`${BASE_URL}/chats/${chatId}`, { + method: "PATCH", + headers, + body: JSON.stringify({ name: `Updated ${Date.now()}` }), + }), + async () => + await fetch(`${BASE_URL}/chats/${chatId}/messages`, { headers }), + ]; + + while (true) { + try { + shuffleArray(tasks); + for (const task of tasks) { + await task(); + await randomDelay(); // Random delay between requests + } + console.log(`[${new Date().toISOString()}] Cycle complete.`); + } catch (err) { + console.error("Error during bombardment:", err.message); + } + + await randomDelay(); + } } -simulate(); \ No newline at end of file +simulateLoop();