From 47edab1a7e45d0a890b586239c6f2ca3f60b10ec Mon Sep 17 00:00:00 2001 From: garden-zero Date: Wed, 1 Apr 2026 23:17:18 +0900 Subject: [PATCH 1/3] =?UTF-8?q?feat:=20=EA=B5=AC=EC=A1=B0=ED=99=94=20?= =?UTF-8?q?=EB=A1=9C=EA=B7=B8=20=EB=B0=8F=20Alloy=20=EC=88=98=EC=A7=91?= =?UTF-8?q?=EA=B8=B0=20=EC=84=A4=EC=A0=95=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build.gradle | 3 + docker/alloy-config.alloy | 95 +++++++++++++++++++++++++++++ docker/docker-compose.app.yml | 27 +++++++- src/main/resources/logback-dev.xml | 5 +- src/main/resources/logback-prod.xml | 8 +-- 5 files changed, 129 insertions(+), 9 deletions(-) create mode 100644 docker/alloy-config.alloy diff --git a/build.gradle b/build.gradle index 76cf250f..0db97a69 100644 --- a/build.gradle +++ b/build.gradle @@ -67,6 +67,9 @@ dependencies { //Retry implementation 'org.springframework.retry:spring-retry' implementation 'org.springframework:spring-aspects' + + //Logging + implementation 'net.logstash.logback:logstash-logback-encoder:8.0' } tasks.named('test') { diff --git a/docker/alloy-config.alloy b/docker/alloy-config.alloy new file mode 100644 index 00000000..427ad7e5 --- /dev/null +++ b/docker/alloy-config.alloy @@ -0,0 +1,95 @@ +// ============================================================ +// Alloy Configuration - SilRok Monitoring +// Replaces Promtail: log collection + metric scrape & push +// ============================================================ + +// ---- Log Collection ---- + +// Discover Spring Boot log files (blue/green) +local.file_match "spring_logs" { + path_targets = [ + {__path__ = "/var/log/docker/spring/**/*.log", job = "spring-boot", app = "silrok"}, + ] + sync_period = "5s" +} + +// Read matched log files +loki.source.file "spring_logs" { + targets = local.file_match.spring_logs.targets + forward_to = [loki.process.spring_pipeline.receiver] + + file_watch { + min_poll_frequency = "250ms" + max_poll_frequency = 
"5s" + } +} + +// Parse JSON logs and extract labels +loki.process "spring_pipeline" { + forward_to = [loki.write.default.receiver] + + stage.json { + expressions = { + level = "level", + logger = "logger_name", + thread = "thread_name", + } + } + + // Only `level` is promoted to a label: logger/thread values are unbounded and + // would create one Loki stream per class/thread. Filter them with `| json`. + stage.labels { + values = { + level = "", + } + } +} + +// Push logs to Loki on monitoring EC2 +loki.write "default" { + endpoint { + url = "http://" + env("LOKI_HOST") + ":3100/loki/api/v1/push" + } + external_labels = { + source = "alloy", + } +} + +// ---- Metric Collection ---- + +// Scrape Spring Boot Actuator metrics (container port 8080, not the published host port) +prometheus.scrape "spring_boot" { + targets = [ + {"__address__" = "blue:8080", "instance" = "blue"}, + {"__address__" = "green:8080", "instance" = "green"}, + ] + metrics_path = "/actuator/prometheus" + scrape_interval = "15s" + forward_to = [prometheus.remote_write.default.receiver] +} + +// Scrape Redis Exporter metrics (Data EC2) +prometheus.scrape "redis" { + targets = [ + {"__address__" = env("DATA_HOST") + ":9121", "instance" = "redis_cache"}, + ] + scrape_interval = "15s" + forward_to = [prometheus.remote_write.default.receiver] +} + +// Scrape RabbitMQ metrics (Data EC2) +prometheus.scrape "rabbitmq" { + targets = [ + {"__address__" = env("DATA_HOST") + ":15692", "instance" = "rabbitmq_broker"}, + ] + metrics_path = "/metrics" + scrape_interval = "15s" + forward_to = [prometheus.remote_write.default.receiver] +} + +// Push metrics to Prometheus on monitoring EC2 +prometheus.remote_write "default" { + endpoint { + url = "http://" + env("PROMETHEUS_HOST") + ":9090/api/v1/write" + } +} diff --git a/docker/docker-compose.app.yml b/docker/docker-compose.app.yml index bf744ebb..c5a97108 100644 --- a/docker/docker-compose.app.yml +++ b/docker/docker-compose.app.yml @@ -11,7 +11,7 @@ services: ports: - "${BLUE_PORT:-8080}:8080" volumes: - - ${BLUE_LOG_PATH:-./docker_logs/spring/blue}:/app/logs + - ${BLUE_LOG_PATH:-./docker_logs/spring/blue}:/opt/app/logs networks: - 
app_network restart: always @@ -34,7 +34,7 @@ services: ports: - "${GREEN_PORT:-8081}:8080" volumes: - - ${GREEN_LOG_PATH:-./docker_logs/spring/green}:/app/logs + - ${GREEN_LOG_PATH:-./docker_logs/spring/green}:/opt/app/logs networks: - app_network restart: always @@ -45,6 +45,29 @@ services: retries: 5 start_period: 20s + alloy: + image: grafana/alloy:latest + container_name: my-alloy + environment: + - TZ=Asia/Seoul + - LOKI_HOST=${LOKI_HOST} + - PROMETHEUS_HOST=${PROMETHEUS_HOST} + - DATA_HOST=${DATA_HOST} + volumes: + - ./alloy-config.alloy:/etc/alloy/config.alloy + - alloy_positions:/alloy + - ${BLUE_LOG_PATH:-./docker_logs/spring/blue}:/var/log/docker/spring/blue:ro + - ${GREEN_LOG_PATH:-./docker_logs/spring/green}:/var/log/docker/spring/green:ro + command: + - run + - /etc/alloy/config.alloy + networks: + - app_network + restart: unless-stopped + +volumes: + alloy_positions: + networks: app_network: external: true diff --git a/src/main/resources/logback-dev.xml b/src/main/resources/logback-dev.xml index 9b8c8ddd..10d1614d 100644 --- a/src/main/resources/logback-dev.xml +++ b/src/main/resources/logback-dev.xml @@ -15,15 +15,14 @@ ${LOG_PATH}/logfile.%d{yyyy-MM-dd}.log.gz 30 - - %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + traceId - diff --git a/src/main/resources/logback-prod.xml b/src/main/resources/logback-prod.xml index c9128d38..524e5b43 100644 --- a/src/main/resources/logback-prod.xml +++ b/src/main/resources/logback-prod.xml @@ -4,8 +4,8 @@ - - %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + traceId @@ -15,8 +15,8 @@ ${LOG_PATH}/logfile.%d{yyyy-MM-dd}.log.gz 30 - - %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + traceId From 060165cc74342680af34e3edddd57d3f31dd5287 Mon Sep 17 00:00:00 2001 From: garden-zero Date: Wed, 1 Apr 2026 23:17:52 +0900 Subject: [PATCH 2/3] =?UTF-8?q?feat:=20=EB=AA=A8=EB=8B=88=ED=84=B0?= 
=?UTF-8?q?=EB=A7=81=20=EC=95=8C=EB=A6=BC=20=EB=B0=8F=20=EB=A9=94=ED=8A=B8?= =?UTF-8?q?=EB=A6=AD=20=EC=88=98=EC=A7=91=20=EC=9D=B8=ED=94=84=EB=9D=BC=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker/alert-rules.yml | 63 +++++++++++++++++++ docker/alertmanager.yml | 25 ++++++++ docker/docker-compose.infra.yml | 15 +++++ docker/docker-compose.monitor.yml | 36 +++++++---- .../grafana/provisioning/datasources/loki.yml | 5 +- .../provisioning/datasources/prometheus.yml | 1 + 6 files changed, 131 insertions(+), 14 deletions(-) create mode 100644 docker/alert-rules.yml create mode 100644 docker/alertmanager.yml diff --git a/docker/alert-rules.yml b/docker/alert-rules.yml new file mode 100644 index 00000000..3442d9eb --- /dev/null +++ b/docker/alert-rules.yml @@ -0,0 +1,63 @@ +groups: + - name: instance + rules: + - alert: InstanceDown + expr: up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.job }}/{{ $labels.instance }} has been down for more than 1 minute." + + - name: spring-boot + rules: + - alert: HighErrorRate + expr: > + sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) + / + sum(rate(http_server_requests_seconds_count[5m])) + > 0.05 + for: 5m + labels: + severity: critical + annotations: + summary: "High 5xx error rate" + description: "5xx error rate is above 5% for 5 minutes (current: {{ $value | humanizePercentage }})." + + - alert: HighMemoryUsage + expr: > + sum by (job, instance) (jvm_memory_used_bytes{area="heap"}) + / + sum by (job, instance) (jvm_memory_max_bytes{area="heap"} > 0) + > 0.9 + for: 5m + labels: + severity: warning + annotations: + summary: "JVM heap memory usage > 90%" + description: "{{ $labels.instance }} JVM heap usage is {{ $value | humanizePercentage }}." 
+ + - name: infrastructure + rules: + - alert: DiskSpaceLow + expr: > + (node_filesystem_avail_bytes{mountpoint="/"} + / + node_filesystem_size_bytes{mountpoint="/"}) + < 0.2 + for: 5m + labels: + severity: warning + annotations: + summary: "Disk space low on {{ $labels.instance }}" + description: "Available disk space is below 20% (current: {{ $value | humanizePercentage }} free)." + + - alert: RabbitMQQueueBacklog + expr: rabbitmq_queue_messages > 1000 + for: 5m + labels: + severity: warning + annotations: + summary: "RabbitMQ queue backlog on {{ $labels.queue }}" + description: "Queue {{ $labels.queue }} has {{ $value }} messages (threshold: 1000)." diff --git a/docker/alertmanager.yml b/docker/alertmanager.yml new file mode 100644 index 00000000..0e28d44f --- /dev/null +++ b/docker/alertmanager.yml @@ -0,0 +1,25 @@ +global: + resolve_timeout: 5m + +route: + receiver: slack + group_by: ['alertname', 'job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + +receivers: + - name: slack + slack_configs: + - api_url: '${SLACK_WEBHOOK_URL}' + channel: '${SLACK_CHANNEL}' + send_resolved: true + title: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}' + text: >- + *Alert:* {{ .CommonLabels.alertname }} + *Severity:* {{ .CommonLabels.severity }} + *Description:* {{ .CommonAnnotations.description }} + *Details:* + {{ range .Alerts }} + - *{{ .Labels.instance }}*: {{ .Annotations.description }} + {{ end }} diff --git a/docker/docker-compose.infra.yml b/docker/docker-compose.infra.yml index e92f58b7..e9eb4611 100644 --- a/docker/docker-compose.infra.yml +++ b/docker/docker-compose.infra.yml @@ -37,6 +37,21 @@ services: timeout: 10s retries: 5 + redis-exporter: + image: oliver006/redis_exporter:v1.58.0 + container_name: redis-exporter + environment: + - TZ=Asia/Seoul + - REDIS_ADDR=redis://redis:6379 + ports: + - "9121:9121" + networks: + - infra_net + restart: unless-stopped + depends_on: + redis: + condition: service_healthy + volumes: redis_data: diff 
--git a/docker/docker-compose.monitor.yml b/docker/docker-compose.monitor.yml index 92b8fb77..6fdeaee9 100644 --- a/docker/docker-compose.monitor.yml +++ b/docker/docker-compose.monitor.yml @@ -4,12 +4,15 @@ services: container_name: my-loki environment: - TZ=Asia/Seoul + - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} + - S3_BUCKET=${S3_BUCKET} ports: - "${LOKI_PORT:-3100}:3100" volumes: - ${LOKI_CONFIG_PATH:-./loki-config.yml}:/etc/loki/loki-config.yml - ${LOKI_DATA_VOLUME:-loki_data}:/loki - command: -config.file=/etc/loki/loki-config.yml + command: -config.file=/etc/loki/loki-config.yml -config.expand-env=true networks: - monitor_net restart: unless-stopped @@ -28,8 +31,9 @@ services: - "${GRAFANA_PORT:-3000}:3000" volumes: - ${GRAFANA_DATA_VOLUME:-grafana_data}:/var/lib/grafana - - ${GRAFANA_LOG_PATH:-./docker_logs/grafana}:/var/log/grafana # Adjusted default path + - ${GRAFANA_LOG_PATH:-./docker_logs/grafana}:/var/log/grafana - ./grafana/provisioning:/etc/grafana/provisioning + - ./grafana/dashboards:/var/lib/grafana/dashboards networks: - monitor_net restart: always @@ -41,19 +45,25 @@ services: prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v2.49.1} - container_name: ${PROMETHEUS_CONTAINER_NAME:-my-prometheus} + container_name: my-prometheus environment: - TZ=Asia/Seoul ports: - "${PROMETHEUS_PORT:-9090}:9090" volumes: - ${PROMETHEUS_CONFIG_PATH:-./prometheus.yml}:/etc/prometheus/prometheus.yml + - ./alert-rules.yml:/etc/prometheus/alert-rules.yml - ${PROMETHEUS_DATA_VOLUME:-prometheus_data}:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=15d' + - '--web.enable-remote-write-receiver' networks: - monitor_net restart: always healthcheck: - test: [ "CMD", "curl", "-f", "http://prometheus:9090/-/ready" ] + test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9090/-/ready" ] interval: 10s timeout: 5s retries: 5 @@ 
-63,17 +73,19 @@ services: max-size: "10m" max-file: "3" - promtail: - image: grafana/promtail:${PROMTAIL_VERSION:-3.0.0} - container_name: ${PROMTAIL_CONTAINER_NAME:-my-promtail} + alertmanager: + image: prom/alertmanager:v0.27.0 + container_name: my-alertmanager environment: - TZ=Asia/Seoul + - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} + - SLACK_CHANNEL=${SLACK_CHANNEL} + ports: + - "9093:9093" volumes: - - ${PROMTAIL_CONFIG_PATH:-./promtail-config.yml}:/etc/promtail/promtail-config.yml - - ${DOCKER_LOGS_PATH:-./docker_logs}:/var/log/docker - - /var/lib/docker/containers:/var/lib/docker/containers:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - command: -config.file=/etc/promtail/promtail-config.yml + - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' networks: - monitor_net restart: unless-stopped diff --git a/docker/grafana/provisioning/datasources/loki.yml b/docker/grafana/provisioning/datasources/loki.yml index 43f7a874..729ee83c 100644 --- a/docker/grafana/provisioning/datasources/loki.yml +++ b/docker/grafana/provisioning/datasources/loki.yml @@ -2,6 +2,7 @@ apiVersion: 1 datasources: - name: Loki + uid: silrok-logs type: loki access: proxy url: http://my-loki:3100 @@ -10,7 +11,7 @@ datasources: jsonData: maxLines: 1000 derivedFields: - - datasourceUid: prometheus + - datasourceUid: silrok-prometheus matcherRegex: "traceID=(\\w+)" name: TraceID - url: "$${__value.raw}" \ No newline at end of file + url: "$${__value.raw}" diff --git a/docker/grafana/provisioning/datasources/prometheus.yml b/docker/grafana/provisioning/datasources/prometheus.yml index 4ad96bbe..4652a6ed 100644 --- a/docker/grafana/provisioning/datasources/prometheus.yml +++ b/docker/grafana/provisioning/datasources/prometheus.yml @@ -2,6 +2,7 @@ apiVersion: 1 datasources: - name: Prometheus + uid: silrok-prometheus type: prometheus access: proxy url: http://my-prometheus:9090 From 
489f5a1e9730511807cdf25f58a3ad57abf94dc1 Mon Sep 17 00:00:00 2001 From: garden-zero Date: Wed, 1 Apr 2026 23:18:05 +0900 Subject: [PATCH 3/3] =?UTF-8?q?feat:=20Grafana=20=EB=AA=A8=EB=8B=88?= =?UTF-8?q?=ED=84=B0=EB=A7=81=20=EB=8C=80=EC=8B=9C=EB=B3=B4=EB=93=9C=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker/grafana/dashboards/infrastructure.json | 549 ++++++++++++++++++ docker/grafana/dashboards/logs.json | 166 ++++++ docker/grafana/dashboards/spring-boot.json | 341 +++++++++++ .../provisioning/dashboards/dashboards.yml | 12 + 4 files changed, 1068 insertions(+) create mode 100644 docker/grafana/dashboards/infrastructure.json create mode 100644 docker/grafana/dashboards/logs.json create mode 100644 docker/grafana/dashboards/spring-boot.json create mode 100644 docker/grafana/provisioning/dashboards/dashboards.yml diff --git a/docker/grafana/dashboards/infrastructure.json b/docker/grafana/dashboards/infrastructure.json new file mode 100644 index 00000000..daeaf0c4 --- /dev/null +++ b/docker/grafana/dashboards/infrastructure.json @@ -0,0 +1,549 @@ +{ + "uid": "silrok-infrastructure", + "title": "Infrastructure", + "tags": [ + "redis", + "rabbitmq", + "silrok" + ], + "timezone": "Asia/Seoul", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + }, + "panels": [ + { + "type": "row", + "title": "Redis Overview", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "collapsed": false + }, + { + "type": "stat", + "title": "Connected Clients", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "targets": [ + { + "expr": "redis_connected_clients", + "legendFormat": "clients" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + 
"value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "stat", + "title": "Used Memory", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "targets": [ + { + "expr": "redis_memory_used_bytes", + "legendFormat": "used" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "stat", + "title": "Hit Rate", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "targets": [ + { + "expr": "redis_keyspace_hits_total / (redis_keyspace_hits_total + redis_keyspace_misses_total)", + "legendFormat": "hit rate" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 0.8 + }, + { + "color": "green", + "value": 0.95 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "stat", + "title": "Ops/sec", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "targets": [ + { + "expr": "rate(redis_commands_processed_total[5m])", + "legendFormat": "ops/s" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "row", + "title": "Redis Detail", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 
+ }, + "collapsed": false + }, + { + "type": "timeseries", + "title": "Memory Over Time", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 6 + }, + "targets": [ + { + "expr": "redis_memory_used_bytes", + "legendFormat": "used" + }, + { + "expr": "redis_memory_max_bytes", + "legendFormat": "max" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + } + }, + { + "type": "timeseries", + "title": "Commands/sec", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 6 + }, + "targets": [ + { + "expr": "rate(redis_commands_processed_total[5m])", + "legendFormat": "cmd/s" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + } + }, + { + "type": "timeseries", + "title": "Hits vs Misses", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 6 + }, + "targets": [ + { + "expr": "rate(redis_keyspace_hits_total[5m])", + "legendFormat": "hits" + }, + { + "expr": "rate(redis_keyspace_misses_total[5m])", + "legendFormat": "misses" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + } + }, + { + "type": "row", + "title": "RabbitMQ Overview", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "collapsed": false + }, + { + "type": "stat", + "title": "Queue Messages", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 15 + }, + "targets": [ + { + "expr": "sum(rabbitmq_queue_messages)", + "legendFormat": "messages" + } + ], + "fieldConfig": { + 
"defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "stat", + "title": "Publish Rate", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 15 + }, + "targets": [ + { + "expr": "sum(rate(rabbitmq_queue_messages_published_total[5m]))", + "legendFormat": "pub/s" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "stat", + "title": "Connections", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 15 + }, + "targets": [ + { + "expr": "rabbitmq_connections", + "legendFormat": "connections" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "row", + "title": "RabbitMQ Detail", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "collapsed": false + }, + { + "type": "timeseries", + "title": "Queue Depth Over Time", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "targets": [ + { + "expr": "rabbitmq_queue_messages", + "legendFormat": "{{queue}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + 
"lineWidth": 1 + } + } + } + }, + { + "type": "timeseries", + "title": "Message Rates", + "datasource": { + "type": "prometheus", + "uid": "silrok-prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "targets": [ + { + "expr": "sum(rate(rabbitmq_queue_messages_published_total[5m]))", + "legendFormat": "publish" + }, + { + "expr": "sum(rate(rabbitmq_queue_messages_delivered_total[5m]))", + "legendFormat": "deliver" + }, + { + "expr": "sum(rate(rabbitmq_queue_messages_acknowledged_total[5m]))", + "legendFormat": "ack" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + } + } + ] +} diff --git a/docker/grafana/dashboards/logs.json b/docker/grafana/dashboards/logs.json new file mode 100644 index 00000000..d9f16a4b --- /dev/null +++ b/docker/grafana/dashboards/logs.json @@ -0,0 +1,166 @@ +{ + "uid": "silrok-logs", + "title": "Logs", + "tags": ["loki", "logs", "silrok"], + "timezone": "Asia/Seoul", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "level", + "type": "custom", + "query": "ERROR,WARN,INFO,DEBUG", + "current": { + "selected": true, + "text": ["ERROR", "WARN", "INFO"], + "value": ["ERROR", "WARN", "INFO"] + }, + "multi": true, + "includeAll": true, + "allValue": ".*", + "options": [] + }, + { + "name": "logger", + "type": "textbox", + "current": { + "text": "", + "value": "" + }, + "options": [] + } + ] + }, + "panels": [ + { + "type": "row", + "title": "Overview", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "collapsed": false + }, + { + "type": "timeseries", + "title": "Log Volume by Level", + "datasource": { "type": "loki", "uid": "silrok-logs" }, + "gridPos": { "h": 8, "w": 18, "x": 0, "y": 1 }, + "targets": [ + { + "expr": "sum by (level)(count_over_time({job=\"spring-boot\"} | json | level=~\"$level\" 
[1m]))", + "legendFormat": "{{level}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "pointSize": 5, + "lineWidth": 1, + "stacking": { "mode": "normal" } + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "ERROR" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "WARN" }, + "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "INFO" }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "DEBUG" }, + "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] + } + ] + } + }, + { + "type": "stat", + "title": "Error Count (1h)", + "datasource": { "type": "loki", "uid": "silrok-logs" }, + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 1 }, + "targets": [ + { + "expr": "sum(count_over_time({job=\"spring-boot\"} | json | level=\"ERROR\" [1h]))", + "legendFormat": "errors" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 10 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] } + } + }, + { + "type": "row", + "title": "Logs", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, + "collapsed": false + }, + { + "type": "logs", + "title": "Log Stream", + "datasource": { "type": "loki", "uid": "silrok-logs" }, + "gridPos": { "h": 16, "w": 24, "x": 0, "y": 10 }, + "targets": [ + { + "expr": "{job=\"spring-boot\"} | json | level=~\"$level\" | logger_name=~\".*$logger.*\"", + "legendFormat": "" + } + ], + "options": { + "showTime": true, + "showLabels": false, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + 
"enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + } + }, + { + "type": "logs", + "title": "Error Logs", + "datasource": { "type": "loki", "uid": "silrok-logs" }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 26 }, + "targets": [ + { + "expr": "{job=\"spring-boot\"} | json | level=\"ERROR\"", + "legendFormat": "" + } + ], + "options": { + "showTime": true, + "showLabels": false, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + } + } + ] +} diff --git a/docker/grafana/dashboards/spring-boot.json b/docker/grafana/dashboards/spring-boot.json new file mode 100644 index 00000000..62051b86 --- /dev/null +++ b/docker/grafana/dashboards/spring-boot.json @@ -0,0 +1,341 @@ +{ + "uid": "silrok-spring-boot", + "title": "Spring Boot", + "tags": ["spring-boot", "silrok"], + "timezone": "Asia/Seoul", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "instance", + "type": "query", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "query": "label_values(process_uptime_seconds, instance)", + "refresh": 2, + "includeAll": false, + "multi": false, + "sort": 1, + "current": {} + } + ] + }, + "panels": [ + { + "type": "row", + "title": "Overview", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "collapsed": false + }, + { + "type": "stat", + "title": "Uptime", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "targets": [ + { + "expr": "process_uptime_seconds{instance=~\"$instance\"}", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": 
["lastNotNull"] } + } + }, + { + "type": "stat", + "title": "Request Rate", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "targets": [ + { + "expr": "sum(rate(http_server_requests_seconds_count{instance=~\"$instance\"}[5m]))", + "legendFormat": "req/s" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] } + } + }, + { + "type": "stat", + "title": "Error Rate (5xx)", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "targets": [ + { + "expr": "sum(rate(http_server_requests_seconds_count{instance=~\"$instance\", status=~\"5..\"}[5m])) / sum(rate(http_server_requests_seconds_count{instance=~\"$instance\"}[5m]))", + "legendFormat": "error rate" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] } + } + }, + { + "type": "stat", + "title": "Avg Response Time", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "targets": [ + { + "expr": "sum(rate(http_server_requests_seconds_sum{instance=~\"$instance\"}[5m])) / sum(rate(http_server_requests_seconds_count{instance=~\"$instance\"}[5m]))", + "legendFormat": "avg" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "red", "value": 1 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] } + } + }, + { + 
"type": "row", + "title": "HTTP", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "collapsed": false + }, + { + "type": "timeseries", + "title": "Request Rate by Status", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "targets": [ + { + "expr": "sum by (status)(rate(http_server_requests_seconds_count{instance=~\"$instance\"}[5m]))", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + } + }, + { + "type": "timeseries", + "title": "Response Time p95 / p99", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le)(rate(http_server_requests_seconds_bucket{instance=~\"$instance\"}[5m])))", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, sum by (le)(rate(http_server_requests_seconds_bucket{instance=~\"$instance\"}[5m])))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + } + }, + { + "type": "row", + "title": "JVM Memory", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "collapsed": false + }, + { + "type": "timeseries", + "title": "Heap Used vs Max", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 15 }, + "targets": [ + { + "expr": "sum(jvm_memory_used_bytes{instance=~\"$instance\", area=\"heap\"})", + "legendFormat": "used" + }, + { + "expr": "sum(jvm_memory_max_bytes{instance=~\"$instance\", area=\"heap\"})", + "legendFormat": "max" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + 
} + }, + { + "type": "timeseries", + "title": "Non-Heap Memory", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 15 }, + "targets": [ + { + "expr": "sum(jvm_memory_used_bytes{instance=~\"$instance\", area=\"nonheap\"})", + "legendFormat": "used" + }, + { + "expr": "sum(jvm_memory_committed_bytes{instance=~\"$instance\", area=\"nonheap\"})", + "legendFormat": "committed" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + } + }, + { + "type": "timeseries", + "title": "GC Pause Time", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 15 }, + "targets": [ + { + "expr": "sum by (gc)(rate(jvm_gc_pause_seconds_sum{instance=~\"$instance\"}[5m]))", + "legendFormat": "{{gc}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + } + }, + { + "type": "row", + "title": "JVM Threads & CPU", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "collapsed": false + }, + { + "type": "timeseries", + "title": "Threads by State", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "targets": [ + { + "expr": "jvm_threads_states_threads{instance=~\"$instance\"}", + "legendFormat": "{{state}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 30, + "pointSize": 5, + "lineWidth": 1, + "stacking": { "mode": "normal" } + } + } + } + }, + { + "type": "timeseries", + "title": "CPU Usage", + "datasource": { "type": "prometheus", "uid": "silrok-prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "targets": [ + { + "expr": "process_cpu_usage{instance=~\"$instance\"}", + "legendFormat": "process" + }, + { + "expr": 
"system_cpu_usage{instance=~\"$instance\"}", + "legendFormat": "system" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "pointSize": 5, + "lineWidth": 1 + } + } + } + } + ] +} diff --git a/docker/grafana/provisioning/dashboards/dashboards.yml b/docker/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 00000000..732bc954 --- /dev/null +++ b/docker/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: silrok + orgId: 1 + folder: SilRok + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false