diff --git a/build.gradle b/build.gradle
index 76cf250..0db97a6 100644
--- a/build.gradle
+++ b/build.gradle
@@ -67,6 +67,9 @@ dependencies {
//Retry
implementation 'org.springframework.retry:spring-retry'
implementation 'org.springframework:spring-aspects'
+
+ //Logging
+ implementation 'net.logstash.logback:logstash-logback-encoder:8.0'
}
tasks.named('test') {
diff --git a/docker/alert-rules.yml b/docker/alert-rules.yml
new file mode 100644
index 0000000..3442d9e
--- /dev/null
+++ b/docker/alert-rules.yml
@@ -0,0 +1,63 @@
+groups:
+ - name: instance
+ rules:
+ - alert: InstanceDown
+ expr: up == 0
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Instance {{ $labels.instance }} down"
+ description: "{{ $labels.job }}/{{ $labels.instance }} has been down for more than 1 minute."
+
+ - name: spring-boot
+ rules:
+ - alert: HighErrorRate
+ expr: >
+ sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
+ /
+ sum(rate(http_server_requests_seconds_count[5m]))
+ > 0.05
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "High 5xx error rate"
+ description: "5xx error rate is above 5% for 5 minutes (current: {{ $value | humanizePercentage }})."
+
+ - alert: HighMemoryUsage
+ expr: >
+          sum by (instance) (jvm_memory_used_bytes{area="heap"})
+          /
+          sum by (instance) (jvm_memory_max_bytes{area="heap"} > 0)
+ > 0.9
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "JVM heap memory usage > 90%"
+ description: "{{ $labels.instance }} JVM heap usage is {{ $value | humanizePercentage }}."
+
+ - name: infrastructure
+ rules:
+ - alert: DiskSpaceLow
+ expr: >
+ (node_filesystem_avail_bytes{mountpoint="/"}
+ /
+ node_filesystem_size_bytes{mountpoint="/"})
+ < 0.2
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Disk space low on {{ $labels.instance }}"
+ description: "Available disk space is below 20% (current: {{ $value | humanizePercentage }} free)."
+
+ - alert: RabbitMQQueueBacklog
+ expr: rabbitmq_queue_messages > 1000
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "RabbitMQ queue backlog on {{ $labels.queue }}"
+ description: "Queue {{ $labels.queue }} has {{ $value }} messages (threshold: 1000)."
diff --git a/docker/alertmanager.yml b/docker/alertmanager.yml
new file mode 100644
index 0000000..0e28d44
--- /dev/null
+++ b/docker/alertmanager.yml
@@ -0,0 +1,25 @@
+global:
+ resolve_timeout: 5m
+
+route:
+ receiver: slack
+ group_by: ['alertname', 'job']
+ group_wait: 30s
+ group_interval: 5m
+ repeat_interval: 4h
+
+receivers:
+ - name: slack
+ slack_configs:
+      - api_url: '${SLACK_WEBHOOK_URL}'  # NOTE(review): Alertmanager does not expand env vars in its config — confirm an entrypoint substitutes this (or switch to api_url_file) before startup
+ channel: '${SLACK_CHANNEL}'
+ send_resolved: true
+ title: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
+ text: >-
+ *Alert:* {{ .CommonLabels.alertname }}
+ *Severity:* {{ .CommonLabels.severity }}
+ *Description:* {{ .CommonAnnotations.description }}
+ *Details:*
+ {{ range .Alerts }}
+ - *{{ .Labels.instance }}*: {{ .Annotations.description }}
+ {{ end }}
diff --git a/docker/alloy-config.alloy b/docker/alloy-config.alloy
new file mode 100644
index 0000000..427ad7e
--- /dev/null
+++ b/docker/alloy-config.alloy
@@ -0,0 +1,95 @@
+// ============================================================
+// Alloy Configuration - SilRok Monitoring
+// Replaces Promtail: log collection + metric scrape & push
+// ============================================================
+
+// ---- Log Collection ----
+
+// Discover Spring Boot log files (blue/green)
+local.file_match "spring_logs" {
+ path_targets = [
+ {__path__ = "/var/log/docker/spring/**/*.log", job = "spring-boot", app = "silrok"},
+ ]
+ sync_period = "5s"
+}
+
+// Read matched log files
+loki.source.file "spring_logs" {
+ targets = local.file_match.spring_logs.targets
+ forward_to = [loki.process.spring_pipeline.receiver]
+
+ file_watch {
+ min_poll_frequency = "250ms"
+ max_poll_frequency = "5s"
+ }
+}
+
+// Parse JSON logs and extract labels
+loki.process "spring_pipeline" {
+ forward_to = [loki.write.default.receiver]
+
+ stage.json {
+ expressions = {
+ level = "level",
+ logger = "logger_name",
+ thread = "thread_name",
+ }
+ }
+
+ stage.labels {
+ values = {
+ level = "",
+ logger = "",
+ thread = "",
+ }
+ }
+}
+
+// Push logs to Loki on monitoring EC2
+loki.write "default" {
+ endpoint {
+ url = "http://" + env("LOKI_HOST") + ":3100/loki/api/v1/push"
+ }
+ external_labels = {
+ source = "alloy",
+ }
+}
+
+// ---- Metric Collection ----
+
+// Scrape Spring Boot Actuator metrics
+prometheus.scrape "spring_boot" {
+ targets = [
+ {"__address__" = "blue:8080", "instance" = "blue"},
+    {"__address__" = "green:8080", "instance" = "green"},
+ ]
+ metrics_path = "/actuator/prometheus"
+ scrape_interval = "15s"
+ forward_to = [prometheus.remote_write.default.receiver]
+}
+
+// Scrape Redis Exporter metrics (Data EC2)
+prometheus.scrape "redis" {
+ targets = [
+ {"__address__" = env("DATA_HOST") + ":9121", "instance" = "redis_cache"},
+ ]
+ scrape_interval = "15s"
+ forward_to = [prometheus.remote_write.default.receiver]
+}
+
+// Scrape RabbitMQ metrics (Data EC2)
+prometheus.scrape "rabbitmq" {
+ targets = [
+ {"__address__" = env("DATA_HOST") + ":15692", "instance" = "rabbitmq_broker"},
+ ]
+ metrics_path = "/metrics"
+ scrape_interval = "15s"
+ forward_to = [prometheus.remote_write.default.receiver]
+}
+
+// Push metrics to Prometheus on monitoring EC2
+prometheus.remote_write "default" {
+ endpoint {
+ url = "http://" + env("PROMETHEUS_HOST") + ":9090/api/v1/write"
+ }
+}
diff --git a/docker/docker-compose.app.yml b/docker/docker-compose.app.yml
index bf744eb..c5a9710 100644
--- a/docker/docker-compose.app.yml
+++ b/docker/docker-compose.app.yml
@@ -11,7 +11,7 @@ services:
ports:
- "${BLUE_PORT:-8080}:8080"
volumes:
- - ${BLUE_LOG_PATH:-./docker_logs/spring/blue}:/app/logs
+ - ${BLUE_LOG_PATH:-./docker_logs/spring/blue}:/opt/app/logs
networks:
- app_network
restart: always
@@ -34,7 +34,7 @@ services:
ports:
- "${GREEN_PORT:-8081}:8080"
volumes:
- - ${GREEN_LOG_PATH:-./docker_logs/spring/green}:/app/logs
+ - ${GREEN_LOG_PATH:-./docker_logs/spring/green}:/opt/app/logs
networks:
- app_network
restart: always
@@ -45,6 +45,29 @@ services:
retries: 5
start_period: 20s
+ alloy:
+    image: grafana/alloy:${ALLOY_VERSION:-v1.4.2}
+ container_name: my-alloy
+ environment:
+ - TZ=Asia/Seoul
+ - LOKI_HOST=${LOKI_HOST}
+ - PROMETHEUS_HOST=${PROMETHEUS_HOST}
+ - DATA_HOST=${DATA_HOST}
+ volumes:
+ - ./alloy-config.alloy:/etc/alloy/config.alloy
+ - alloy_positions:/alloy
+ - ${BLUE_LOG_PATH:-./docker_logs/spring/blue}:/var/log/docker/spring/blue:ro
+ - ${GREEN_LOG_PATH:-./docker_logs/spring/green}:/var/log/docker/spring/green:ro
+    # --storage.path keeps tail positions in the alloy_positions volume so
+    # restarts do not re-ingest previously shipped log lines
+    command: ["run", "--storage.path=/alloy", "/etc/alloy/config.alloy"]
+ networks:
+ - app_network
+ restart: unless-stopped
+
+volumes:
+ alloy_positions:
+
networks:
app_network:
external: true
diff --git a/docker/docker-compose.infra.yml b/docker/docker-compose.infra.yml
index e92f58b..e9eb461 100644
--- a/docker/docker-compose.infra.yml
+++ b/docker/docker-compose.infra.yml
@@ -37,6 +37,21 @@ services:
timeout: 10s
retries: 5
+ redis-exporter:
+ image: oliver006/redis_exporter:v1.58.0
+ container_name: redis-exporter
+ environment:
+ - TZ=Asia/Seoul
+ - REDIS_ADDR=redis://redis:6379
+ ports:
+ - "9121:9121"
+ networks:
+ - infra_net
+ restart: unless-stopped
+ depends_on:
+ redis:
+ condition: service_healthy
+
volumes:
redis_data:
diff --git a/docker/docker-compose.monitor.yml b/docker/docker-compose.monitor.yml
index 92b8fb7..6fdeaee 100644
--- a/docker/docker-compose.monitor.yml
+++ b/docker/docker-compose.monitor.yml
@@ -4,12 +4,15 @@ services:
container_name: my-loki
environment:
- TZ=Asia/Seoul
+ - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
+ - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
+ - S3_BUCKET=${S3_BUCKET}
ports:
- "${LOKI_PORT:-3100}:3100"
volumes:
- ${LOKI_CONFIG_PATH:-./loki-config.yml}:/etc/loki/loki-config.yml
- ${LOKI_DATA_VOLUME:-loki_data}:/loki
- command: -config.file=/etc/loki/loki-config.yml
+ command: -config.file=/etc/loki/loki-config.yml -config.expand-env=true
networks:
- monitor_net
restart: unless-stopped
@@ -28,8 +31,9 @@ services:
- "${GRAFANA_PORT:-3000}:3000"
volumes:
- ${GRAFANA_DATA_VOLUME:-grafana_data}:/var/lib/grafana
- - ${GRAFANA_LOG_PATH:-./docker_logs/grafana}:/var/log/grafana # Adjusted default path
+ - ${GRAFANA_LOG_PATH:-./docker_logs/grafana}:/var/log/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
+ - ./grafana/dashboards:/var/lib/grafana/dashboards
networks:
- monitor_net
restart: always
@@ -41,19 +45,25 @@ services:
prometheus:
image: prom/prometheus:${PROMETHEUS_VERSION:-v2.49.1}
- container_name: ${PROMETHEUS_CONTAINER_NAME:-my-prometheus}
+ container_name: my-prometheus
environment:
- TZ=Asia/Seoul
ports:
- "${PROMETHEUS_PORT:-9090}:9090"
volumes:
- ${PROMETHEUS_CONFIG_PATH:-./prometheus.yml}:/etc/prometheus/prometheus.yml
+ - ./alert-rules.yml:/etc/prometheus/alert-rules.yml
- ${PROMETHEUS_DATA_VOLUME:-prometheus_data}:/prometheus
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ - '--storage.tsdb.path=/prometheus'
+ - '--storage.tsdb.retention.time=15d'
+ - '--web.enable-remote-write-receiver'
networks:
- monitor_net
restart: always
healthcheck:
- test: [ "CMD", "curl", "-f", "http://prometheus:9090/-/ready" ]
+      test: [ "CMD", "wget", "-q", "--spider", "http://localhost:9090/-/ready" ]
interval: 10s
timeout: 5s
retries: 5
@@ -63,17 +73,19 @@ services:
max-size: "10m"
max-file: "3"
- promtail:
- image: grafana/promtail:${PROMTAIL_VERSION:-3.0.0}
- container_name: ${PROMTAIL_CONTAINER_NAME:-my-promtail}
+ alertmanager:
+ image: prom/alertmanager:v0.27.0
+ container_name: my-alertmanager
environment:
- TZ=Asia/Seoul
+ - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
+ - SLACK_CHANNEL=${SLACK_CHANNEL}
+ ports:
+ - "9093:9093"
volumes:
- - ${PROMTAIL_CONFIG_PATH:-./promtail-config.yml}:/etc/promtail/promtail-config.yml
- - ${DOCKER_LOGS_PATH:-./docker_logs}:/var/log/docker
- - /var/lib/docker/containers:/var/lib/docker/containers:ro
- - /var/run/docker.sock:/var/run/docker.sock:ro
- command: -config.file=/etc/promtail/promtail-config.yml
+ - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
+ command:
+ - '--config.file=/etc/alertmanager/alertmanager.yml'
networks:
- monitor_net
restart: unless-stopped
diff --git a/docker/grafana/dashboards/infrastructure.json b/docker/grafana/dashboards/infrastructure.json
new file mode 100644
index 0000000..daeaf0c
--- /dev/null
+++ b/docker/grafana/dashboards/infrastructure.json
@@ -0,0 +1,549 @@
+{
+ "uid": "silrok-infrastructure",
+ "title": "Infrastructure",
+ "tags": [
+ "redis",
+ "rabbitmq",
+ "silrok"
+ ],
+ "timezone": "Asia/Seoul",
+ "schemaVersion": 39,
+ "version": 1,
+ "refresh": "30s",
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "panels": [
+ {
+ "type": "row",
+ "title": "Redis Overview",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "collapsed": false
+ },
+ {
+ "type": "stat",
+ "title": "Connected Clients",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 6,
+ "x": 0,
+ "y": 1
+ },
+ "targets": [
+ {
+ "expr": "redis_connected_clients",
+ "legendFormat": "clients"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 50
+ },
+ {
+ "color": "red",
+ "value": 100
+ }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ]
+ }
+ }
+ },
+ {
+ "type": "stat",
+ "title": "Used Memory",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 6,
+ "x": 6,
+ "y": 1
+ },
+ "targets": [
+ {
+ "expr": "redis_memory_used_bytes",
+ "legendFormat": "used"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "bytes",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ]
+ }
+ }
+ },
+ {
+ "type": "stat",
+ "title": "Hit Rate",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 6,
+ "x": 12,
+ "y": 1
+ },
+ "targets": [
+ {
+ "expr": "redis_keyspace_hits_total / (redis_keyspace_hits_total + redis_keyspace_misses_total)",
+ "legendFormat": "hit rate"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percentunit",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "red",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "green",
+ "value": 0.95
+ }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ]
+ }
+ }
+ },
+ {
+ "type": "stat",
+ "title": "Ops/sec",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 6,
+ "x": 18,
+ "y": 1
+ },
+ "targets": [
+ {
+ "expr": "rate(redis_commands_processed_total[5m])",
+ "legendFormat": "ops/s"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ]
+ }
+ }
+ },
+ {
+ "type": "row",
+ "title": "Redis Detail",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 5
+ },
+ "collapsed": false
+ },
+ {
+ "type": "timeseries",
+ "title": "Memory Over Time",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 6
+ },
+ "targets": [
+ {
+ "expr": "redis_memory_used_bytes",
+ "legendFormat": "used"
+ },
+ {
+ "expr": "redis_memory_max_bytes",
+ "legendFormat": "max"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "bytes",
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ },
+ {
+ "type": "timeseries",
+ "title": "Commands/sec",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 8,
+ "y": 6
+ },
+ "targets": [
+ {
+ "expr": "rate(redis_commands_processed_total[5m])",
+ "legendFormat": "cmd/s"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ },
+ {
+ "type": "timeseries",
+ "title": "Hits vs Misses",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 16,
+ "y": 6
+ },
+ "targets": [
+ {
+ "expr": "rate(redis_keyspace_hits_total[5m])",
+ "legendFormat": "hits"
+ },
+ {
+ "expr": "rate(redis_keyspace_misses_total[5m])",
+ "legendFormat": "misses"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ },
+ {
+ "type": "row",
+ "title": "RabbitMQ Overview",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 14
+ },
+ "collapsed": false
+ },
+ {
+ "type": "stat",
+ "title": "Queue Messages",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 8,
+ "x": 0,
+ "y": 15
+ },
+ "targets": [
+ {
+ "expr": "sum(rabbitmq_queue_messages)",
+ "legendFormat": "messages"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 100
+ },
+ {
+ "color": "red",
+ "value": 1000
+ }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ]
+ }
+ }
+ },
+ {
+ "type": "stat",
+ "title": "Publish Rate",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 8,
+ "x": 8,
+ "y": 15
+ },
+ "targets": [
+ {
+ "expr": "sum(rate(rabbitmq_queue_messages_published_total[5m]))",
+ "legendFormat": "pub/s"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ]
+ }
+ }
+ },
+ {
+ "type": "stat",
+ "title": "Connections",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 8,
+ "x": 16,
+ "y": 15
+ },
+ "targets": [
+ {
+ "expr": "rabbitmq_connections",
+ "legendFormat": "connections"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 50
+ },
+ {
+ "color": "red",
+ "value": 100
+ }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ]
+ }
+ }
+ },
+ {
+ "type": "row",
+ "title": "RabbitMQ Detail",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 19
+ },
+ "collapsed": false
+ },
+ {
+ "type": "timeseries",
+ "title": "Queue Depth Over Time",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 20
+ },
+ "targets": [
+ {
+ "expr": "rabbitmq_queue_messages",
+ "legendFormat": "{{queue}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ },
+ {
+ "type": "timeseries",
+ "title": "Message Rates",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "silrok-prometheus"
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 20
+ },
+ "targets": [
+ {
+ "expr": "sum(rate(rabbitmq_queue_messages_published_total[5m]))",
+ "legendFormat": "publish"
+ },
+ {
+ "expr": "sum(rate(rabbitmq_queue_messages_delivered_total[5m]))",
+ "legendFormat": "deliver"
+ },
+ {
+ "expr": "sum(rate(rabbitmq_queue_messages_acknowledged_total[5m]))",
+ "legendFormat": "ack"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ }
+ ]
+}
diff --git a/docker/grafana/dashboards/logs.json b/docker/grafana/dashboards/logs.json
new file mode 100644
index 0000000..d9f16a4
--- /dev/null
+++ b/docker/grafana/dashboards/logs.json
@@ -0,0 +1,166 @@
+{
+ "uid": "silrok-logs",
+ "title": "Logs",
+ "tags": ["loki", "logs", "silrok"],
+ "timezone": "Asia/Seoul",
+ "schemaVersion": 39,
+ "version": 1,
+ "refresh": "30s",
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "templating": {
+ "list": [
+ {
+ "name": "level",
+ "type": "custom",
+ "query": "ERROR,WARN,INFO,DEBUG",
+ "current": {
+ "selected": true,
+ "text": ["ERROR", "WARN", "INFO"],
+ "value": ["ERROR", "WARN", "INFO"]
+ },
+ "multi": true,
+ "includeAll": true,
+ "allValue": ".*",
+ "options": []
+ },
+ {
+ "name": "logger",
+ "type": "textbox",
+ "current": {
+ "text": "",
+ "value": ""
+ },
+ "options": []
+ }
+ ]
+ },
+ "panels": [
+ {
+ "type": "row",
+ "title": "Overview",
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+ "collapsed": false
+ },
+ {
+ "type": "timeseries",
+ "title": "Log Volume by Level",
+ "datasource": { "type": "loki", "uid": "silrok-logs" },
+ "gridPos": { "h": 8, "w": 18, "x": 0, "y": 1 },
+ "targets": [
+ {
+ "expr": "sum by (level)(count_over_time({job=\"spring-boot\"} | json | level=~\"$level\" [1m]))",
+ "legendFormat": "{{level}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ "drawStyle": "bars",
+ "fillOpacity": 80,
+ "pointSize": 5,
+ "lineWidth": 1,
+ "stacking": { "mode": "normal" }
+ }
+ },
+ "overrides": [
+ {
+ "matcher": { "id": "byName", "options": "ERROR" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
+ },
+ {
+ "matcher": { "id": "byName", "options": "WARN" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
+ },
+ {
+ "matcher": { "id": "byName", "options": "INFO" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
+ },
+ {
+ "matcher": { "id": "byName", "options": "DEBUG" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
+ }
+ ]
+ }
+ },
+ {
+ "type": "stat",
+ "title": "Error Count (1h)",
+ "datasource": { "type": "loki", "uid": "silrok-logs" },
+ "gridPos": { "h": 8, "w": 6, "x": 18, "y": 1 },
+ "targets": [
+ {
+ "expr": "count_over_time({job=\"spring-boot\"} | json | level=\"ERROR\" [1h])",
+ "legendFormat": "errors"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 1 },
+ { "color": "red", "value": 10 }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": { "calcs": ["sum"] }
+ }
+ },
+ {
+ "type": "row",
+ "title": "Logs",
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 },
+ "collapsed": false
+ },
+ {
+ "type": "logs",
+ "title": "Log Stream",
+ "datasource": { "type": "loki", "uid": "silrok-logs" },
+ "gridPos": { "h": 16, "w": 24, "x": 0, "y": 10 },
+ "targets": [
+ {
+ "expr": "{job=\"spring-boot\"} | json | level=~\"$level\" | logger_name=~\".*$logger.*\"",
+ "legendFormat": ""
+ }
+ ],
+ "options": {
+ "showTime": true,
+ "showLabels": false,
+ "showCommonLabels": false,
+ "wrapLogMessage": true,
+ "prettifyLogMessage": false,
+ "enableLogDetails": true,
+ "sortOrder": "Descending",
+ "dedupStrategy": "none"
+ }
+ },
+ {
+ "type": "logs",
+ "title": "Error Logs",
+ "datasource": { "type": "loki", "uid": "silrok-logs" },
+ "gridPos": { "h": 12, "w": 24, "x": 0, "y": 26 },
+ "targets": [
+ {
+ "expr": "{job=\"spring-boot\"} | json | level=\"ERROR\"",
+ "legendFormat": ""
+ }
+ ],
+ "options": {
+ "showTime": true,
+ "showLabels": false,
+ "showCommonLabels": false,
+ "wrapLogMessage": true,
+ "prettifyLogMessage": false,
+ "enableLogDetails": true,
+ "sortOrder": "Descending",
+ "dedupStrategy": "none"
+ }
+ }
+ ]
+}
diff --git a/docker/grafana/dashboards/spring-boot.json b/docker/grafana/dashboards/spring-boot.json
new file mode 100644
index 0000000..62051b8
--- /dev/null
+++ b/docker/grafana/dashboards/spring-boot.json
@@ -0,0 +1,341 @@
+{
+ "uid": "silrok-spring-boot",
+ "title": "Spring Boot",
+ "tags": ["spring-boot", "silrok"],
+ "timezone": "Asia/Seoul",
+ "schemaVersion": 39,
+ "version": 1,
+ "refresh": "30s",
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "templating": {
+ "list": [
+ {
+ "name": "instance",
+ "type": "query",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "query": "label_values(process_uptime_seconds, instance)",
+ "refresh": 2,
+ "includeAll": false,
+ "multi": false,
+ "sort": 1,
+ "current": {}
+ }
+ ]
+ },
+ "panels": [
+ {
+ "type": "row",
+ "title": "Overview",
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+ "collapsed": false
+ },
+ {
+ "type": "stat",
+ "title": "Uptime",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
+ "targets": [
+ {
+ "expr": "process_uptime_seconds{instance=~\"$instance\"}",
+ "legendFormat": "{{instance}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "s",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "type": "stat",
+ "title": "Request Rate",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
+ "targets": [
+ {
+ "expr": "sum(rate(http_server_requests_seconds_count{instance=~\"$instance\"}[5m]))",
+ "legendFormat": "req/s"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "reqps",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "type": "stat",
+ "title": "Error Rate (5xx)",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
+ "targets": [
+ {
+ "expr": "sum(rate(http_server_requests_seconds_count{instance=~\"$instance\", status=~\"5..\"}[5m])) / sum(rate(http_server_requests_seconds_count{instance=~\"$instance\"}[5m]))",
+ "legendFormat": "error rate"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percentunit",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 0.01 },
+ { "color": "red", "value": 0.05 }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "type": "stat",
+ "title": "Avg Response Time",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
+ "targets": [
+ {
+ "expr": "sum(rate(http_server_requests_seconds_sum{instance=~\"$instance\"}[5m])) / sum(rate(http_server_requests_seconds_count{instance=~\"$instance\"}[5m]))",
+ "legendFormat": "avg"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "s",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 0.5 },
+ { "color": "red", "value": 1 }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "type": "row",
+ "title": "HTTP",
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
+ "collapsed": false
+ },
+ {
+ "type": "timeseries",
+ "title": "Request Rate by Status",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
+ "targets": [
+ {
+ "expr": "sum by (status)(rate(http_server_requests_seconds_count{instance=~\"$instance\"}[5m]))",
+ "legendFormat": "{{status}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "reqps",
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ },
+ {
+ "type": "timeseries",
+ "title": "Response Time p95 / p99",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.95, sum by (le)(rate(http_server_requests_seconds_bucket{instance=~\"$instance\"}[5m])))",
+ "legendFormat": "p95"
+ },
+ {
+ "expr": "histogram_quantile(0.99, sum by (le)(rate(http_server_requests_seconds_bucket{instance=~\"$instance\"}[5m])))",
+ "legendFormat": "p99"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "s",
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ },
+ {
+ "type": "row",
+ "title": "JVM Memory",
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
+ "collapsed": false
+ },
+ {
+ "type": "timeseries",
+ "title": "Heap Used vs Max",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 8, "w": 8, "x": 0, "y": 15 },
+ "targets": [
+ {
+ "expr": "sum(jvm_memory_used_bytes{instance=~\"$instance\", area=\"heap\"})",
+ "legendFormat": "used"
+ },
+ {
+ "expr": "sum(jvm_memory_max_bytes{instance=~\"$instance\", area=\"heap\"})",
+ "legendFormat": "max"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "bytes",
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ },
+ {
+ "type": "timeseries",
+ "title": "Non-Heap Memory",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 8, "w": 8, "x": 8, "y": 15 },
+ "targets": [
+ {
+ "expr": "sum(jvm_memory_used_bytes{instance=~\"$instance\", area=\"nonheap\"})",
+ "legendFormat": "used"
+ },
+ {
+ "expr": "sum(jvm_memory_committed_bytes{instance=~\"$instance\", area=\"nonheap\"})",
+ "legendFormat": "committed"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "bytes",
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ },
+ {
+ "type": "timeseries",
+ "title": "GC Pause Time",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 8, "w": 8, "x": 16, "y": 15 },
+ "targets": [
+ {
+ "expr": "sum by (gc)(rate(jvm_gc_pause_seconds_sum{instance=~\"$instance\"}[5m]))",
+ "legendFormat": "{{gc}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "s",
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ },
+ {
+ "type": "row",
+ "title": "JVM Threads & CPU",
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
+ "collapsed": false
+ },
+ {
+ "type": "timeseries",
+ "title": "Threads by State",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
+ "targets": [
+ {
+ "expr": "jvm_threads_states_threads{instance=~\"$instance\"}",
+ "legendFormat": "{{state}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 30,
+ "pointSize": 5,
+ "lineWidth": 1,
+ "stacking": { "mode": "normal" }
+ }
+ }
+ }
+ },
+ {
+ "type": "timeseries",
+ "title": "CPU Usage",
+ "datasource": { "type": "prometheus", "uid": "silrok-prometheus" },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
+ "targets": [
+ {
+ "expr": "process_cpu_usage{instance=~\"$instance\"}",
+ "legendFormat": "process"
+ },
+ {
+ "expr": "system_cpu_usage{instance=~\"$instance\"}",
+ "legendFormat": "system"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percentunit",
+ "min": 0,
+ "max": 1,
+ "custom": {
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "pointSize": 5,
+ "lineWidth": 1
+ }
+ }
+ }
+ }
+ ]
+}
diff --git a/docker/grafana/provisioning/dashboards/dashboards.yml b/docker/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000..732bc95
--- /dev/null
+++ b/docker/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,12 @@
+apiVersion: 1
+
+providers:
+ - name: silrok
+ orgId: 1
+ folder: SilRok
+ type: file
+ disableDeletion: false
+ editable: true
+ options:
+ path: /var/lib/grafana/dashboards
+ foldersFromFilesStructure: false
diff --git a/docker/grafana/provisioning/datasources/loki.yml b/docker/grafana/provisioning/datasources/loki.yml
index 43f7a87..729ee83 100644
--- a/docker/grafana/provisioning/datasources/loki.yml
+++ b/docker/grafana/provisioning/datasources/loki.yml
@@ -2,6 +2,7 @@ apiVersion: 1
datasources:
- name: Loki
+ uid: silrok-logs
type: loki
access: proxy
url: http://my-loki:3100
@@ -10,7 +11,7 @@ datasources:
jsonData:
maxLines: 1000
derivedFields:
- - datasourceUid: prometheus
+ - datasourceUid: silrok-prometheus
matcherRegex: "traceID=(\\w+)"
name: TraceID
- url: "$${__value.raw}"
\ No newline at end of file
+ url: "$${__value.raw}"
diff --git a/docker/grafana/provisioning/datasources/prometheus.yml b/docker/grafana/provisioning/datasources/prometheus.yml
index 4ad96bb..4652a6e 100644
--- a/docker/grafana/provisioning/datasources/prometheus.yml
+++ b/docker/grafana/provisioning/datasources/prometheus.yml
@@ -2,6 +2,7 @@ apiVersion: 1
datasources:
- name: Prometheus
+ uid: silrok-prometheus
type: prometheus
access: proxy
url: http://my-prometheus:9090
diff --git a/src/main/resources/logback-dev.xml b/src/main/resources/logback-dev.xml
index 9b8c8dd..10d1614 100644
--- a/src/main/resources/logback-dev.xml
+++ b/src/main/resources/logback-dev.xml
@@ -15,15 +15,14 @@
${LOG_PATH}/logfile.%d{yyyy-MM-dd}.log.gz
30
-
- %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
+
+ traceId
-
diff --git a/src/main/resources/logback-prod.xml b/src/main/resources/logback-prod.xml
index c9128d3..524e5b4 100644
--- a/src/main/resources/logback-prod.xml
+++ b/src/main/resources/logback-prod.xml
@@ -4,8 +4,8 @@
-
- %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
+
+ traceId
@@ -15,8 +15,8 @@
${LOG_PATH}/logfile.%d{yyyy-MM-dd}.log.gz
30
-
- %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
+
+ traceId