Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ dependencies {
//Retry
implementation 'org.springframework.retry:spring-retry'
implementation 'org.springframework:spring-aspects'

//Logging
implementation 'net.logstash.logback:logstash-logback-encoder:8.0'
}

tasks.named('test') {
Expand Down
63 changes: 63 additions & 0 deletions docker/alert-rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
groups:
# Target availability, evaluated for every scraped instance.
- name: instance
  rules:
  # Fires once a target's `up` series has stayed at 0 for one minute.
  - alert: InstanceDown
    expr: 'up == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Instance {{ $labels.instance }} down'
      description: >-
        {{ $labels.job }}/{{ $labels.instance }} has been down for more than 1 minute.

# Application-level alerts built from Spring Boot Actuator (Micrometer) metrics.
- name: spring-boot
  rules:
  # Share of requests answered with a 5xx status over the last 5 minutes,
  # summed across all series; fires when it stays above 5% for 5 minutes.
  - alert: HighErrorRate
    expr: >
      sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
      /
      sum(rate(http_server_requests_seconds_count[5m]))
      > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High 5xx error rate"
      description: "5xx error rate is above 5% for 5 minutes (current: {{ $value | humanizePercentage }})."

  # Heap usage per application instance.
  # jvm_memory_*_bytes is reported per memory pool (`id` label), and pools
  # without a configured limit report max = -1. Dividing pool by pool would
  # produce one alert per pool (and negative ratios for unlimited pools),
  # so both sides are summed to heap level first. `job` and `instance` are
  # kept because Alertmanager grouping and the description rely on them.
  - alert: HighMemoryUsage
    expr: >
      sum by (job, instance) (jvm_memory_used_bytes{area="heap"})
      /
      sum by (job, instance) (jvm_memory_max_bytes{area="heap"})
      > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "JVM heap memory usage > 90%"
      description: "{{ $labels.instance }} JVM heap usage is {{ $value | humanizePercentage }}."

# Host- and broker-level alerts.
- name: infrastructure
  rules:
  # Free-space ratio of the root filesystem; fires when it stays below 20%
  # for 5 minutes.
  - alert: DiskSpaceLow
    expr: >
      (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"})
      < 0.2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Disk space low on {{ $labels.instance }}'
      description: >-
        Available disk space is below 20% (current: {{ $value | humanizePercentage }} free).

  # Fires when the message count stays above 1000 for 5 minutes.
  # NOTE(review): the annotations read a `queue` label, which presumably only
  # exists when the broker exposes per-object metrics - confirm against the
  # RabbitMQ Prometheus plugin configuration.
  - alert: RabbitMQQueueBacklog
    expr: 'rabbitmq_queue_messages > 1000'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'RabbitMQ queue backlog on {{ $labels.queue }}'
      description: >-
        Queue {{ $labels.queue }} has {{ $value }} messages (threshold: 1000).
25 changes: 25 additions & 0 deletions docker/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Defaults applied to the whole Alertmanager configuration.
global:
  resolve_timeout: 5m

# Root route: there are no child routes, so every alert goes to `slack`.
route:
  receiver: slack
  # Alerts sharing the same alertname and job are batched into one notification.
  group_by:
    - alertname
    - job
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

# Notification targets referenced by `route.receiver`.
receivers:
  - name: slack
    slack_configs:
      # Alertmanager does not expand ${VAR} placeholders in this file, so the
      # webhook URL is read from a file instead of an environment variable.
      # The deployment must mount a file containing only the webhook URL at
      # this path (for example as a Docker secret or a read-only bind mount).
      - api_url_file: /etc/alertmanager/secrets/slack_webhook_url
        # `channel` is intentionally not set: a literal '${SLACK_CHANNEL}'
        # would be sent to Slack unchanged. Without it the webhook posts to
        # the channel it was created for. To override, hard-code it here:
        # channel: '#your-channel'
        send_resolved: true
        title: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
        # Literal block scalar (|-) keeps the line breaks; a folded scalar
        # (>-) would join every line of the message into a single line.
        # The trim marker on `range` avoids a blank line before each entry.
        text: |-
          *Alert:* {{ .CommonLabels.alertname }}
          *Severity:* {{ .CommonLabels.severity }}
          *Description:* {{ .CommonAnnotations.description }}
          *Details:*
          {{ range .Alerts -}}
          - *{{ .Labels.instance }}*: {{ .Annotations.description }}
          {{ end }}
95 changes: 95 additions & 0 deletions docker/alloy-config.alloy
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// ============================================================
// Alloy Configuration - SilRok Monitoring
// Replaces Promtail: log collection + metric scrape & push
// ============================================================

// ---- Log Collection ----

// Find every *.log file below the Spring log directory and re-scan for new
// files every 5 seconds. Each match is tagged with the job and app labels.
local.file_match "spring_logs" {
  sync_period = "5s"

  path_targets = [
    {
      __path__ = "/var/log/docker/spring/**/*.log",
      job      = "spring-boot",
      app      = "silrok",
    },
  ]
}

// Tail the discovered files and hand each line to the Spring processing pipeline.
loki.source.file "spring_logs" {
  forward_to = [loki.process.spring_pipeline.receiver]
  targets    = local.file_match.spring_logs.targets

  // Polling bounds for checking the watched files for changes.
  file_watch {
    max_poll_frequency = "5s"
    min_poll_frequency = "250ms"
  }
}

// Parse the JSON log line and promote only `level` to a Loki label.
//
// `logger_name` and `thread_name` are deliberately NOT turned into labels:
// every distinct label combination creates a separate Loki stream, and
// logger/thread names are high-cardinality values. They stay inside the JSON
// log line and can still be filtered at query time, e.g.
//   {job="spring-boot"} | json | logger_name="com.example.Foo"
loki.process "spring_pipeline" {
  forward_to = [loki.write.default.receiver]

  // Pull the `level` field out of the JSON document into the extracted map.
  stage.json {
    expressions = {
      level = "level",
    }
  }

  // An empty value means "use the extracted entry with the same name".
  stage.labels {
    values = {
      level = "",
    }
  }
}

// Ship processed log entries to the Loki push API on the host named by the
// LOKI_HOST environment variable; every entry also gets source="alloy".
loki.write "default" {
  external_labels = {
    source = "alloy",
  }

  endpoint {
    url = "http://" + env("LOKI_HOST") + ":3100/loki/api/v1/push"
  }
}

// ---- Metric Collection ----

// Scrape Spring Boot Actuator metrics from the blue and green containers.
//
// Targets are addressed by container name on the shared Docker network, so
// the port must be the port the application listens on INSIDE the container.
// Both containers listen on 8080; 8081 is only the host-side port published
// for green and is not reachable through the container name.
prometheus.scrape "spring_boot" {
  targets = [
    {"__address__" = "blue:8080", "instance" = "blue"},
    {"__address__" = "green:8080", "instance" = "green"},
  ]
  metrics_path    = "/actuator/prometheus"
  scrape_interval = "15s"
  forward_to      = [prometheus.remote_write.default.receiver]
}

// Collect Redis exporter metrics from port 9121 on the host named by the
// DATA_HOST environment variable, every 15 seconds.
prometheus.scrape "redis" {
  forward_to      = [prometheus.remote_write.default.receiver]
  scrape_interval = "15s"

  targets = [
    {
      "__address__" = env("DATA_HOST") + ":9121",
      "instance"    = "redis_cache",
    },
  ]
}

// Collect RabbitMQ metrics from /metrics on port 15692 of the host named by
// the DATA_HOST environment variable, every 15 seconds.
prometheus.scrape "rabbitmq" {
  forward_to      = [prometheus.remote_write.default.receiver]
  scrape_interval = "15s"
  metrics_path    = "/metrics"

  targets = [
    {
      "__address__" = env("DATA_HOST") + ":15692",
      "instance"    = "rabbitmq_broker",
    },
  ]
}

// Forward all scraped samples to the remote-write endpoint of the Prometheus
// server on the host named by the PROMETHEUS_HOST environment variable.
prometheus.remote_write "default" {
  endpoint {
    url = "http://" + env("PROMETHEUS_HOST") + ":9090/api/v1/write"
  }
}
27 changes: 25 additions & 2 deletions docker/docker-compose.app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ services:
ports:
- "${BLUE_PORT:-8080}:8080"
volumes:
- ${BLUE_LOG_PATH:-./docker_logs/spring/blue}:/app/logs
- ${BLUE_LOG_PATH:-./docker_logs/spring/blue}:/opt/app/logs
networks:
- app_network
restart: always
Expand All @@ -34,7 +34,7 @@ services:
ports:
- "${GREEN_PORT:-8081}:8080"
volumes:
- ${GREEN_LOG_PATH:-./docker_logs/spring/green}:/app/logs
- ${GREEN_LOG_PATH:-./docker_logs/spring/green}:/opt/app/logs
networks:
- app_network
restart: always
Expand All @@ -45,6 +45,29 @@ services:
retries: 5
start_period: 20s

# Grafana Alloy: tails the blue/green Spring log directories and scrapes
# metrics, as defined in alloy-config.alloy.
alloy:
  # Pinned (overridable) version instead of `latest`, so a redeploy cannot
  # silently pull a release with different behaviour.
  image: grafana/alloy:${ALLOY_VERSION:-v1.5.1}
  container_name: my-alloy
  environment:
    - TZ=Asia/Seoul
    - LOKI_HOST=${LOKI_HOST}
    - PROMETHEUS_HOST=${PROMETHEUS_HOST}
    - DATA_HOST=${DATA_HOST}
  volumes:
    # The configuration is only read by Alloy, so mount it read-only.
    - ./alloy-config.alloy:/etc/alloy/config.alloy:ro
    - alloy_positions:/alloy
    - ${BLUE_LOG_PATH:-./docker_logs/spring/blue}:/var/log/docker/spring/blue:ro
    - ${GREEN_LOG_PATH:-./docker_logs/spring/green}:/var/log/docker/spring/green:ro
  command:
    - run
    - /etc/alloy/config.alloy
    # Overriding the image's default command also drops its --storage.path
    # argument. Point it at the mounted volume so file read positions (and
    # the remote-write WAL) survive container re-creation.
    - '--storage.path=/alloy'
  networks:
    - app_network
  restart: unless-stopped

volumes:
alloy_positions:

networks:
app_network:
external: true
Expand Down
15 changes: 15 additions & 0 deletions docker/docker-compose.infra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,21 @@ services:
timeout: 10s
retries: 5

# Exposes metrics for the `redis` service on port 9121; it is started only
# after the redis health check passes.
redis-exporter:
  image: oliver006/redis_exporter:v1.58.0
  container_name: redis-exporter
  restart: unless-stopped
  depends_on:
    redis:
      condition: service_healthy
  networks:
    - infra_net
  ports:
    - "9121:9121"
  environment:
    TZ: Asia/Seoul
    REDIS_ADDR: redis://redis:6379

volumes:
redis_data:

Expand Down
36 changes: 24 additions & 12 deletions docker/docker-compose.monitor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@ services:
container_name: my-loki
environment:
- TZ=Asia/Seoul
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- S3_BUCKET=${S3_BUCKET}
ports:
- "${LOKI_PORT:-3100}:3100"
volumes:
- ${LOKI_CONFIG_PATH:-./loki-config.yml}:/etc/loki/loki-config.yml
- ${LOKI_DATA_VOLUME:-loki_data}:/loki
command: -config.file=/etc/loki/loki-config.yml
command: -config.file=/etc/loki/loki-config.yml -config.expand-env=true
networks:
- monitor_net
restart: unless-stopped
Expand All @@ -28,8 +31,9 @@ services:
- "${GRAFANA_PORT:-3000}:3000"
volumes:
- ${GRAFANA_DATA_VOLUME:-grafana_data}:/var/lib/grafana
- ${GRAFANA_LOG_PATH:-./docker_logs/grafana}:/var/log/grafana # Adjusted default path
- ${GRAFANA_LOG_PATH:-./docker_logs/grafana}:/var/log/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
- ./grafana/dashboards:/var/lib/grafana/dashboards
networks:
- monitor_net
restart: always
Expand All @@ -41,19 +45,25 @@ services:

prometheus:
image: prom/prometheus:${PROMETHEUS_VERSION:-v2.49.1}
container_name: ${PROMETHEUS_CONTAINER_NAME:-my-prometheus}
container_name: my-prometheus
environment:
- TZ=Asia/Seoul
ports:
- "${PROMETHEUS_PORT:-9090}:9090"
volumes:
- ${PROMETHEUS_CONFIG_PATH:-./prometheus.yml}:/etc/prometheus/prometheus.yml
- ./alert-rules.yml:/etc/prometheus/alert-rules.yml
- ${PROMETHEUS_DATA_VOLUME:-prometheus_data}:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
- '--web.enable-remote-write-receiver'
networks:
- monitor_net
restart: always
healthcheck:
test: [ "CMD", "curl", "-f", "http://prometheus:9090/-/ready" ]
test: [ "CMD", "curl", "-f", "http://localhost:9090/-/ready" ]
interval: 10s
timeout: 5s
retries: 5
Expand All @@ -63,17 +73,19 @@ services:
max-size: "10m"
max-file: "3"

promtail:
image: grafana/promtail:${PROMTAIL_VERSION:-3.0.0}
container_name: ${PROMTAIL_CONTAINER_NAME:-my-promtail}
alertmanager:
image: prom/alertmanager:v0.27.0
container_name: my-alertmanager
environment:
- TZ=Asia/Seoul
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SLACK_CHANNEL=${SLACK_CHANNEL}
ports:
- "9093:9093"
volumes:
- ${PROMTAIL_CONFIG_PATH:-./promtail-config.yml}:/etc/promtail/promtail-config.yml
- ${DOCKER_LOGS_PATH:-./docker_logs}:/var/log/docker
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
command: -config.file=/etc/promtail/promtail-config.yml
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
networks:
- monitor_net
restart: unless-stopped
Expand Down
Loading
Loading