diff --git a/clean.sh b/clean.sh
new file mode 100755
index 0000000..c2ea6a3
--- /dev/null
+++ b/clean.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Fix script for Tempo port conflict.
+# Stops, removes and recreates the compose "tempo" service, then prints its logs.
+echo "=== Fixing Tempo port 4317 conflict ==="
+
+# 1. Stop with force and longer grace period (30 seconds)
+docker-compose stop -t 30 tempo
+
+# 2. Kill any remaining tempo processes in containers.
+#    Only running containers are listed (no -a): docker exec cannot run inside
+#    a stopped container. Failures are ignored on purpose (best effort).
+for container in $(docker ps -q --filter "name=tempo"); do
+    echo "Checking container $container..."
+    docker exec "$container" pkill -9 -f "tempo|grpc" 2>/dev/null || true
+done
+
+# 3. Force remove
+docker-compose rm -f tempo
+
+# 4. Clean Docker network
+#    NOTE: this prunes every unused network on the Docker host, not only this project's.
+docker network prune -f
+
+# 5. Remove tempo volume (optional - will lose trace data)
+# docker volume rm $(docker volume ls -q | grep tempo) 2>/dev/null || true
+
+# 6. Start Tempo again (healthcheck settings come from docker-compose.yml)
+echo "Starting Tempo with extended healthcheck..."
+docker-compose up -d tempo
+
+# 7. Wait and check logs
+sleep 5
+echo "=== Checking Tempo logs ==="
+docker-compose logs --tail=50 tempo
diff --git a/docker-compose.yml b/docker-compose.yml
index 2f10aab..86d60fc 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,46 +1,137 @@
-version: "3.8"
+version: "3.9"
 
 services:
+  # -----------------------------
+  # Prometheus - Metrics
+  # -----------------------------
   prometheus:
     image: prom/prometheus:latest
     container_name: prometheus
+    ports:
+      - "9090:9090"
     volumes:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
       - ./prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml:ro
+      - prometheus-data:/prometheus
     command:
-      - "--config.file=/etc/prometheus/prometheus.yml"
-      - "--web.enable-lifecycle"
-    ports:
-      - "9090:9090"
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --web.enable-lifecycle
     depends_on:
-      - alertmanager
+      alertmanager:
+        condition: service_healthy # start only after alertmanager's healthcheck passes
     restart: unless-stopped
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "wget --spider -q http://prometheus:9090/-/ready || exit 1",
+        ]
+      interval: 10s
+      timeout: 5s
+      retries: 5
 
+  # -----------------------------
+  # Alertmanager - Alerts
+  # -----------------------------
   alertmanager:
     image: prom/alertmanager:latest
     container_name: alertmanager
-    volumes:
-      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
-    command:
-      - "--config.file=/etc/alertmanager/alertmanager.yml"
     ports:
       - "9093:9093"
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - alertmanager-data:/alertmanager
+    command: ["--config.file=/etc/alertmanager/alertmanager.yml"]
     restart: unless-stopped
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "wget --spider -q http://alertmanager:9093/-/ready || exit 1",
+        ]
+      interval: 10s
+      timeout: 5s
+      retries: 5
 
+  # -----------------------------
+  # Grafana - Dashboards / Visualization
+  # -----------------------------
   grafana:
     image: grafana/grafana:latest
     container_name: grafana
     ports:
       - "3000:3000"
     environment:
-      - GF_SECURITY_ADMIN_USER=admin
-      - GF_SECURITY_ADMIN_PASSWORD=admin
+      GF_SECURITY_ADMIN_USER: admin
+      GF_SECURITY_ADMIN_PASSWORD: "${GF_SECURITY_ADMIN_PASSWORD:-admin}" # override via shell env or .env; defaults to the previous value
+      GF_LOG_LEVEL: info
     depends_on:
       - prometheus
     volumes:
       - grafana-storage:/var/lib/grafana
     restart: unless-stopped
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "wget --spider -q http://grafana:3000/api/health || exit 1",
+        ]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # -----------------------------
+  # Loki - Logs aggregation
+  # -----------------------------
+  loki:
+    image: grafana/loki:2.8.2
+    container_name: loki
+    ports:
+      - "3100:3100"
+    user: "10001:10001"
+    command:
+      - -config.file=/etc/loki/loki-config.yaml
+    volumes:
+      - ./loki/loki-config.yaml:/etc/loki/loki-config.yaml:ro
+      - loki-index:/loki/index
+      - loki-cache:/loki/cache
+      - loki-chunks:/loki/chunks
+      - loki-wal:/loki/wal
+      - loki-compactor:/loki/compactor
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "wget --spider -q http://loki:3100/ready || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
 
+  # -----------------------------
+  # Tempo - Traces aggregation
+  # -----------------------------
+  tempo:
+    image: grafana/tempo:2.5.0
+    container_name: tempo
+    ports:
+      - "3200:3200" # HTTP API/Query
+      - "4320:4320" # Internal gRPC (server port)
+      - "4318:4318" # OTLP gRPC receiver (for receiving traces)
+      - "4319:4319" # OTLP HTTP receiver
+    user: "10001:10001"
+    volumes:
+      - ./tempo/tempo-config.yaml:/etc/tempo/tempo-config.yaml:ro
+      - tempo-data:/tempo-data
+    command:
+      - -config.file=/etc/tempo/tempo-config.yaml
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "wget --spider -q http://tempo:3200/ready || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # -----------------------------
+  # IRC relay - Alert notifications
+  # -----------------------------
   irc-relay:
     build:
       context: ./irc-deamon
@@ -52,6 +143,16 @@ services:
       - ./irc-deamon/config.yml:/etc/alertmanager-irc-relay/config.yml:ro
     command: ["--config", "/etc/alertmanager-irc-relay/config.yml"]
     restart: unless-stopped
+    depends_on:
+      - alertmanager
 
 volumes:
+  prometheus-data:
+  alertmanager-data:
   grafana-storage:
+  tempo-data:
+  loki-index:
+  loki-cache:
+  loki-chunks:
+  loki-wal:
+  loki-compactor:
diff --git a/docker-compose.yml.bak b/docker-compose.yml.bak
new file mode 100644
index 0000000..b496e2f
--- /dev/null
+++ b/docker-compose.yml.bak
@@ -0,0 +1,158 @@
+version: "3.9"
+
+services:
+  # -----------------------------
+  # Prometheus - Metrics
+  # -----------------------------
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    ports:
+      - "9090:9090"
+    volumes:
+      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml:ro
+      - prometheus-data:/prometheus
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --web.enable-lifecycle
+    depends_on:
+      alertmanager:
+        condition: service_healthy
+    restart: unless-stopped
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "wget --spider -q http://prometheus:9090/-/ready || exit 1",
+        ]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # -----------------------------
+  # Alertmanager - Alerts
+  # -----------------------------
+  alertmanager:
+    image: prom/alertmanager:latest
+    container_name: alertmanager
+    ports:
+      - "9093:9093"
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - alertmanager-data:/alertmanager
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+    restart: unless-stopped
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "wget --spider -q http://alertmanager:9093/-/ready || exit 1",
+        ]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # -----------------------------
+  # Grafana - Dashboards / Visualization
+  # -----------------------------
+  grafana:
+    image: grafana/grafana:latest
+    container_name: grafana
+    ports:
+      - "3000:3000"
+    environment:
+      GF_SECURITY_ADMIN_USER: admin
+      GF_SECURITY_ADMIN_PASSWORD: admin
+      GF_LOG_LEVEL: info
+    depends_on:
+      - prometheus
+    volumes:
+      - grafana-storage:/var/lib/grafana
+    restart: unless-stopped
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "wget --spider -q http://grafana:3000/api/health || exit 1",
+        ]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # -----------------------------
+  # Loki - Logs aggregation
+  # -----------------------------
+  loki:
+    image: grafana/loki:2.8.2
+    container_name: loki
+    ports:
+      - "3100:3100"
+    user: "0:0"
+    #user: "10001:10001" # ensures proper permissions on volumes
+    command:
+      - -config.file=/etc/loki/loki-config.yaml
+    volumes:
+      - ./loki/loki-config.yaml:/etc/loki/loki-config.yaml:ro
+      - loki-index:/loki/index
+      #- loki-cache:/loki/cache
+      - loki-chunks:/loki/chunks
+      #- ./wal:/wal
+      - loki-wal:/loki/wal
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "wget --spider -q http://loki:3100/ready || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    entrypoint: ["/loki-init.sh"]
+
+  # -----------------------------
+  # Tempo - Traces aggregation
+  # -----------------------------
+  tempo:
+    image: grafana/tempo:2.6.0
+    container_name: tempo
+    ports:
+      - "3200:3200" # HTTP API
+      - "4317:4317" # OTLP gRPC
+    user: "10001:10001" # ensures proper permissions
+    volumes:
+      - ./tempo/tempo-config.yaml:/etc/tempo/tempo-config.yaml:ro
+      - tempo-data:/tempo-data
+    command:
+      - -config.file=/etc/tempo/tempo-config.yaml
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "wget --spider -q http://tempo:3200/ready || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # -----------------------------
+  # IRC relay - Alert notifications
+  # -----------------------------
+  irc-relay:
+    build:
+      context: ./irc-deamon
+      dockerfile: Dockerfile.irc
+    container_name: irc-relay
+    ports:
+      - "8010:8010"
+    volumes:
+      - ./irc-deamon/config.yml:/etc/alertmanager-irc-relay/config.yml:ro
+    command: ["--config", "/etc/alertmanager-irc-relay/config.yml"]
+    restart: unless-stopped
+    depends_on:
+      - alertmanager
+
+volumes:
+  prometheus-data:
+  alertmanager-data:
+  grafana-storage:
+  tempo-data:
+  loki-index:
+  loki-cache:
+  loki-chunks:
+  loki-wal:
diff --git a/loki-init.sh b/loki-init.sh
new file mode 100755
index 0000000..d1ddce7
--- /dev/null
+++ b/loki-init.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+# loki-init.sh
+# Ensure all mounted volumes have correct ownership before starting Loki
+
+LOKI_UID=10001
+LOKI_GID=10001
+
+echo "Fixing permissions for Loki volumes..."
+# Recursively chown every path loki-config.yaml writes to; best effort, missing paths are ignored
+chown -R "$LOKI_UID:$LOKI_GID" /loki/index /loki/chunks /loki/cache /loki/wal /loki/compactor || true
+
+echo "Starting Loki..."
+# Execute original Loki command, forwarding the container's arguments ("$@")
+exec /usr/bin/loki "$@"
diff --git a/loki/data/index/uploader/name b/loki/data/index/uploader/name
new file mode 100755
index 0000000..24f57e9
--- /dev/null
+++ b/loki/data/index/uploader/name
@@ -0,0 +1 @@
+0cb302e1b50b-1764570257388144636
\ No newline at end of file
diff --git a/loki/loki-config.yaml b/loki/loki-config.yaml
new file mode 100755
index 0000000..34ab518
--- /dev/null
+++ b/loki/loki-config.yaml
@@ -0,0 +1,56 @@
+# Loki configuration; mounted read-only at /etc/loki/loki-config.yaml by docker-compose.yml.
+auth_enabled: false
+
+server:
+  http_listen_port: 3100 # published as 3100:3100 and probed by the compose healthcheck (/ready)
+  grpc_listen_port: 9095
+  log_level: info
+
+ingester:
+  wal:
+    enabled: true
+    dir: /loki/wal # path must exist and be writable by UID 10001
+  lifecycler:
+    address: 0.0.0.0
+    ring:
+      kvstore:
+        store: inmemory
+      replication_factor: 1
+    final_sleep: 0s
+  chunk_idle_period: 5m
+  chunk_retain_period: 30s
+  max_transfer_retries: 0
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: boltdb-shipper
+      object_store: filesystem
+      schema: v11
+      index:
+        prefix: index_
+        period: 24h
+
+storage_config:
+  boltdb_shipper:
+    active_index_directory: /loki/index # loki-index volume in docker-compose.yml
+    cache_location: /loki/cache # loki-cache volume in docker-compose.yml
+    shared_store: filesystem
+  filesystem:
+    directory: /loki/chunks # loki-chunks volume in docker-compose.yml
+
+limits_config:
+  enforce_metric_name: false
+  reject_old_samples: true
+  reject_old_samples_max_age: 168h
+
+chunk_store_config:
+  max_look_back_period: 0s
+
+table_manager:
+  retention_deletes_enabled: false
+  retention_period: 0s
+
+compactor:
+  working_directory: /loki/compactor # loki-compactor volume in docker-compose.yml
+  shared_store: filesystem
diff --git a/prometheus/alert.rules.yml b/prometheus/alert.rules.yml
index e72767e..1bed6c6 100644
--- a/prometheus/alert.rules.yml
+++ b/prometheus/alert.rules.yml
@@ -42,11 +42,11 @@ groups:
           description: "{{ $labels.instance }} has been unreachable for 1 minute."
 
       # test alert always fires
-      - alert: TestAlert
-        expr: vector(1) # always true, will fire immediately
-        for: 10s
-        labels:
-          severity: critical
-        annotations:
-          summary: "This is a test alert"
-          description: "Verifying Alertmanager -> IRC relay pipeline"
+      #- alert: TestAlert
+      #  expr: vector(1) # always true, will fire immediately
+      #  for: 10s
+      #  labels:
+      #    severity: critical
+      #  annotations:
+      #    summary: "This is a test alert"
+      #    description: "Verifying Alertmanager -> IRC relay pipeline"
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
index 398ef60..c255f2e 100644
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -23,3 +23,7 @@ scrape_configs:
   - job_name: "prod-node"
     static_configs:
       - targets: ["149.28.239.165:9100"]
+
+  - job_name: "testforums"
+    static_configs:
+      - targets: ["144.202.63.236:9100"]
diff --git a/tempo/tempo-config.yaml b/tempo/tempo-config.yaml
new file mode 100644
index 0000000..e216d4d
--- /dev/null
+++ b/tempo/tempo-config.yaml
@@ -0,0 +1,32 @@
+# Tempo configuration; mounted read-only at /etc/tempo/tempo-config.yaml by docker-compose.yml.
+server:
+  http_listen_port: 3200 # HTTP API; also the target of the compose healthcheck (/ready)
+  grpc_listen_port: 4320 # Tempo's own gRPC server, kept apart from the OTLP receivers below
+  log_level: info
+
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: "0.0.0.0:4318" # Different port from server
+        http:
+          endpoint: "0.0.0.0:4319"
+
+ingester:
+  lifecycler:
+    ring:
+      kvstore:
+        store: inmemory
+  trace_idle_period: 5m
+  max_block_duration: 1h
+
+storage:
+  trace:
+    backend: local
+    # Keep the write-ahead log on the mounted tempo-data volume so it survives `docker-compose rm -f tempo` (clean.sh).
+    # NOTE(review): without this Tempo uses its built-in default WAL path (believed to be /var/tempo/wal), which is not on a volume - confirm.
+    wal:
+      path: /tempo-data/wal
+    local:
+      path: /tempo-data/traces