diff --git a/Dockerfile b/Dockerfile
index 2b48c5fb4..3017cc97a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,3 +38,10 @@
 ADD start /
 RUN ["chmod", "+x", "start"]
 ENTRYPOINT ["/start"]
+# Install the probe script that the HEALTHCHECK below executes
+COPY health/healthcheck.sh /health/healthcheck.sh
+RUN chmod +x /health/healthcheck.sh
+
+# Probe every 30s; failures during the first 60s do not count toward retries
+HEALTHCHECK --interval=30s --timeout=5s --retries=3 --start-period=60s \
+  CMD /health/healthcheck.sh || exit 1
diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml
new file mode 100644
index 000000000..0f1be202c
--- /dev/null
+++ b/docker-compose.monitoring.yml
@@ -0,0 +1,22 @@
+# Monitoring stack (Prometheus + Grafana) for the Pi node.
+# Start it together with the node so both share one Compose network and
+# Prometheus can resolve its "pi-node:9105" scrape target:
+#   docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d
+version: "3.9"
+
+services:
+  prometheus:
+    # Pinned tag: an untagged image resolves to :latest and is not reproducible.
+    image: prom/prometheus:v2.53.0
+    container_name: pi-prometheus
+    volumes:
+      # Location of the scrape config as committed in this change set.
+      - ./health/health/metrics/metrics/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+    ports:
+      - "9090:9090"
+
+  grafana:
+    image: grafana/grafana:11.1.0
+    container_name: pi-grafana
+    ports:
+      - "3000:3000"
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 000000000..55d4dc13a
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,13 @@
+services:
+  mainnet:
+    image: pinetwork/pi-node-docker:organization_mainnet-v1.3-p19.6
+    container_name: pi-node
+    restart: unless-stopped
+    volumes:
+      - ./data/stellar:/opt/stellar
+      - ./data/logs:/var/log/supervisor
+    ports:
+      - "31401:8000"
+      - "31402:31402"
+      - "31403:1570"
+    command: ["--mainnet", "--enable-auto-migrations"]
diff --git a/health/alert_telegram.sh b/health/alert_telegram.sh
new file mode 100644
index 000000000..1743d99d0
--- /dev/null
+++ b/health/alert_telegram.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Send one alert message to a Telegram chat through the Bot API.
+# Usage: alert_telegram.sh "<message>"
+
+# Credentials can be injected through the environment; the placeholders are
+# only a fallback and must be replaced before anything can be delivered.
+BOT_TOKEN="${BOT_TOKEN:-ISI_DENGAN_BOT_TOKEN_KAMU}"
+CHAT_ID="${CHAT_ID:-ISI_DENGAN_CHAT_ID_KAMU}"
+
+MSG="${1:-}"
+
+# $'\n' is a real newline. Inside ordinary double quotes "\n" stays a literal
+# backslash + n and would show up verbatim in the Telegram message.
+TEXT=$'🚨 [Pi Node Alert]\n'"${MSG}"
+
+# --data-urlencode stops characters such as '&', '+' or '%' in the message
+# from corrupting the form body; --max-time bounds a stalled API call so the
+# caller is never blocked indefinitely.
+curl -s --max-time 10 -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
+  --data-urlencode chat_id="${CHAT_ID}" \
+  --data-urlencode text="${TEXT}" \
+  -d parse_mode="HTML"
diff --git a/health/auto_recover.sh b/health/auto_recover.sh
new file mode 100644
index 000000000..6265794d4
--- /dev/null
+++ 
b/health/auto_recover.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Watchdog loop: probe the node once a minute and restart both services
+# whenever the probe reports a failure.
+
+while :; do
+  if ! /health/healthcheck.sh; then
+    echo "[AUTO-RECOVER] Restarting services..."
+    for svc in stellar-core horizon; do
+      supervisorctl restart "$svc"
+    done
+  fi
+  sleep 60
+done
diff --git a/health/health/auto_recover.sh b/health/health/auto_recover.sh
new file mode 100644
index 000000000..f6d66ea08
--- /dev/null
+++ b/health/health/auto_recover.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Watchdog loop with alerting: probe the node once a minute; on failure,
+# log, send a Telegram alert, then restart both services.
+
+while :; do
+  if ! /health/healthcheck.sh; then
+    notice="Node unhealthy. Restarting services..."
+    echo "[AUTO] $notice"
+    /health/alert_telegram.sh "$notice"
+    for svc in stellar-core horizon; do
+      supervisorctl restart "$svc"
+    done
+  fi
+  sleep 60
+done
diff --git a/health/health/health/MONITORING.md b/health/health/health/MONITORING.md
new file mode 100644
index 000000000..306cd3d99
--- /dev/null
+++ b/health/health/health/MONITORING.md
@@ -0,0 +1,17 @@
+# Pi Node Docker – Enterprise Monitoring & Auto-Recovery
+
+This document describes the production-grade monitoring, alerting, and auto-recovery stack for Pi Node Docker.
+
+## 🚀 Features
+
+- Advanced Health Check (Horizon, Stellar-Core, PostgreSQL, disk, sync)
+- Smart Auto-Recovery with exponential backoff
+- Real-time Telegram alerts (extensible to Slack/Discord/Email)
+- Prometheus metrics exporter
+- Grafana dashboard (ready-to-import)
+- Prometheus alert rules
+- One-command setup script
+
+---
+
+## 📁 Directory Structure
diff --git a/health/health/health/alert_manager.sh b/health/health/health/alert_manager.sh
new file mode 100644
index 000000000..a0e61d997
--- /dev/null
+++ b/health/health/health/alert_manager.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Alert dispatcher: appends every alert to LOG_FILE and, when ENABLE_TELEGRAM
+# is "true", forwards it to a Telegram chat.
+# Usage: alert_manager.sh "<text>" [level]   (level defaults to "info")
+set -euo pipefail
+
+CONFIG="/opt/stellar/health/alert_config.env"
+LOG_FILE="/var/log/pi-node-alerts.log"
+
+# ENABLE_TELEGRAM, TG_BOT_TOKEN and TG_CHAT_ID are presumably defined there.
+# A missing file must not abort the script: alerts are still logged.
+if [[ -r "$CONFIG" ]]; then
+  # shellcheck source=/dev/null
+  source "$CONFIG"
+fi
+
+# Append a timestamped line to LOG_FILE.
+log() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] [ALERT] $*" >> "$LOG_FILE"
+}
+
+# POST $1 to the Telegram Bot API; returns curl's exit status.
+send_telegram() {
+  local msg="$1"
+  # --data-urlencode keeps '&', '+', '%' etc. in the text from corrupting the
+  # form body; --max-time bounds a stalled request.
+  curl -s --max-time 10 -X POST \
+    "https://api.telegram.org/bot${TG_BOT_TOKEN:-}/sendMessage" \
+    --data-urlencode chat_id="${TG_CHAT_ID:-}" \
+    --data-urlencode text="$msg" >/dev/null
+}
+
+# Log "[host] [level] text" and forward it to Telegram when enabled.
+send_alert() {
+  local text="$1"
+  local level="${2:-info}"
+  local message
+
+  message="[$(hostname)] [$level] $text"
+  log "$message"
+
+  # Default to "false" so an unset variable does not trip `set -u`.
+  if [[ "${ENABLE_TELEGRAM:-false}" == "true" ]]; then
+    # Delivery is best-effort: a network error is logged instead of making
+    # this script (and a caller running under `set -e`) exit non-zero.
+    send_telegram "$message" || log "Telegram delivery failed for: $message"
+  fi
+}
+
+if [[ $# -ge 1 ]]; then
+  send_alert "$1" "${2:-info}"
+fi
diff --git a/health/health/healthcheck_auto_recover.sh b/health/health/healthcheck_auto_recover.sh
new file mode 100644
index 000000000..fbc443bee
--- /dev/null
+++ b/health/health/healthcheck_auto_recover.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# One-shot recovery driver: if the health probe fails, restart the node
+# container up to MAX_RETRIES times, doubling the wait after each restart.
+# Exit status: 0 when the node is (or becomes) healthy, 1 when it does not.
+set -euo pipefail
+
+CHECK_SCRIPT="/opt/stellar/health/healthcheck.sh"
+ALERT_SCRIPT="/opt/stellar/health/alert_manager.sh"
+LOG_FILE="/var/log/pi-node-recover.log"
+
+MAX_RETRIES=5
+BASE_DELAY=10
+
+# Print a timestamped line and append it to LOG_FILE.
+log() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] [RECOVER] $*" | tee -a "$LOG_FILE"
+}
+
+# Send an alert without letting a failing alert script abort recovery
+# (under `set -e` an unguarded non-zero exit would terminate this script).
+notify() {
+  "$ALERT_SCRIPT" "$@" || log "Alert delivery failed: $1"
+}
+
+# Restart-and-wait loop with exponential backoff (BASE_DELAY * 2^attempt).
+# Returns 0 as soon as the probe passes, 1 once every attempt is used up.
+attempt_recover() {
+  local retry=0
+  local delay
+
+  while (( retry < MAX_RETRIES )); do
+    if "$CHECK_SCRIPT"; then
+      log "Node healthy again"
+      notify "Node recovered successfully" "info"
+      return 0
+    fi
+
+    delay=$(( BASE_DELAY * (2 ** retry) ))
+    log "Health failed. Restarting services... attempt=$((retry+1)) wait=${delay}s"
+    notify "Node unhealthy. Restart attempt $((retry+1))" "warning"
+
+    # `docker compose restart` takes a service name, but the compose file
+    # names the service "mainnet"; "pi-node" is its container_name, so the
+    # container is restarted directly.
+    docker restart pi-node || true
+    sleep "$delay"
+
+    # Not ((retry++)): that expression evaluates to 0 on the first pass,
+    # returns status 1 and makes `set -e` kill the script after one attempt.
+    retry=$(( retry + 1 ))
+  done
+
+  # The last restart has not been probed yet; check once more before
+  # declaring failure.
+  if "$CHECK_SCRIPT"; then
+    log "Node healthy again"
+    notify "Node recovered successfully" "info"
+    return 0
+  fi
+
+  log "Max retries reached. Node still unhealthy."
+  notify "CRITICAL: Node recovery failed after $MAX_RETRIES attempts" "critical"
+  return 1
+}
+
+# Entry point: start recovery only when the probe currently fails.
+main() {
+  if ! "$CHECK_SCRIPT"; then
+    log "Health check failed → starting recovery"
+    attempt_recover
+  fi
+}
+
+main
diff --git a/health/health/metrics/metrics/metrics_server.sh b/health/health/metrics/metrics/metrics_server.sh
new file mode 100644
index 000000000..033fadfb1
--- /dev/null
+++ b/health/health/metrics/metrics/metrics_server.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# Minimal HTTP endpoint for Prometheus: answers each connection on PORT with
+# the output of node_metrics.sh.
+# NOTE: the response is generated before nc accepts the connection, so a
+# scrape receives values collected when the previous request finished.
+
+PORT=9105
+
+while true; do
+  {
+    # printf emits exact CRLF line endings and the empty CRLF line that ends
+    # the header block; `echo -e` appended a bare LF instead.
+    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nConnection: close\r\n\r\n'
+    /metrics/node_metrics.sh
+  } | nc -l -p "$PORT" -q 1 || sleep 1  # back off instead of spinning if nc cannot listen
+done
diff --git a/health/health/metrics/metrics/monitoring/prometheus.yml b/health/health/metrics/metrics/monitoring/prometheus.yml
new file mode 100644
index 000000000..1f00d8a91
--- /dev/null
+++ b/health/health/metrics/metrics/monitoring/prometheus.yml
@@ -0,0 +1,7 @@
+global:
+  scrape_interval: 15s
+
+scrape_configs:
+  - job_name: "pi-node"
+    static_configs:
+      - targets: ["pi-node:9105"]
diff --git a/health/health/metrics/metrics/supervisord.conf b/health/health/metrics/metrics/supervisord.conf
new file mode 100644
index 000000000..9e596f022
--- /dev/null
+++ b/health/health/metrics/metrics/supervisord.conf
@@ -0,0 +1,6 @@
+[program:metrics_server]
+command=/metrics/metrics_server.sh
+autostart=true
+autorestart=true
+stderr_logfile=/var/log/supervisor/metrics.err.log
+stdout_logfile=/var/log/supervisor/metrics.out.log
diff --git a/health/health/metrics/node_metrics.sh b/health/health/metrics/node_metrics.sh
new file mode 100644
index 000000000..a1b086142
--- /dev/null
+++ 
b/health/health/metrics/node_metrics.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Print Prometheus gauges (1 = up, 0 = down) for Horizon and stellar-core.
+
+# Echo 1 when the URL answers without an HTTP error status, else 0.
+# --max-time bounds the probe so a hung service cannot stall the exporter.
+probe() {
+  if curl -sf --max-time 3 "$1" > /dev/null; then
+    echo 1
+  else
+    echo 0
+  fi
+}
+
+HORIZON_OK=$(probe http://localhost:8000/)
+CORE_OK=$(probe http://localhost:11626/info)
+
+echo "# HELP pi_node_horizon_up Horizon service status"
+echo "# TYPE pi_node_horizon_up gauge"
+echo "pi_node_horizon_up $HORIZON_OK"
+
+echo "# HELP pi_node_core_up Stellar-core service status"
+echo "# TYPE pi_node_core_up gauge"
+echo "pi_node_core_up $CORE_OK"
diff --git a/health/health/supervisord.conf b/health/health/supervisord.conf
new file mode 100644
index 000000000..8b8ee7286
--- /dev/null
+++ b/health/health/supervisord.conf
@@ -0,0 +1,6 @@
+[program:auto_recover]
+command=/health/auto_recover.sh
+autostart=true
+autorestart=true
+stderr_logfile=/var/log/supervisor/auto_recover.err.log
+stdout_logfile=/var/log/supervisor/auto_recover.out.log
diff --git a/health/healthcheck.sh b/health/healthcheck.sh
new file mode 100644
index 000000000..49b9680a1
--- /dev/null
+++ b/health/healthcheck.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Liveness probe for the Pi node: exits 0 when both Horizon and stellar-core
+# answer over HTTP, 1 as soon as one of them does not.
+set -e
+
+HORIZON_URL="http://localhost:8000/"
+CORE_INFO_URL="http://localhost:11626/info"
+# Per-request limit in seconds (override through the environment). Docker's
+# HEALTHCHECK enforces its own timeout, but callers that run this script
+# directly would otherwise wait indefinitely on a service that accepts the
+# connection and never answers.
+PROBE_TIMEOUT="${PROBE_TIMEOUT:-5}"
+
+# Check Horizon
+if ! curl -sf --max-time "$PROBE_TIMEOUT" "$HORIZON_URL" > /dev/null; then
+  echo "[HEALTH] Horizon is DOWN"
+  exit 1
+fi
+
+# Check stellar-core
+if ! curl -sf --max-time "$PROBE_TIMEOUT" "$CORE_INFO_URL" > /dev/null; then
+  echo "[HEALTH] stellar-core is DOWN"
+  exit 1
+fi
+
+echo "[HEALTH] Pi Node is HEALTHY"
+exit 0
diff --git a/health/healthcheck_v2.sh b/health/healthcheck_v2.sh
new file mode 100644
index 000000000..ff39c4a27
--- /dev/null
+++ b/health/healthcheck_v2.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Extended health probe: Horizon API, stellar-core port, PostgreSQL and free
+# disk space. Exits 0 when every check passes, 1 at the first failure.
+set -euo pipefail
+
+LOG_FILE="/var/log/pi-node-health.log"
+MIN_DISK_GB=10
+HORIZON_URL="http://localhost:8000"
+CORE_PORT=11626
+
+# Print a timestamped line and append it to LOG_FILE. Logging is best-effort:
+# an unwritable log file must not turn into a failed health check.
+log() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] [HEALTH] $*" | tee -a "$LOG_FILE" || true
+}
+
+# Run a command silently and log OK/FAIL under the given label.
+# Usage: check_service <label> <command> [args...]
+check_service() {
+  local name="$1"
+  shift
+  # The command is executed as an argument vector rather than through eval,
+  # so it is not subjected to a second round of shell parsing.
+  if "$@" >/dev/null 2>&1; then
+    log "OK: $name"
+    return 0
+  else
+    log "FAIL: $name"
+    return 1
+  fi
+}
+
+# Fail when the root filesystem has less than MIN_DISK_GB gigabytes free.
+check_disk() {
+  local avail
+  avail=$(df -BG / | awk 'NR==2{gsub("G","",$4);print $4}')
+  # Report unparsable df output explicitly instead of letting the arithmetic
+  # test below coerce or reject it silently.
+  if [[ ! "$avail" =~ ^[0-9]+$ ]]; then
+    log "FAIL: Unable to determine free disk space"
+    return 1
+  fi
+  if (( avail < MIN_DISK_GB )); then
+    log "FAIL: Disk space low (${avail}GB)"
+    return 1
+  fi
+  log "OK: Disk space ${avail}GB"
+}
+
+# Run all checks in order and stop at the first failure.
+main() {
+  log "Starting health check..."
+
+  check_service "Horizon API" curl -sf --max-time 5 "${HORIZON_URL}/" || return 1
+  check_service "Stellar-Core Port" nc -z localhost "${CORE_PORT}" || return 1
+  check_service "PostgreSQL" pg_isready || return 1
+  check_disk || return 1
+
+  log "Health check PASSED"
+  return 0
+}
+
+main
diff --git a/supervisord.conf b/supervisord.conf
new file mode 100644
index 000000000..8b8ee7286
--- /dev/null
+++ b/supervisord.conf
@@ -0,0 +1,6 @@
+[program:auto_recover]
+command=/health/auto_recover.sh
+autostart=true
+autorestart=true
+stderr_logfile=/var/log/supervisor/auto_recover.err.log
+stdout_logfile=/var/log/supervisor/auto_recover.out.log