Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,10 @@ ADD start /
RUN ["chmod", "+x", "start"]

ENTRYPOINT ["/start"]
# Copy the health check script into the image at /health/healthcheck.sh.
COPY health/healthcheck.sh /health/healthcheck.sh
# Mark it executable so the HEALTHCHECK below can run it directly.
RUN chmod +x /health/healthcheck.sh

# Docker native healthcheck: run the script every 30s with a 5s limit per run,
# report the container unhealthy after 3 consecutive failures, and allow a 60s
# start period before failures count. "|| exit 1" maps any non-zero status
# from the script to exit code 1.
HEALTHCHECK --interval=30s --timeout=5s --retries=3 --start-period=60s \
CMD /health/healthcheck.sh || exit 1
16 changes: 16 additions & 0 deletions docker-compose.monitoring.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Monitoring stack: one Prometheus container and one Grafana container.
version: "3.9"

services:
# Prometheus, published on host port 9090. Its configuration file is
# bind-mounted from ./monitoring/prometheus.yml on the host.
# NOTE(review): no image tag is given, so the default tag is pulled; consider
# pinning a version.
prometheus:
image: prom/prometheus
container_name: pi-prometheus
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"

# Grafana, published on host port 3000.
# NOTE(review): no volume is declared here, so dashboards/settings presumably
# do not survive container removal - confirm whether that is intended.
grafana:
image: grafana/grafana
container_name: pi-grafana
ports:
- "3000:3000"
13 changes: 13 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Pi node deployment: a single service named "mainnet" whose container is
# named "pi-node".
services:
mainnet:
image: pinetwork/pi-node-docker:organization_mainnet-v1.3-p19.6
container_name: pi-node
# Restart automatically unless the container was stopped explicitly.
restart: unless-stopped
# Host directories bind-mounted into the container for node data and
# supervisor logs.
volumes:
- ./data/stellar:/opt/stellar
- ./data/logs:/var/log/supervisor
# host:container port mappings.
# NOTE(review): container port 8000 is the one the health scripts probe as
# Horizon; the purpose of 31402 and 1570 is not visible here - confirm.
ports:
- "31401:8000"
- "31402:31402"
- "31403:1570"
# Arguments passed to the image's entrypoint.
command: ["--mainnet", "--enable-auto-migrations"]
11 changes: 11 additions & 0 deletions health/alert_telegram.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# Send a Pi Node alert to a Telegram chat through the Bot API.
#
# Usage: alert_telegram.sh MESSAGE
#
# Environment:
#   BOT_TOKEN  Telegram bot token; falls back to the placeholder below.
#   CHAT_ID    Target chat id; falls back to the placeholder below.
#
# When called without MESSAGE nothing is sent, so the file can also be sourced
# for its send_telegram_alert function.

# Credentials come from the environment when set, so the secret does not have
# to be written into this file. The placeholders ("fill in your bot token /
# chat id") must be replaced if the environment is not used.
BOT_TOKEN="${BOT_TOKEN:-ISI_DENGAN_BOT_TOKEN_KAMU}"
CHAT_ID="${CHAT_ID:-ISI_DENGAN_CHAT_ID_KAMU}"

#######################################
# Post one alert message to Telegram.
# Globals:   BOT_TOKEN, CHAT_ID (read)
# Arguments: $1 - message body
# Outputs:   the Bot API response on stdout
# Returns:   curl's exit status
#######################################
send_telegram_alert() {
  local msg="$1"
  local text

  # Build the text with a real line break: "\n" inside double quotes is a
  # literal backslash followed by "n" and would be shown verbatim in the chat.
  printf -v text '🚨 [Pi Node Alert]\n%s' "$msg"

  # --data-urlencode keeps '&', '+', '%' and newlines in the message intact;
  # plain -d would let them corrupt the form body.
  # --max-time bounds the call so a network outage cannot stall the caller.
  # NOTE(review): parse_mode=HTML means '<' and '&' in the message are parsed
  # as markup by Telegram; callers should pass HTML-safe text.
  curl -s --max-time 10 -X POST \
    "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${CHAT_ID}" \
    --data-urlencode "text=${text}" \
    --data-urlencode "parse_mode=HTML"
}

if [[ $# -ge 1 ]]; then
  send_telegram_alert "$1"
fi
11 changes: 11 additions & 0 deletions health/auto_recover.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# Watchdog loop: run the health check once a minute and restart the
# supervised node services whenever it reports a failure.

# Restart stellar-core first, then horizon.
restart_services() {
  supervisorctl restart stellar-core
  supervisorctl restart horizon
}

until false; do
  /health/healthcheck.sh || {
    echo "[AUTO-RECOVER] Restarting services..."
    restart_services
  }
  sleep 60
done
13 changes: 13 additions & 0 deletions health/health/auto_recover.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
#
# Watchdog loop with notification: every 60 seconds run the health check;
# on failure announce it (stdout and Telegram) and restart the node services.

while :; do
  if ! /health/healthcheck.sh; then
    notice="Node unhealthy. Restarting services..."
    echo "[AUTO] $notice"
    /health/alert_telegram.sh "$notice"

    # Same order as always: core first, then horizon.
    for service in stellar-core horizon; do
      supervisorctl restart "$service"
    done
  fi
  sleep 60
done
34 changes: 34 additions & 0 deletions health/health/health/alert_manager.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
#
# Alert dispatcher: records an alert in LOG_FILE and, when enabled, forwards
# it to Telegram.
#
# Usage: alert_manager.sh MESSAGE [LEVEL]      (LEVEL defaults to "info")
#
# Settings read from CONFIG (or the environment):
#   ENABLE_TELEGRAM  "true" to forward alerts to Telegram (default: off)
#   TG_BOT_TOKEN     Telegram bot token
#   TG_CHAT_ID       Telegram chat id
#
# Delivery is best-effort: a missing config, an unwritable log file or a
# failed Telegram call is reported but never aborts the script, so callers
# running under `set -e` are not killed by an alerting problem.
set -euo pipefail

CONFIG="/opt/stellar/health/alert_config.env"
LOG_FILE="/var/log/pi-node-alerts.log"

# Load settings. Without a readable config the alert is still logged locally.
if [[ -r "$CONFIG" ]]; then
  # shellcheck source=/dev/null
  source "$CONFIG"
else
  printf 'alert_manager: config %s not readable; using environment/defaults\n' \
    "$CONFIG" >&2
fi

#######################################
# Append a timestamped line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: message words
#######################################
log() {
  printf '[%s] [ALERT] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE" \
    || printf 'alert_manager: cannot write to %s\n' "$LOG_FILE" >&2
}

#######################################
# Send one message through the Telegram Bot API.
# Globals:   TG_BOT_TOKEN, TG_CHAT_ID (read)
# Arguments: $1 - message text
# Returns:   0 on success, non-zero if unconfigured or the request failed
#######################################
send_telegram() {
  local msg="$1"

  if [[ -z "${TG_BOT_TOKEN:-}" || -z "${TG_CHAT_ID:-}" ]]; then
    log "Telegram not configured (TG_BOT_TOKEN/TG_CHAT_ID missing)"
    return 1
  fi

  # -f turns HTTP errors into a non-zero status, --max-time bounds the call,
  # and --data-urlencode keeps '&', '+', '%' in the message intact.
  curl -sf --max-time 10 -X POST \
    "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TG_CHAT_ID}" \
    --data-urlencode "text=${msg}" >/dev/null
}

#######################################
# Log an alert and forward it to the enabled channels.
# Globals:   ENABLE_TELEGRAM (read)
# Arguments: $1 - alert text
#            $2 - level, defaults to "info"
#######################################
send_alert() {
  local text="$1"
  local level="${2:-info}"
  local host message

  # Fall back to bash's HOSTNAME if the hostname binary is unavailable.
  host=$(hostname 2>/dev/null) || host="${HOSTNAME:-unknown}"
  message="[${host}] [$level] $text"
  log "$message"

  if [[ "${ENABLE_TELEGRAM:-false}" == "true" ]]; then
    send_telegram "$message" || log "Telegram delivery failed for: $message"
  fi
}

if [[ $# -ge 1 ]]; then
  send_alert "$1" "${2:-info}"
fi
46 changes: 46 additions & 0 deletions health/health/healthcheck_auto_recover.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Runs the health check once and, if it fails, starts a bounded
# restart-and-retry recovery with alerts.
set -euo pipefail

# Executable whose exit status decides whether the node is healthy.
CHECK_SCRIPT="/opt/stellar/health/healthcheck.sh"
# Executable invoked as: ALERT_SCRIPT <message> <level>.
ALERT_SCRIPT="/opt/stellar/health/alert_manager.sh"
# File that log() appends to.
LOG_FILE="/var/log/pi-node-recover.log"

# Upper bound on restart attempts in attempt_recover.
MAX_RETRIES=5
# Base wait in seconds; attempt_recover doubles it on every retry.
BASE_DELAY=10

#######################################
# Print a timestamped [RECOVER] line and append the same line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined with spaces
# Outputs:   the formatted line on stdout
#######################################
log() {
  printf '[%s] [RECOVER] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
    | tee -a "$LOG_FILE"
}

#######################################
# Run the alert script without letting its failure stop the recovery.
# Globals:   ALERT_SCRIPT (read)
# Arguments: $1 - message, $2 - level
#######################################
_recover_alert() {
  "$ALERT_SCRIPT" "$@" || log "Alert delivery failed: $1"
}

#######################################
# Restart the node container until the health check passes or the retry
# budget is used up, waiting BASE_DELAY * 2^n seconds after each restart.
# Globals:   CHECK_SCRIPT, ALERT_SCRIPT, MAX_RETRIES, BASE_DELAY (read)
#            PI_NODE_CONTAINER (read, optional; default "pi-node")
# Returns:   0 once the node is healthy, 1 if it is still unhealthy after
#            MAX_RETRIES restarts
#######################################
attempt_recover() {
  # Restart by container name: `docker compose restart` expects the service
  # name ("mainnet" in docker-compose.yml) and must run in the project
  # directory, so passing the container name "pi-node" to it always failed.
  local container="${PI_NODE_CONTAINER:-pi-node}"
  local retry=0
  local delay

  while true; do
    # Checked before every restart and once more after the last one, so a
    # successful final restart is not reported as a failure.
    if "$CHECK_SCRIPT"; then
      log "Node healthy again"
      _recover_alert "Node recovered successfully" "info"
      return 0
    fi

    (( retry < MAX_RETRIES )) || break

    delay=$(( BASE_DELAY * (2 ** retry) ))
    # Plain assignment: `((retry++))` returns status 1 when retry is 0,
    # which ended the whole script under `set -e` after the first attempt.
    retry=$(( retry + 1 ))
    log "Health failed. Restarting services... attempt=${retry} wait=${delay}s"
    _recover_alert "Node unhealthy. Restart attempt ${retry}" "warning"

    if ! docker restart "$container"; then
      log "Restart command failed for container ${container}"
    fi
    sleep "$delay"
  done

  log "Max retries reached. Node still unhealthy."
  _recover_alert "CRITICAL: Node recovery failed after $MAX_RETRIES attempts" "critical"
  return 1
}

#######################################
# Entry point: do nothing when the node is healthy, otherwise log the failure
# and hand over to attempt_recover.
# Globals:   CHECK_SCRIPT (read)
# Returns:   0 when healthy, otherwise the status of attempt_recover
#######################################
main() {
  if "$CHECK_SCRIPT"; then
    return 0
  fi

  log "Health check failed → starting recovery"
  attempt_recover
}

main
10 changes: 10 additions & 0 deletions health/health/metrics/metrics/metrics_server.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
#
# Tiny metrics endpoint: pipes a fixed HTTP 200 header followed by the output
# of /metrics/node_metrics.sh into a netcat listener, over and over.

PORT=9105

while :; do
  {
    # Status line and Content-Type end in CRLF; the header block is closed by
    # a bare LF, exactly as before.
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\n'
    /metrics/node_metrics.sh
  } | nc -l -p "${PORT}" -q 1
done
7 changes: 7 additions & 0 deletions health/health/metrics/metrics/monitoring/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Prometheus configuration: scrape the Pi node metrics endpoint.
global:
# Default interval between scrapes.
scrape_interval: 15s

scrape_configs:
# Single job targeting host "pi-node" on port 9105.
# NOTE(review): this presumably relies on Prometheus being able to resolve
# the name "pi-node" (e.g. a shared Docker network) - confirm.
- job_name: "pi-node"
static_configs:
- targets: ["pi-node:9105"]
6 changes: 6 additions & 0 deletions health/health/metrics/metrics/supervisord.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
; Supervisor program entry that runs /metrics/metrics_server.sh.
[program:metrics_server]
command=/metrics/metrics_server.sh
; Start together with supervisord and restart the program when it exits.
autostart=true
autorestart=true
; stderr and stdout are written to separate files under /var/log/supervisor.
stderr_logfile=/var/log/supervisor/metrics.err.log
stdout_logfile=/var/log/supervisor/metrics.out.log
12 changes: 12 additions & 0 deletions health/health/metrics/node_metrics.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
#
# Print the Pi node service status as Prometheus text-format gauges:
#   pi_node_horizon_up  1 if http://localhost:8000/ answers, else 0
#   pi_node_core_up     1 if http://localhost:11626/info answers, else 0
#
# Environment:
#   PROBE_TIMEOUT  seconds allowed per probe (default 3)

# Without a limit, a service that accepts the connection but never answers
# would block this script and whatever is waiting for its output.
PROBE_TIMEOUT="${PROBE_TIMEOUT:-3}"

#######################################
# Probe a URL and print 1 when it answers successfully, otherwise 0.
# Globals:   PROBE_TIMEOUT (read)
# Arguments: $1 - URL
# Outputs:   "1" or "0" on stdout
#######################################
probe_up() {
  if curl -sf --max-time "$PROBE_TIMEOUT" "$1" >/dev/null; then
    echo 1
  else
    echo 0
  fi
}

#######################################
# Print the HELP, TYPE and sample lines for one gauge.
# Arguments: $1 - metric name, $2 - help text, $3 - value
#######################################
emit_gauge() {
  printf '# HELP %s %s\n# TYPE %s gauge\n%s %s\n' "$1" "$2" "$1" "$1" "$3"
}

emit_gauge pi_node_horizon_up "Horizon service status" \
  "$(probe_up http://localhost:8000/)"
emit_gauge pi_node_core_up "Stellar-core service status" \
  "$(probe_up http://localhost:11626/info)"
6 changes: 6 additions & 0 deletions health/health/supervisord.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
; Supervisor program entry that runs /health/auto_recover.sh.
[program:auto_recover]
command=/health/auto_recover.sh
; Start together with supervisord and restart the program when it exits.
autostart=true
autorestart=true
; stderr and stdout are written to separate files under /var/log/supervisor.
stderr_logfile=/var/log/supervisor/auto_recover.err.log
stdout_logfile=/var/log/supervisor/auto_recover.out.log
20 changes: 20 additions & 0 deletions health/healthcheck.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
#
# Liveness probe for the Pi node: succeeds only when both Horizon and
# stellar-core answer over HTTP on localhost.
#
# Environment:
#   HEALTH_PROBE_TIMEOUT  seconds allowed per request (default 5)
#
# Exit status: 0 when healthy, 1 when either service does not answer.
set -e

HORIZON_URL="http://localhost:8000/"
CORE_INFO_URL="http://localhost:11626/info"

# Bound every request. Without a limit, a service that accepts the connection
# but never answers makes this script hang, and a watchdog loop waiting on it
# would never get to restart anything. The default matches the 5s timeout of
# the Dockerfile HEALTHCHECK.
PROBE_TIMEOUT="${HEALTH_PROBE_TIMEOUT:-5}"

# Check Horizon
if ! curl -sf --max-time "$PROBE_TIMEOUT" "$HORIZON_URL" > /dev/null; then
  echo "[HEALTH] Horizon is DOWN"
  exit 1
fi

# Check stellar-core
if ! curl -sf --max-time "$PROBE_TIMEOUT" "$CORE_INFO_URL" > /dev/null; then
  echo "[HEALTH] stellar-core is DOWN"
  exit 1
fi

echo "[HEALTH] Pi Node is HEALTHY"
exit 0
47 changes: 47 additions & 0 deletions health/healthcheck_v2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Extended health check: Horizon HTTP, the stellar-core port, PostgreSQL
# readiness and free disk space, with every result logged.
set -euo pipefail

# File that log() appends to.
LOG_FILE="/var/log/pi-node-health.log"
# Minimum free space, in GB, required by check_disk.
MIN_DISK_GB=10
# Base URL of the Horizon API; main() requests "${HORIZON_URL}/".
HORIZON_URL="http://localhost:8000"
# TCP port on localhost that main() probes for stellar-core.
CORE_PORT=11626

#######################################
# Print a timestamped [HEALTH] line and append the same line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined with spaces
# Outputs:   the formatted line on stdout
#######################################
log() {
  printf '[%s] [HEALTH] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
    | tee -a "$LOG_FILE"
}

#######################################
# Run a probe command and log whether it succeeded.
# Arguments: $1 - label used in the log line
#            $2 - command string, evaluated with its output discarded
# Returns:   0 if the command succeeded, 1 otherwise
# NOTE(review): the command string goes through eval, so it must only ever
# come from trusted, hard-coded callers.
#######################################
check_service() {
  local label="$1"
  local probe="$2"

  if ! eval "$probe" >/dev/null 2>&1; then
    log "FAIL: $label"
    return 1
  fi

  log "OK: $label"
  return 0
}

#######################################
# Verify that a filesystem has at least MIN_DISK_GB gigabytes free.
# Globals:   MIN_DISK_GB (read)
# Arguments: $1 - path on the filesystem to check (optional, default "/")
# Returns:   0 if enough space is free; 1 if space is low or the free space
#            could not be determined
#######################################
check_disk() {
  local path="${1:-/}"
  local avail

  # -P keeps each filesystem on a single line, so the value is always the
  # fourth field of line 2.
  avail=$(df -P -BG -- "$path" | awk 'NR==2 {gsub("G", "", $4); print $4}') \
    || avail=""

  # Fail closed: a non-numeric value used to make the arithmetic test error
  # out and fall through to the "OK" branch.
  if [[ ! "$avail" =~ ^[0-9]+$ ]]; then
    log "FAIL: Disk space unknown (cannot read free space for ${path})"
    return 1
  fi

  if (( avail < MIN_DISK_GB )); then
    log "FAIL: Disk space low (${avail}GB)"
    return 1
  fi
  log "OK: Disk space ${avail}GB"
}

#######################################
# Entry point: run every check in order and stop at the first failure.
# Globals:   HORIZON_URL, CORE_PORT (read)
# Returns:   0 when all checks pass, 1 as soon as one fails
#######################################
main() {
  log "Starting health check..."

  if ! check_service "Horizon API" "curl -sf ${HORIZON_URL}/"; then
    return 1
  fi
  if ! check_service "Stellar-Core Port" "nc -z localhost ${CORE_PORT}"; then
    return 1
  fi
  if ! check_service "PostgreSQL" "pg_isready"; then
    return 1
  fi
  if ! check_disk; then
    return 1
  fi

  log "Health check PASSED"
  return 0
}

main
6 changes: 6 additions & 0 deletions supervisord.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
; Supervisor program entry that runs /health/auto_recover.sh.
[program:auto_recover]
command=/health/auto_recover.sh
; Start together with supervisord and restart the program when it exits.
autostart=true
autorestart=true
; stderr and stdout are written to separate files under /var/log/supervisor.
stderr_logfile=/var/log/supervisor/auto_recover.err.log
stdout_logfile=/var/log/supervisor/auto_recover.out.log