From 927af54ac8db6b5224dedc76e946ea8026d9fa2e Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:06:33 +0700
Subject: [PATCH 01/17] Add Telegram alert script for notifications

---
 health/alert_telegram.sh | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 health/alert_telegram.sh

diff --git a/health/alert_telegram.sh b/health/alert_telegram.sh
new file mode 100644
index 00000000..1743d99d
--- /dev/null
+++ b/health/alert_telegram.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Send a Pi Node alert to a Telegram chat.
+# Usage: alert_telegram.sh "message text"
+# BOT_TOKEN / CHAT_ID may be supplied via the environment; otherwise replace
+# the placeholders below ("fill in with your bot token / chat id").
+
+BOT_TOKEN="${BOT_TOKEN:-ISI_DENGAN_BOT_TOKEN_KAMU}"
+CHAT_ID="${CHAT_ID:-ISI_DENGAN_CHAT_ID_KAMU}"
+
+MSG="${1:-}"
+
+# --data-urlencode keeps '&', '+', '%' and newlines in the text intact, and
+# $'\n' is a real newline (a "\n" inside double quotes is sent literally).
+curl -s -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
+  --data-urlencode "chat_id=${CHAT_ID}" \
+  --data-urlencode "text=🚨 [Pi Node Alert]"$'\n'"${MSG}" \
+  --data-urlencode "parse_mode=HTML"

From 013aaeb204d13f0174046740d993f9fae8c5c1d6 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:07:08 +0700
Subject: [PATCH 02/17] Add auto_recover.sh for health monitoring

Implement an auto-recovery script that checks node health and restarts services if unhealthy.
---
 health/health/auto_recover.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 health/health/auto_recover.sh

diff --git a/health/health/auto_recover.sh b/health/health/auto_recover.sh
new file mode 100644
index 00000000..f6d66ea0
--- /dev/null
+++ b/health/health/auto_recover.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+while true; do
+  /health/healthcheck.sh
+  if [ $? -ne 0 ]; then
+    MSG="Node unhealthy. Restarting services..."
+    echo "[AUTO] $MSG"
+    /health/alert_telegram.sh "$MSG"
+    supervisorctl restart stellar-core
+    supervisorctl restart horizon
+  fi
+  sleep 60
+done

From bce730e3cd3f50f7bf9cdfe69b6fe77550c58809 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:08:27 +0700
Subject: [PATCH 03/17] Add supervisord configuration for auto_recover program

---
 health/health/supervisord.conf | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 health/health/supervisord.conf

diff --git a/health/health/supervisord.conf b/health/health/supervisord.conf
new file mode 100644
index 00000000..8b8ee728
--- /dev/null
+++ b/health/health/supervisord.conf
@@ -0,0 +1,6 @@
+[program:auto_recover]
+command=/health/auto_recover.sh
+autostart=true
+autorestart=true
+stderr_logfile=/var/log/supervisor/auto_recover.err.log
+stdout_logfile=/var/log/supervisor/auto_recover.out.log

From 3e4469f935f42509909106ffc60325f46ab42f43 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:09:12 +0700
Subject: [PATCH 04/17] Create node_metrics.sh for service health checks

Add a script to check the status of Horizon and Stellar-core services.
---
 health/health/metrics/node_metrics.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 health/health/metrics/node_metrics.sh

diff --git a/health/health/metrics/node_metrics.sh b/health/health/metrics/node_metrics.sh
new file mode 100644
index 00000000..a1b08614
--- /dev/null
+++ b/health/health/metrics/node_metrics.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+HORIZON_OK=$(curl -sf http://localhost:8000/ > /dev/null && echo 1 || echo 0)
+CORE_OK=$(curl -sf http://localhost:11626/info > /dev/null && echo 1 || echo 0)
+
+echo "# HELP pi_node_horizon_up Horizon service status"
+echo "# TYPE pi_node_horizon_up gauge"
+echo "pi_node_horizon_up $HORIZON_OK"
+
+echo "# HELP pi_node_core_up Stellar-core service status"
+echo "# TYPE pi_node_core_up gauge"
+echo "pi_node_core_up $CORE_OK"

From 36117a9e9e77c5e58a5bbe567189bdd774650e77 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:09:47 +0700
Subject: [PATCH 05/17] Add metrics server script to serve metrics

---
 health/health/metrics/metrics/metrics_server.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 health/health/metrics/metrics/metrics_server.sh

diff --git a/health/health/metrics/metrics/metrics_server.sh b/health/health/metrics/metrics/metrics_server.sh
new file mode 100644
index 00000000..033fadfb
--- /dev/null
+++ b/health/health/metrics/metrics/metrics_server.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Minimal HTTP endpoint for Prometheus: answer every connection on $PORT with
+# the output of node_metrics.sh, one request per nc invocation.
+
+PORT=9105
+
+while true; do
+  {
+    # printf gives an exact CRLF-terminated header block ending in a blank line.
+    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\n'
+    /metrics/node_metrics.sh
+  } | nc -l -p "$PORT" -q 1
+done

From cb6eb368abe73dd1455ce9a7cd3dd0c978eef313 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:10:22 +0700
Subject: [PATCH 06/17] Add supervisord configuration for metrics server

---
 health/health/metrics/metrics/supervisord.conf | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 health/health/metrics/metrics/supervisord.conf

diff --git a/health/health/metrics/metrics/supervisord.conf b/health/health/metrics/metrics/supervisord.conf
new file mode 100644
index 00000000..9e596f02
--- /dev/null
+++ b/health/health/metrics/metrics/supervisord.conf
@@ -0,0 +1,6 @@
+[program:metrics_server]
+command=/metrics/metrics_server.sh
+autostart=true
+autorestart=true
+stderr_logfile=/var/log/supervisor/metrics.err.log
+stdout_logfile=/var/log/supervisor/metrics.out.log

From fd7e2ea21713b2f459218fc34cb65e2031bcc022 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:11:02 +0700
Subject: [PATCH 07/17] Add Prometheus configuration for pi-node monitoring

---
 health/health/metrics/metrics/monitoring/prometheus.yml | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 health/health/metrics/metrics/monitoring/prometheus.yml

diff --git a/health/health/metrics/metrics/monitoring/prometheus.yml b/health/health/metrics/metrics/monitoring/prometheus.yml
new file mode 100644
index 00000000..1f00d8a9
--- /dev/null
+++ b/health/health/metrics/metrics/monitoring/prometheus.yml
@@ -0,0 +1,7 @@
+global:
+  scrape_interval: 15s
+
+scrape_configs:
+  - job_name: "pi-node"
+    static_configs:
+      - targets: ["pi-node:9105"]

From cc2f2aa73bff414790e1b5fb2a9fc82ee84df4fa Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:12:30 +0700
Subject: [PATCH 08/17] Add Docker Compose configuration for monitoring services

---
 docker-compose.monitoring.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 docker-compose.monitoring.yml

diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml
new file mode 100644
index 00000000..0f1be202
--- /dev/null
+++ b/docker-compose.monitoring.yml
@@ -0,0 +1,17 @@
+version: "3.9"
+
+services:
+  prometheus:
+    image: prom/prometheus
+    container_name: pi-prometheus
+    volumes:
+      # Path is relative to this compose file; the config is added under health/.
+      - ./health/health/metrics/metrics/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
+    ports:
+      - "9090:9090"
+
+  grafana:
+    image: grafana/grafana
+    container_name: pi-grafana
+    ports:
+      - "3000:3000"

From 2b9f339ec73f96a0ef9c07622d38525d8d6773f1 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:18:32 +0700
Subject: [PATCH 09/17] Add healthcheck script for Horizon and stellar-core

This script checks the health of the Horizon and stellar-core services by making HTTP requests and reporting their status.
---
 health/healthcheck.sh | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 health/healthcheck.sh

diff --git a/health/healthcheck.sh b/health/healthcheck.sh
new file mode 100644
index 00000000..49b9680a
--- /dev/null
+++ b/health/healthcheck.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+set -e
+
+HORIZON_URL="http://localhost:8000/"
+CORE_INFO_URL="http://localhost:11626/info"
+
+# Check Horizon
+if ! curl -sf "$HORIZON_URL" > /dev/null; then
+  echo "[HEALTH] Horizon is DOWN"
+  exit 1
+fi
+
+# Check stellar-core
+if ! curl -sf "$CORE_INFO_URL" > /dev/null; then
+  echo "[HEALTH] stellar-core is DOWN"
+  exit 1
+fi
+
+echo "[HEALTH] Pi Node is HEALTHY"
+exit 0

From 89e8c4e973bf15a3c348c0621547bdc4d89a3b5d Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:20:14 +0700
Subject: [PATCH 10/17] Add health check support to Dockerfile

Add health check script and configure Docker healthcheck.
---
 Dockerfile | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 2b48c5fb..3017cc97 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,3 +38,10 @@ ADD start /
 RUN ["chmod", "+x", "start"]
 
 ENTRYPOINT ["/start"]
+# Copy health check script
+COPY health/healthcheck.sh /health/healthcheck.sh
+RUN chmod +x /health/healthcheck.sh
+
+# Docker native healthcheck
+HEALTHCHECK --interval=30s --timeout=5s --retries=3 --start-period=60s \
+  CMD /health/healthcheck.sh || exit 1

From d739fba08b534e080543505cdcc36584f833072a Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:23:17 +0700
Subject: [PATCH 11/17] Add Docker Compose configuration for mainnet service

---
 docker-compose.yml | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 docker-compose.yml

diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 00000000..55d4dc13
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,13 @@
+services:
+  mainnet:
+    image: pinetwork/pi-node-docker:organization_mainnet-v1.3-p19.6
+    container_name: pi-node
+    restart: unless-stopped
+    volumes:
+      - ./data/stellar:/opt/stellar
+      - ./data/logs:/var/log/supervisor
+    ports:
+      - "31401:8000"
+      - "31402:31402"
+      - "31403:1570"
+    command: ["--mainnet", "--enable-auto-migrations"]

From 79d056a0c6aed6a83b66f5d9339113a3a4434c94 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:23:55 +0700
Subject: [PATCH 12/17] Add auto-recovery script for service monitoring

This script continuously checks the health of services and restarts them if they are not healthy.
---
 health/auto_recover.sh | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 health/auto_recover.sh

diff --git a/health/auto_recover.sh b/health/auto_recover.sh
new file mode 100644
index 00000000..6265794d
--- /dev/null
+++ b/health/auto_recover.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+while true; do
+  /health/healthcheck.sh
+  if [ $? -ne 0 ]; then
+    echo "[AUTO-RECOVER] Restarting services..."
+    supervisorctl restart stellar-core
+    supervisorctl restart horizon
+  fi
+  sleep 60
+done

From 25b815e3eb5c88e782835b8610d1e4f6d27d48b2 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 01:25:51 +0700
Subject: [PATCH 13/17] Add auto_recover program configuration to supervisord

---
 supervisord.conf | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 supervisord.conf

diff --git a/supervisord.conf b/supervisord.conf
new file mode 100644
index 00000000..8b8ee728
--- /dev/null
+++ b/supervisord.conf
@@ -0,0 +1,6 @@
+[program:auto_recover]
+command=/health/auto_recover.sh
+autostart=true
+autorestart=true
+stderr_logfile=/var/log/supervisor/auto_recover.err.log
+stdout_logfile=/var/log/supervisor/auto_recover.out.log

From 605486db9b35177d8951e9cab4f8ceea85a06e90 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 09:33:27 +0700
Subject: [PATCH 14/17] Add healthcheck_v2.sh for service and disk monitoring

Implement a health check script to monitor services and disk space.
---
 health/healthcheck_v2.sh | 55 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 health/healthcheck_v2.sh

diff --git a/health/healthcheck_v2.sh b/health/healthcheck_v2.sh
new file mode 100644
index 00000000..ff39c4a2
--- /dev/null
+++ b/health/healthcheck_v2.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Extended node health check: Horizon API, stellar-core port, PostgreSQL and
+# free disk space on /. Exits 0 when every check passes, non-zero otherwise.
+set -euo pipefail
+
+LOG_FILE="/var/log/pi-node-health.log"
+MIN_DISK_GB=10
+HORIZON_URL="http://localhost:8000"
+CORE_PORT=11626
+
+# Print a timestamped message and append it to LOG_FILE.
+log() {
+  # An unwritable log file must not turn a healthy node into a failed check.
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] [HEALTH] $*" | tee -a "$LOG_FILE" || true
+}
+
+# check_service NAME COMMAND [ARGS...]
+# Run COMMAND silently and log OK/FAIL for NAME; returns 0 on success, else 1.
+check_service() {
+  local name="$1"
+  shift
+  # The command is passed as separate words, so no eval is needed.
+  if "$@" >/dev/null 2>&1; then
+    log "OK: $name"
+    return 0
+  else
+    log "FAIL: $name"
+    return 1
+  fi
+}
+
+# Fail when the root filesystem has less than MIN_DISK_GB gigabytes available.
+check_disk() {
+  local avail
+  avail=$(df -BG / | awk 'NR==2{gsub("G","",$4);print $4}')
+  if (( avail < MIN_DISK_GB )); then
+    log "FAIL: Disk space low (${avail}GB)"
+    return 1
+  fi
+  log "OK: Disk space ${avail}GB"
+}
+
+main() {
+  log "Starting health check..."
+
+  check_service "Horizon API" curl -sf "${HORIZON_URL}/" || return 1
+  check_service "Stellar-Core Port" nc -z localhost "${CORE_PORT}" || return 1
+  check_service "PostgreSQL" pg_isready || return 1
+  check_disk || return 1
+
+  log "Health check PASSED"
+  return 0
+}
+
+main

From 71b1e5a5a5e7d643637ab61ef9f0ed9f29f73137 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 09:35:44 +0700
Subject: [PATCH 15/17] Add healthcheck auto-recovery script

---
 health/health/healthcheck_auto_recover.sh | 75 +++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 health/health/healthcheck_auto_recover.sh

diff --git a/health/health/healthcheck_auto_recover.sh b/health/health/healthcheck_auto_recover.sh
new file mode 100644
index 00000000..fbc443be
--- /dev/null
+++ b/health/health/healthcheck_auto_recover.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# Run the health check once; if it fails, restart the node container with
+# exponential backoff until it is healthy again or MAX_RETRIES is exhausted.
+set -euo pipefail
+
+CHECK_SCRIPT="/opt/stellar/health/healthcheck.sh"
+ALERT_SCRIPT="/opt/stellar/health/alert_manager.sh"
+LOG_FILE="/var/log/pi-node-recover.log"
+
+MAX_RETRIES=5
+BASE_DELAY=10
+
+# Print a timestamped message and append it to LOG_FILE.
+log() {
+  # An unwritable log file must not abort the recovery under `set -e`.
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] [RECOVER] $*" | tee -a "$LOG_FILE" || true
+}
+
+# notify TEXT LEVEL
+# Forward an alert to ALERT_SCRIPT; a failed alert is logged, never fatal.
+notify() {
+  "$ALERT_SCRIPT" "$1" "$2" || log "WARN: alert delivery failed"
+}
+
+# Log and announce a successful recovery.
+report_recovered() {
+  log "Node healthy again"
+  notify "Node recovered successfully" "info"
+}
+
+# Retry check/restart up to MAX_RETRIES times, doubling the wait each time.
+# Returns 0 once the check passes, 1 if the node never recovers.
+attempt_recover() {
+  local retry=0
+  local delay
+  while (( retry < MAX_RETRIES )); do
+    if "$CHECK_SCRIPT"; then
+      report_recovered
+      return 0
+    fi
+
+    delay=$(( BASE_DELAY * (2 ** retry) ))
+    log "Health failed. Restarting services... attempt=$((retry+1)) wait=${delay}s"
+    notify "Node unhealthy. Restart attempt $((retry+1))" "warning"
+
+    # "pi-node" is the container_name in docker-compose.yml (the compose
+    # service is named "mainnet"), so restart it by container name.
+    docker restart pi-node || true
+    sleep "$delay"
+
+    # Not ((retry++)): it returns status 1 while retry is 0, which makes
+    # `set -e` terminate the script after the first attempt.
+    retry=$(( retry + 1 ))
+  done
+
+  # The last restart has not been verified yet; check once more before
+  # declaring the recovery failed.
+  if "$CHECK_SCRIPT"; then
+    report_recovered
+    return 0
+  fi
+
+  log "Max retries reached. Node still unhealthy."
+  notify "CRITICAL: Node recovery failed after $MAX_RETRIES attempts" "critical"
+  return 1
+}
+
+main() {
+  if ! "$CHECK_SCRIPT"; then
+    log "Health check failed → starting recovery"
+    attempt_recover
+  fi
+}
+
+main

From 7104a582549ec19e5ecbc532c578e771365c1a40 Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Fri, 6 Feb 2026 09:36:48 +0700
Subject: [PATCH 16/17] Add alert_manager.sh for health alert management

---
 health/health/health/alert_manager.sh | 44 +++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 health/health/health/alert_manager.sh

diff --git a/health/health/health/alert_manager.sh b/health/health/health/alert_manager.sh
new file mode 100644
index 00000000..a0e61d99
--- /dev/null
+++ b/health/health/health/alert_manager.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Log an alert and, when enabled in the config file, send it to Telegram.
+# Usage: alert_manager.sh TEXT [LEVEL]   (LEVEL defaults to "info")
+set -euo pipefail
+
+CONFIG="/opt/stellar/health/alert_config.env"
+LOG_FILE="/var/log/pi-node-alerts.log"
+
+# This script reads ENABLE_TELEGRAM, TG_BOT_TOKEN and TG_CHAT_ID from CONFIG.
+source "$CONFIG"
+
+# Append a timestamped message to LOG_FILE.
+log() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] [ALERT] $*" >> "$LOG_FILE"
+}
+
+# Post the message in $1 to the configured Telegram chat.
+send_telegram() {
+  local msg="$1"
+  # --data-urlencode keeps '&', '+' and '%' in the values from corrupting the
+  # form body, which plain -d would pass through unescaped.
+  curl -s -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
+    --data-urlencode "chat_id=${TG_CHAT_ID}" \
+    --data-urlencode "text=${msg}" >/dev/null
+}
+
+# send_alert TEXT [LEVEL]: log the alert, then forward it if Telegram is on.
+send_alert() {
+  local text="$1"
+  local level="${2:-info}"
+  local message
+
+  message="[$(hostname)] [$level] $text"
+  log "$message"
+
+  # Default to "false" so a config without ENABLE_TELEGRAM does not trip set -u.
+  if [[ "${ENABLE_TELEGRAM:-false}" == "true" ]]; then
+    send_telegram "$message"
+  fi
+}
+
+if [[ $# -ge 1 ]]; then
+  send_alert "$1" "${2:-info}"
+fi

From ed241aacab901cc1b2f86a2665ab119842c1e9af Mon Sep 17 00:00:00 2001
From: Kapten boneng
Date: Sun, 8 Feb 2026 11:09:50 +0700
Subject: [PATCH 17/17] Create Grafana dashboard for Pi Node monitoring

Add a new Grafana dashboard for monitoring Pi Node metrics.
---
 grafana-dashboard.json | 65 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 grafana-dashboard.json

diff --git a/grafana-dashboard.json b/grafana-dashboard.json
new file mode 100644
index 00000000..4122efa5
--- /dev/null
+++ b/grafana-dashboard.json
@@ -0,0 +1,65 @@
+{
+  "id": null,
+  "uid": "pi-node-enterprise-monitoring",
+  "title": "Pi Node Enterprise Monitoring",
+  "tags": ["pi-network", "node", "enterprise", "monitoring"],
+  "timezone": "browser",
+  "schemaVersion": 38,
+  "version": 1,
+  "refresh": "10s",
+  "panels": [
+    {
+      "type": "stat",
+      "title": "Node Health",
+      "id": 1,
+      "gridPos": { "x": 0, "y": 0, "w": 8, "h": 4 },
+      "targets": [
+        {
+          "expr": "up{job=\"pi-node\"}",
+          "refId": "A"
+        }
+      ],
+      "options": {
+        "reduceOptions": { "calcs": ["last"], "fields": "", "values": false },
+        "orientation": "auto",
+        "colorMode": "background"
+      }
+    },
+    {
+      "type": "timeseries",
+      "title": "CPU Usage",
+      "id": 2,
+      "gridPos": { "x": 8, "y": 0, "w": 8, "h": 6 },
+      "targets": [
+        {
+          "expr": "100 - (avg by (instance)(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Memory Usage",
+      "id": 3,
+      "gridPos": { "x": 16, "y": 0, "w": 8, "h": 6 },
+      "targets": [
+        {
+          "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Disk Usage",
+      "id": 4,
+      "gridPos": { "x": 0, "y": 6, "w": 24, "h": 6 },
+      "targets": [
+        {
+          "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100)",
+          "refId": "A"
+        }
+      ]
+    }
+  ]
+}