From faed59fd9bbed27e4910b13ca694e911f689b80b Mon Sep 17 00:00:00 2001 From: Will-Luck Date: Sun, 8 Feb 2026 20:38:29 +0000 Subject: [PATCH] v2.1.0: Circuit breaker, Prometheus metrics, event-driven architecture Major enhancements to the Go rewrite: - Circuit breaker with exponential backoff and restart budgets to prevent restart storms. Per-container action labels (restart/stop/notify/none). - Prometheus /metrics endpoint with counters, gauges, and histograms for restarts, skips, notifications, and event processing. - Event-driven Docker watcher with auto-reconnect (polling fallback for tests). Debouncing and real-time orchestration tracking. - Notification rate limiting and retry with exponential backoff (3 attempts). - Testability interfaces (docker.API, clock.Clock, notify.Notifier) with mock implementations and 25+ unit tests (60% guardian coverage). - Config validation, WaitGroup goroutine management, defensive guards. - CI: coverage reporting, govulncheck, Trivy scanning, 3 new acceptance tests (opt-out, circuit-breaker, custom-label), failure artifact capture. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/build.yml | 73 +++++++- Dockerfile | 7 + cmd/guardian/main.go | 9 + go.mod | 11 +- go.sum | 21 +++ internal/clock/clock.go | 17 ++ internal/config/config.go | 78 ++++++++- internal/docker/containers.go | 6 + internal/docker/interface.go | 28 +++ internal/docker/watcher.go | 129 ++++++++++++++ internal/guardian/dependency.go | 6 +- internal/guardian/dependency_test.go | 222 ++++++++++++++++++++++++ internal/guardian/guardian.go | 207 ++++++++++++++++++++-- internal/guardian/guardian_test.go | 187 ++++++++++++++++++++ internal/guardian/guards.go | 28 +-- internal/guardian/mock_test.go | 179 +++++++++++++++++++ internal/guardian/tracker.go | 204 ++++++++++++++++++++++ internal/guardian/tracker_test.go | 224 ++++++++++++++++++++++++ internal/guardian/unhealthy.go | 77 ++++++++- internal/guardian/unhealthy_test.go | 243 ++++++++++++++++++++++++++ internal/metrics/prometheus.go | 92 +++++++++- internal/notify/notifier.go | 249 ++++++++++++++++++++++----- tests/test-all.sh | 3 + tests/test-circuit-breaker.sh | 154 +++++++++++++++++ tests/test-custom-label.sh | 135 +++++++++++++++ tests/test-opt-out.sh | 127 ++++++++++++++ 26 files changed, 2635 insertions(+), 81 deletions(-) create mode 100644 internal/clock/clock.go create mode 100644 internal/docker/interface.go create mode 100644 internal/docker/watcher.go create mode 100644 internal/guardian/dependency_test.go create mode 100644 internal/guardian/guardian_test.go create mode 100644 internal/guardian/mock_test.go create mode 100644 internal/guardian/tracker.go create mode 100644 internal/guardian/tracker_test.go create mode 100644 internal/guardian/unhealthy_test.go create mode 100644 tests/test-circuit-breaker.sh create mode 100644 tests/test-custom-label.sh create mode 100644 tests/test-opt-out.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fa2f791..4654383 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -13,6 +13,7 @@ env: jobs: go: runs-on: ubuntu-latest + timeout-minutes: 10 steps: - uses: actions/checkout@v4 @@ -29,12 +30,49 @@ jobs: with: version: latest - - name: Unit tests - run: go test -race -count=1 ./... + - name: Unit tests with coverage + run: go test -race -count=1 -coverprofile=coverage.out ./... + + - name: Upload coverage artifact + uses: actions/upload-artifact@v4 + with: + name: coverage + path: coverage.out + + - name: Display coverage summary + run: go tool cover -func=coverage.out | tail -1 + + security: + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.24' + + - name: Run govulncheck + run: | + go install golang.org/x/vuln/cmd/govulncheck@latest + govulncheck ./... + + - name: Build image for scanning + run: docker build -t docker-guardian:scan . + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: docker-guardian:scan + format: table + exit-code: 0 + severity: CRITICAL,HIGH test: needs: go runs-on: ubuntu-latest + timeout-minutes: 15 steps: - uses: actions/checkout@v4 @@ -59,8 +97,37 @@ jobs: - name: Run notifications test run: GUARDIAN_IMAGE=docker-guardian bash tests/test-notifications.sh + - name: Run opt-out test + run: GUARDIAN_IMAGE=docker-guardian bash tests/test-opt-out.sh + + - name: Run circuit breaker test + run: GUARDIAN_IMAGE=docker-guardian bash tests/test-circuit-breaker.sh + + - name: Run custom label test + run: GUARDIAN_IMAGE=docker-guardian bash tests/test-custom-label.sh + + - name: Capture docker logs on failure + if: failure() + run: | + echo "=== Docker containers ===" + docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" || true + echo "" + for c in $(docker ps -a --filter "name=dg-test" --format "{{.Names}}"); do + echo "=== Logs: $c ===" + docker logs "$c" 2>&1 | tail -50 + echo "" + done + + - name: Upload failure logs + if: failure() + uses: actions/upload-artifact@v4 + with: + name: test-failure-logs + path: /tmp/dg-test-logs/ + if-no-files-found: ignore + build: - needs: test + needs: [test, security] runs-on: ubuntu-latest if: github.event_name == 'push' permissions: diff --git a/Dockerfile b/Dockerfile index 90fc525..a689d80 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,6 +26,12 @@ ENV AUTOHEAL_CONTAINER_LABEL=autoheal \ AUTOHEAL_WATCHTOWER_COOLDOWN=300 \ AUTOHEAL_WATCHTOWER_SCOPE=all \ AUTOHEAL_WATCHTOWER_EVENTS=orchestration \ + AUTOHEAL_BACKOFF_MULTIPLIER=2 \ + AUTOHEAL_BACKOFF_MAX=300 \ + AUTOHEAL_BACKOFF_RESET_AFTER=600 \ + AUTOHEAL_RESTART_BUDGET=5 \ + AUTOHEAL_RESTART_WINDOW=300 \ + METRICS_PORT=0 \ DOCKER_SOCK=/var/run/docker.sock \ CURL_TIMEOUT=30 \ WEBHOOK_URL="" \ @@ -33,6 +39,7 @@ ENV AUTOHEAL_CONTAINER_LABEL=autoheal \ APPRISE_URL="" \ POST_RESTART_SCRIPT="" \ NOTIFY_EVENTS="actions" \ + NOTIFY_RATE_LIMIT=60 \ NOTIFY_GOTIFY_URL="" \ NOTIFY_GOTIFY_TOKEN="" \ NOTIFY_DISCORD_WEBHOOK="" \ diff --git a/cmd/guardian/main.go b/cmd/guardian/main.go index cc99773..e059316 100644 --- a/cmd/guardian/main.go +++ b/cmd/guardian/main.go @@ -13,6 +13,7 @@ import ( "github.com/Will-Luck/Docker-Guardian/internal/docker" "github.com/Will-Luck/Docker-Guardian/internal/guardian" "github.com/Will-Luck/Docker-Guardian/internal/logging" + "github.com/Will-Luck/Docker-Guardian/internal/metrics" "github.com/Will-Luck/Docker-Guardian/internal/notify" ) @@ -24,6 +25,10 @@ func main() { } cfg := config.Load() + if err := cfg.Validate(); err != nil { + fmt.Fprintf(os.Stderr, "configuration error: %v\n", err) + os.Exit(1) + } log := logging.New(cfg.LogJSON) // Banner: plain stdout for acceptance test compatibility @@ -48,6 +53,8 @@ func main() { resolved := cfg.ResolvedNotifyEvents() fmt.Printf("NOTIFY_EVENTS=%s (resolved: %s)\n", cfg.NotifyEvents, strings.Join(resolved, ",")) + metrics.Serve(cfg.MetricsPort) + g := guardian.New(cfg, client, dispatcher, log) if cfg.StartPeriod > 0 { @@ -66,4 +73,6 @@ func main() { log.Error("guardian exited with error", "error", err) os.Exit(1) } + + dispatcher.Close() } diff --git a/go.mod b/go.mod index 649b185..3a86f12 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,8 @@ require ( require ( github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/distribution/reference v0.6.0 // indirect @@ -20,12 +22,19 @@ require ( github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/metric v1.35.0 // indirect go.opentelemetry.io/otel/trace v1.35.0 // indirect - golang.org/x/sys v0.33.0 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect + golang.org/x/sys v0.35.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect ) diff --git a/go.sum b/go.sum index 5abb1bc..a1d9d94 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,9 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= @@ -29,12 +33,22 @@ github.com/moby/moby/api v1.53.0 h1:PihqG1ncw4W+8mZs69jlwGXdaYBeb5brF6BL7mPIS/w= github.com/moby/moby/api v1.53.0/go.mod h1:8mb+ReTlisw4pS6BRzCMts5M49W5M7bKt1cJy/YbAqc= github.com/moby/moby/client v0.2.2 h1:Pt4hRMCAIlyjL3cr8M5TrXCwKzguebPAc2do2ur7dEM= github.com/moby/moby/client v0.2.2/go.mod h1:2EkIPVNCqR05CMIzL1mfA07t0HvVUUOl85pasRz/GmQ= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= @@ -51,8 +65,15 @@ go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5J go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q= diff --git a/internal/clock/clock.go b/internal/clock/clock.go new file mode 100644 index 0000000..292687f --- /dev/null +++ b/internal/clock/clock.go @@ -0,0 +1,17 @@ +package clock + +import "time" + +// Clock abstracts time operations for testability. +type Clock interface { + Now() time.Time + After(d time.Duration) <-chan time.Time + Since(t time.Time) time.Duration +} + +// Real uses the standard library time functions. +type Real struct{} + +func (Real) Now() time.Time { return time.Now() } +func (Real) After(d time.Duration) <-chan time.Time { return time.After(d) } +func (Real) Since(t time.Time) time.Duration { return time.Since(t) } diff --git a/internal/config/config.go b/internal/config/config.go index 98bcdce..1bf5262 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -1,7 +1,9 @@ package config import ( + "errors" "fmt" + "net/url" "os" "strconv" "strings" @@ -31,11 +33,19 @@ type Config struct { WatchtowerScope string // "all" or "affected" WatchtowerEvents string // "orchestration" or "all" + // Circuit breaker / backoff + BackoffMultiplier float64 + BackoffMax int // seconds + BackoffResetAfter int // seconds + RestartBudget int + RestartWindow int // seconds + // Post-restart script PostRestartScript string // Notification events - NotifyEvents string + NotifyEvents string + NotifyRateLimit int // seconds (0 = unlimited) // Notification services WebhookURL string @@ -63,6 +73,9 @@ type Config struct { EmailUser string EmailPass string + // Metrics + MetricsPort int + // Logging LogJSON bool } @@ -89,8 +102,15 @@ func Load() *Config { WatchtowerScope: envStr("AUTOHEAL_WATCHTOWER_SCOPE", "all"), WatchtowerEvents: envStr("AUTOHEAL_WATCHTOWER_EVENTS", "orchestration"), + BackoffMultiplier: envFloat("AUTOHEAL_BACKOFF_MULTIPLIER", 2), + BackoffMax: envInt("AUTOHEAL_BACKOFF_MAX", 300), + BackoffResetAfter: envInt("AUTOHEAL_BACKOFF_RESET_AFTER", 600), + RestartBudget: envInt("AUTOHEAL_RESTART_BUDGET", 5), + RestartWindow: envInt("AUTOHEAL_RESTART_WINDOW", 300), + PostRestartScript: envStr("POST_RESTART_SCRIPT", ""), NotifyEvents: envStr("NOTIFY_EVENTS", "actions"), + NotifyRateLimit: envInt("NOTIFY_RATE_LIMIT", 60), WebhookURL: envStr("WEBHOOK_URL", ""), WebhookJSONKey: envStr("WEBHOOK_JSON_KEY", "text"), @@ -117,6 +137,8 @@ func Load() *Config { EmailUser: envStr("NOTIFY_EMAIL_USER", ""), EmailPass: envStr("NOTIFY_EMAIL_PASS", ""), + MetricsPort: envInt("METRICS_PORT", 0), + LogJSON: envBool("LOG_JSON", false), } } @@ -137,6 +159,11 @@ func (c *Config) PrintBanner() { fmt.Println("AUTOHEAL_WATCHTOWER_COOLDOWN=" + strconv.Itoa(c.WatchtowerCooldown)) fmt.Println("AUTOHEAL_WATCHTOWER_SCOPE=" + c.WatchtowerScope) fmt.Println("AUTOHEAL_WATCHTOWER_EVENTS=" + c.WatchtowerEvents) + fmt.Printf("AUTOHEAL_BACKOFF_MULTIPLIER=%g\n", c.BackoffMultiplier) + fmt.Println("AUTOHEAL_BACKOFF_MAX=" + strconv.Itoa(c.BackoffMax)) + fmt.Println("AUTOHEAL_BACKOFF_RESET_AFTER=" + strconv.Itoa(c.BackoffResetAfter)) + fmt.Println("AUTOHEAL_RESTART_BUDGET=" + strconv.Itoa(c.RestartBudget)) + fmt.Println("AUTOHEAL_RESTART_WINDOW=" + strconv.Itoa(c.RestartWindow)) } // ResolvedNotifyEvents returns the normalised event categories. @@ -170,6 +197,43 @@ func (c *Config) ResolvedNotifyEvents() []string { return result } +// Validate checks configuration for invalid or dangerous values. +func (c *Config) Validate() error { + var errs []error + if c.Interval <= 0 { + errs = append(errs, fmt.Errorf("AUTOHEAL_INTERVAL must be > 0, got %d", c.Interval)) + } + if c.GracePeriod < 0 { + errs = append(errs, fmt.Errorf("AUTOHEAL_GRACE_PERIOD must be >= 0, got %d", c.GracePeriod)) + } + if c.DefaultStopTimeout < 0 { + errs = append(errs, fmt.Errorf("AUTOHEAL_DEFAULT_STOP_TIMEOUT must be >= 0, got %d", c.DefaultStopTimeout)) + } + if c.WatchtowerScope != "all" && c.WatchtowerScope != "affected" { + errs = append(errs, fmt.Errorf("AUTOHEAL_WATCHTOWER_SCOPE must be \"all\" or \"affected\", got %q", c.WatchtowerScope)) + } + if c.WatchtowerEvents != "orchestration" && c.WatchtowerEvents != "all" { + errs = append(errs, fmt.Errorf("AUTOHEAL_WATCHTOWER_EVENTS must be \"orchestration\" or \"all\", got %q", c.WatchtowerEvents)) + } + for _, u := range []struct { + name, val string + }{ + {"WEBHOOK_URL", c.WebhookURL}, + {"APPRISE_URL", c.AppriseURL}, + {"NOTIFY_GOTIFY_URL", c.GotifyURL}, + {"NOTIFY_DISCORD_WEBHOOK", c.DiscordWebhook}, + {"NOTIFY_SLACK_WEBHOOK", c.SlackWebhook}, + {"NOTIFY_LUNASEA_WEBHOOK", c.LunaSeaWebhook}, + } { + if u.val != "" { + if _, err := url.Parse(u.val); err != nil { + errs = append(errs, fmt.Errorf("%s is not a valid URL: %w", u.name, err)) + } + } + } + return errors.Join(errs...) +} + func envStr(key, def string) string { if v := os.Getenv(key); v != "" { return v @@ -189,6 +253,18 @@ func envInt(key string, def int) int { return n } +func envFloat(key string, def float64) float64 { + v := os.Getenv(key) + if v == "" { + return def + } + f, err := strconv.ParseFloat(v, 64) + if err != nil { + return def + } + return f +} + func envBool(key string, def bool) bool { v := os.Getenv(key) if v == "" { diff --git a/internal/docker/containers.go b/internal/docker/containers.go index b0c78f3..0017bfe 100644 --- a/internal/docker/containers.go +++ b/internal/docker/containers.go @@ -73,6 +73,12 @@ func (c *Client) StartContainer(ctx context.Context, id string) error { return err } +// StopContainer stops a running container with the given timeout. +func (c *Client) StopContainer(ctx context.Context, id string, timeout int) error { + _, err := c.api.ContainerStop(ctx, id, client.ContainerStopOptions{Timeout: &timeout}) + return err +} + // ContainerStatus returns the current status string of a container. func (c *Client) ContainerStatus(ctx context.Context, id string) (string, error) { info, err := c.api.ContainerInspect(ctx, id, client.ContainerInspectOptions{}) diff --git a/internal/docker/interface.go b/internal/docker/interface.go new file mode 100644 index 0000000..731a735 --- /dev/null +++ b/internal/docker/interface.go @@ -0,0 +1,28 @@ +package docker + +import ( + "context" + "time" + + "github.com/moby/moby/api/types/container" + "github.com/moby/moby/api/types/events" +) + +// API defines the subset of Docker operations used by Guardian. +// Implemented by Client for production, and by mocks for testing. +type API interface { + UnhealthyContainers(ctx context.Context, label string, onlyRunning bool) ([]container.Summary, error) + ExitedContainers(ctx context.Context) ([]container.Summary, error) + RunningContainers(ctx context.Context) ([]container.Summary, error) + InspectContainer(ctx context.Context, id string) (container.InspectResponse, error) + RestartContainer(ctx context.Context, id string, timeout int) error + StartContainer(ctx context.Context, id string) error + StopContainer(ctx context.Context, id string, timeout int) error + ContainerStatus(ctx context.Context, id string) (string, error) + ContainerFinishedAt(ctx context.Context, id string) (time.Time, error) + ContainerEvents(ctx context.Context, since, until time.Time, orchestrationOnly bool) ([]events.Message, error) + Close() error +} + +// Verify Client implements API at compile time. +var _ API = (*Client)(nil) diff --git a/internal/docker/watcher.go b/internal/docker/watcher.go new file mode 100644 index 0000000..60fcecd --- /dev/null +++ b/internal/docker/watcher.go @@ -0,0 +1,129 @@ +package docker + +import ( + "context" + "time" + + "github.com/moby/moby/api/types/events" + "github.com/moby/moby/client" +) + +// ContainerEvent represents a processed Docker container event. +type ContainerEvent struct { + ContainerID string + ContainerName string + Action string // "health_status", "die", "start", "destroy", "create" + HealthStatus string // "unhealthy", "healthy" (only for health_status events) + Timestamp time.Time +} + +// WatcherAPI defines the interface for watching Docker events. +type WatcherAPI interface { + Watch(ctx context.Context) <-chan ContainerEvent +} + +// Watcher subscribes to the Docker event stream and emits ContainerEvents. +type Watcher struct { + api *client.Client + reconnectMax time.Duration + livenessWindow time.Duration +} + +// NewWatcher creates a Watcher connected to the Docker event stream. +func NewWatcher(c *Client) *Watcher { + return &Watcher{ + api: c.api, + reconnectMax: 30 * time.Second, + livenessWindow: 60 * time.Second, + } +} + +// Watch starts watching Docker events. It reconnects automatically on disconnect. +// Returns a channel of ContainerEvents. The channel is closed when ctx is cancelled. +func (w *Watcher) Watch(ctx context.Context) <-chan ContainerEvent { + ch := make(chan ContainerEvent, 64) + + go func() { + defer close(ch) + backoff := time.Second + + for { + if ctx.Err() != nil { + return + } + + w.streamEvents(ctx, ch) + + // If we get here, stream disconnected. Reconnect with backoff. + if ctx.Err() != nil { + return + } + + select { + case <-time.After(backoff): + case <-ctx.Done(): + return + } + + // Exponential backoff: 1s → 2s → 4s → 8s → ... → 30s max + backoff *= 2 + if backoff > w.reconnectMax { + backoff = w.reconnectMax + } + } + }() + + return ch +} + +func (w *Watcher) streamEvents(ctx context.Context, ch chan<- ContainerEvent) { + opts := client.EventsListOptions{ + Filters: make(client.Filters). + Add("type", "container"). + Add("event", "health_status", "die", "start", "destroy", "create"), + } + + result := w.api.Events(ctx, opts) + + // Reset backoff on successful connection + for { + select { + case msg, ok := <-result.Messages: + if !ok { + return // stream closed + } + evt := parseEvent(msg) + if evt != nil { + select { + case ch <- *evt: + case <-ctx.Done(): + return + } + } + case <-result.Err: + return // stream error — caller will reconnect + case <-ctx.Done(): + return + } + } +} + +func parseEvent(msg events.Message) *ContainerEvent { + evt := &ContainerEvent{ + ContainerID: msg.Actor.ID, + ContainerName: msg.Actor.Attributes["name"], + Action: string(msg.Action), + Timestamp: time.Unix(msg.Time, msg.TimeNano%1e9), + } + + // Docker sends health_status events as "health_status: unhealthy" or "health_status: healthy" + if action := string(msg.Action); len(action) > 15 && action[:14] == "health_status:" { + evt.Action = "health_status" + evt.HealthStatus = action[15:] // skip "health_status: " + } else if action == "health_status" { + // Some Docker versions use attributes + evt.HealthStatus = msg.Actor.Attributes["health_status"] + } + + return evt +} diff --git a/internal/guardian/dependency.go b/internal/guardian/dependency.go index f952385..cbd824b 100644 --- a/internal/guardian/dependency.go +++ b/internal/guardian/dependency.go @@ -46,7 +46,7 @@ func (g *Guardian) checkDependencyOrphans(ctx context.Context) { continue } - now := time.Now().Format("02-01-2006 15:04:05") + now := g.clock.Now().Format("02-01-2006 15:04:05") fmt.Printf("%s Container %s (%s) exited (code %d, orphaned dependent) - parent %s is running\n", now, name, shortID, exitCode, parentID[:12]) @@ -77,10 +77,10 @@ func (g *Guardian) checkDependencyOrphans(ctx context.Context) { fmt.Printf("%s Starting orphaned dependent %s (%s)...\n", now, name, shortID) if err := g.docker.StartContainer(ctx, c.ID); err != nil { g.log.Error("failed to start container", "container", name, "id", shortID, "error", err) - g.dispatcher.Action(fmt.Sprintf("Container %s (%s) orphaned (parent running). Failed to start!", name, shortID)) + g.notifier.Action(fmt.Sprintf("Container %s (%s) orphaned (parent running). Failed to start!", name, shortID)) } else { fmt.Printf("%s Successfully started %s (%s)\n", now, name, shortID) - g.dispatcher.Action(fmt.Sprintf("Container %s (%s) orphaned (parent running). Successfully started!", name, shortID)) + g.notifier.Action(fmt.Sprintf("Container %s (%s) orphaned (parent running). Successfully started!", name, shortID)) } g.runPostRestartScript(name, shortID, "orphaned", 0) diff --git a/internal/guardian/dependency_test.go b/internal/guardian/dependency_test.go new file mode 100644 index 0000000..e6fc513 --- /dev/null +++ b/internal/guardian/dependency_test.go @@ -0,0 +1,222 @@ +package guardian + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/Will-Luck/Docker-Guardian/internal/config" + "github.com/Will-Luck/Docker-Guardian/internal/logging" + "github.com/moby/moby/api/types/container" +) + +func TestCheckDependencyOrphans_StartsOrphan(t *testing.T) { + cfg := &config.Config{ + MonitorDependencies: true, + DependencyStartDelay: 0, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + parentID := "parent1234567890abcdef" + + dock.exitedContainers = []container.Summary{ + {ID: "orphan01234567890abcdef"}, + } + dock.inspectResults["orphan01234567890abcdef"] = container.InspectResponse{ + Name: "/orphan-app", + HostConfig: &container.HostConfig{ + NetworkMode: container.NetworkMode("container:" + parentID), + }, + Config: &container.Config{ + Labels: map[string]string{}, + }, + State: &container.State{ + ExitCode: 1, + }, + } + dock.statusResults[parentID] = "running" + dock.statusResults["orphan01234567890abcdef"] = "exited" + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkDependencyOrphans(context.Background()) + + if len(dock.startCalls) != 1 { + t.Fatalf("expected 1 start call, got %d", len(dock.startCalls)) + } + if dock.startCalls[0] != "orphan01234567890abcdef" { + t.Errorf("started wrong container: %s", dock.startCalls[0]) + } + if len(notif.actions) != 1 || !strings.Contains(notif.actions[0], "Successfully started") { + t.Errorf("expected success notification, got %v", notif.actions) + } +} + +func TestCheckDependencyOrphans_SkipsNonDependents(t *testing.T) { + cfg := &config.Config{ + MonitorDependencies: true, + DependencyStartDelay: 0, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + dock.exitedContainers = []container.Summary{ + {ID: "standalone1234567890ab"}, + } + dock.inspectResults["standalone1234567890ab"] = container.InspectResponse{ + Name: "/standalone", + HostConfig: &container.HostConfig{ + NetworkMode: "bridge", + }, + Config: &container.Config{}, + State: &container.State{}, + } + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkDependencyOrphans(context.Background()) + + if len(dock.startCalls) != 0 { + t.Error("should not start non-dependent container") + } +} + +func TestCheckDependencyOrphans_SkipsWhenParentNotRunning(t *testing.T) { + cfg := &config.Config{ + MonitorDependencies: true, + DependencyStartDelay: 0, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + parentID := "parent1234567890abcdef" + + dock.exitedContainers = []container.Summary{ + {ID: "orphan01234567890abcdef"}, + } + dock.inspectResults["orphan01234567890abcdef"] = container.InspectResponse{ + Name: "/orphan-app", + HostConfig: &container.HostConfig{ + NetworkMode: container.NetworkMode("container:" + parentID), + }, + Config: &container.Config{}, + State: &container.State{}, + } + dock.statusResults[parentID] = "exited" // Parent not running + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkDependencyOrphans(context.Background()) + + if len(dock.startCalls) != 0 { + t.Error("should not start orphan when parent is not running") + } +} + +func TestCheckDependencyOrphans_Disabled(t *testing.T) { + cfg := &config.Config{ + MonitorDependencies: false, + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkDependencyOrphans(context.Background()) + + if len(dock.startCalls) != 0 { + t.Error("should not make any calls when dependencies disabled") + } +} + +func TestCheckDependencyOrphans_SkipsAutoRecovered(t *testing.T) { + cfg := &config.Config{ + MonitorDependencies: true, + DependencyStartDelay: 0, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + parentID := "parent1234567890abcdef" + + dock.exitedContainers = []container.Summary{ + {ID: "orphan01234567890abcdef"}, + } + dock.inspectResults["orphan01234567890abcdef"] = container.InspectResponse{ + Name: "/orphan-app", + HostConfig: &container.HostConfig{ + NetworkMode: container.NetworkMode("container:" + parentID), + }, + Config: &container.Config{ + Labels: map[string]string{}, + }, + State: &container.State{ + ExitCode: 0, + }, + } + dock.statusResults[parentID] = "running" + // Container has auto-recovered by the time we re-check + dock.statusResults["orphan01234567890abcdef"] = "running" + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkDependencyOrphans(context.Background()) + + if len(dock.startCalls) != 0 { + t.Error("should not start auto-recovered container") + } +} diff --git a/internal/guardian/guardian.go b/internal/guardian/guardian.go index 3d0e6a5..0e74e97 100644 --- a/internal/guardian/guardian.go +++ b/internal/guardian/guardian.go @@ -2,8 +2,10 @@ package guardian import ( "context" + "sync" "time" + "github.com/Will-Luck/Docker-Guardian/internal/clock" "github.com/Will-Luck/Docker-Guardian/internal/config" "github.com/Will-Luck/Docker-Guardian/internal/docker" "github.com/Will-Luck/Docker-Guardian/internal/logging" @@ -11,32 +13,95 @@ import ( "github.com/moby/moby/api/types/events" ) -// Guardian orchestrates the main monitoring loop. +// Guardian orchestrates container health monitoring. type Guardian struct { - cfg *config.Config - docker *docker.Client - dispatcher *notify.Dispatcher - log *logging.Logger - cycle int + cfg *config.Config + docker docker.API + notifier notify.Notifier + log *logging.Logger + clock clock.Clock - // Per-cycle caches (reset at start of each cycle) + // Circuit breaker + tracker *RestartTracker + + // Event debouncing + debounceMu sync.Mutex + debounceTimers map[string]*time.Timer + + // Orchestration tracking (event-driven replacement for per-cycle cache) + orchestrationMu sync.Mutex + orchestrationEvents map[string]time.Time // container name → latest event time + + // Per-cycle caches (used during full scans) orchestratorEvents []events.Message orchestratorCached bool backupRunning *bool + cycle int } // New creates a Guardian instance. -func New(cfg *config.Config, client *docker.Client, dispatcher *notify.Dispatcher, log *logging.Logger) *Guardian { +func New(cfg *config.Config, client docker.API, notifier notify.Notifier, log *logging.Logger) *Guardian { + clk := clock.Real{} + tcfg := TrackerConfig{ + BackoffMultiplier: cfg.BackoffMultiplier, + BackoffMax: time.Duration(cfg.BackoffMax) * time.Second, + BackoffResetAfter: time.Duration(cfg.BackoffResetAfter) * time.Second, + RestartBudget: cfg.RestartBudget, + RestartWindow: time.Duration(cfg.RestartWindow) * time.Second, + } return &Guardian{ - cfg: cfg, - docker: client, - dispatcher: dispatcher, - log: log, + cfg: cfg, + docker: client, + notifier: notifier, + log: log, + clock: clk, + tracker: NewRestartTracker(tcfg, clk), + debounceTimers: make(map[string]*time.Timer), + orchestrationEvents: make(map[string]time.Time), } } -// Run starts the main monitoring loop, returning when the context is cancelled. +// NewWithClock creates a Guardian with a custom clock (for testing). +func NewWithClock(cfg *config.Config, client docker.API, notifier notify.Notifier, log *logging.Logger, clk clock.Clock) *Guardian { + g := New(cfg, client, notifier, log) + g.clock = clk + g.tracker.clock = clk + return g +} + +// Run starts the event-driven monitoring loop. +// If a Watcher is available (via docker.Client), it uses the event stream. +// Otherwise, it falls back to the polling loop for compatibility. func (g *Guardian) Run(ctx context.Context) error { + // Check if we can get a watcher + if client, ok := g.docker.(*docker.Client); ok { + return g.runEventDriven(ctx, client) + } + // Fallback to polling (for tests with mock docker) + return g.runPolling(ctx) +} + +func (g *Guardian) runEventDriven(ctx context.Context, client *docker.Client) error { + watcher := docker.NewWatcher(client) + eventCh := watcher.Watch(ctx) + + // Initial full scan on startup + g.fullScan(ctx) + + for { + select { + case evt, ok := <-eventCh: + if !ok { + return nil // watcher closed (context cancelled) + } + g.handleEvent(ctx, evt) + case <-ctx.Done(): + return nil + } + } +} + +func (g *Guardian) runPolling(ctx context.Context) error { for { g.cycle++ g.orchestratorCached = false @@ -52,3 +117,119 @@ func (g *Guardian) Run(ctx context.Context) error { } } } + +// fullScan does a complete check of all containers. +// Called on startup and after event stream reconnection. +func (g *Guardian) fullScan(ctx context.Context) { + g.cycle++ + g.orchestratorCached = false + g.backupRunning = nil + + g.checkUnhealthy(ctx) + g.checkDependencyOrphans(ctx) +} + +// handleEvent processes a single Docker event with debouncing. +func (g *Guardian) handleEvent(ctx context.Context, evt docker.ContainerEvent) { + switch evt.Action { + case "health_status": + if evt.HealthStatus == "unhealthy" { + g.debounce(ctx, evt.ContainerID, func() { + g.checkContainerByID(ctx, evt.ContainerID) + }) + } else if evt.HealthStatus == "healthy" { + g.tracker.Reset(evt.ContainerID) + } + + case "die": + g.debounce(ctx, "dep:"+evt.ContainerID, func() { + g.checkOrphanedDependents(ctx, evt.ContainerID) + }) + + case "create", "destroy": + g.recordOrchestrationActivity(evt) + + case "start": + // No action needed — tracked for potential future use + } +} + +// debounce ensures only one action per container within the debounce window. +func (g *Guardian) debounce(ctx context.Context, key string, fn func()) { + debounceWindow := time.Duration(g.cfg.Interval) * time.Second + if debounceWindow <= 0 { + debounceWindow = 5 * time.Second + } + + g.debounceMu.Lock() + if timer, ok := g.debounceTimers[key]; ok { + timer.Stop() + } + g.debounceTimers[key] = time.AfterFunc(debounceWindow, func() { + if ctx.Err() == nil { + fn() + } + g.debounceMu.Lock() + delete(g.debounceTimers, key) + g.debounceMu.Unlock() + }) + g.debounceMu.Unlock() +} + +// checkContainerByID inspects and potentially restarts a single container. +func (g *Guardian) checkContainerByID(ctx context.Context, containerID string) { + // Use the regular unhealthy check — it re-queries and filters + g.backupRunning = nil + g.orchestratorCached = false + g.checkUnhealthy(ctx) +} + +// checkOrphanedDependents checks if any dependents of the given container need starting. +func (g *Guardian) checkOrphanedDependents(ctx context.Context, _ string) { + g.backupRunning = nil + g.orchestratorCached = false + g.checkDependencyOrphans(ctx) +} + +// recordOrchestrationActivity records a create/destroy event for orchestration tracking. +func (g *Guardian) recordOrchestrationActivity(evt docker.ContainerEvent) { + g.orchestrationMu.Lock() + g.orchestrationEvents[evt.ContainerName] = evt.Timestamp + g.orchestrationMu.Unlock() + + // Clean up old entries + go g.pruneOrchestrationEvents() +} + +func (g *Guardian) pruneOrchestrationEvents() { + cutoff := g.clock.Now().Add(-time.Duration(g.cfg.WatchtowerCooldown) * time.Second) + g.orchestrationMu.Lock() + for name, ts := range g.orchestrationEvents { + if ts.Before(cutoff) { + delete(g.orchestrationEvents, name) + } + } + g.orchestrationMu.Unlock() +} + +// EventStreamConnected returns whether we're using event-driven mode. +// Used by metrics. +func (g *Guardian) EventStreamConnected() bool { + _, ok := g.docker.(*docker.Client) + return ok +} + +// Tracker returns the restart tracker (for metrics). +func (g *Guardian) Tracker() *RestartTracker { + return g.tracker +} + +// UnhealthyCount returns the count from the last check (for metrics). +// This is a simple accessor — the real metric instrumentation happens in Phase 5. +func (g *Guardian) UnhealthyCount(ctx context.Context) int { + containers, err := g.docker.UnhealthyContainers(ctx, g.cfg.ContainerLabel, g.cfg.OnlyMonitorRunning) + if err != nil { + return 0 + } + return len(containers) +} diff --git a/internal/guardian/guardian_test.go b/internal/guardian/guardian_test.go new file mode 100644 index 0000000..ba62aaa --- /dev/null +++ b/internal/guardian/guardian_test.go @@ -0,0 +1,187 @@ +package guardian + +import ( + "context" + "testing" + "time" + + "github.com/Will-Luck/Docker-Guardian/internal/config" + "github.com/Will-Luck/Docker-Guardian/internal/logging" + "github.com/moby/moby/api/types/container" + "github.com/moby/moby/api/types/events" +) + +func newTestGuardian(cfg *config.Config, dock *mockDocker, notif *mockNotifier, clk *mockClock) *Guardian { + return &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } +} + +func TestShouldSkip_NoGuards(t *testing.T) { + cfg := &config.Config{ + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + g := newTestGuardian(cfg, dock, notif, clk) + if g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) { + t.Error("should not skip when no guards are configured") + } +} + +func TestShouldSkip_GracePeriod(t *testing.T) { + now := time.Date(2025, 1, 1, 12, 0, 0, 0, time.UTC) + clk := newMockClock(now) + + cfg := &config.Config{ + GracePeriod: 60, + WatchtowerCooldown: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + + // Container stopped 30s ago — within grace period + dock.finishedAtResults["abcdef123456"] = now.Add(-30 * time.Second) + + g := newTestGuardian(cfg, dock, notif, clk) + if !g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) { + t.Error("should skip container within grace period") + } + if len(notif.skips) != 1 { + t.Errorf("expected 1 skip notification, got %d", len(notif.skips)) + } +} + +func TestShouldSkip_GracePeriodExpired(t *testing.T) { + now := time.Date(2025, 1, 1, 12, 0, 0, 0, time.UTC) + clk := newMockClock(now) + + cfg := &config.Config{ + GracePeriod: 60, + WatchtowerCooldown: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + + // Container stopped 90s ago — outside grace period + dock.finishedAtResults["abcdef123456"] = now.Add(-90 * time.Second) + + g := newTestGuardian(cfg, dock, notif, clk) + if g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) { + t.Error("should not skip container outside grace period") + } +} + +func TestShouldSkip_OrchestrationAll(t *testing.T) { + cfg := &config.Config{ + WatchtowerCooldown: 300, + WatchtowerScope: "all", + WatchtowerEvents: "orchestration", + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + dock.containerEvents = []events.Message{{Action: "create"}} + + g := newTestGuardian(cfg, dock, notif, clk) + if !g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) { + t.Error("should skip when orchestration events exist with scope=all") + } +} + +func TestShouldSkip_OrchestrationAffected(t *testing.T) { + cfg := &config.Config{ + WatchtowerCooldown: 300, + WatchtowerScope: "affected", + WatchtowerEvents: "orchestration", + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + // Event for a different container + dock.containerEvents = []events.Message{ + {Action: "create", Actor: events.Actor{Attributes: map[string]string{"name": "other-container"}}}, + } + + g := newTestGuardian(cfg, dock, notif, clk) + + // test-container should NOT be skipped — it's not in the events + if g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) { + t.Error("should not skip unaffected container with scope=affected") + } + + // Reset cache for next check + g.orchestratorCached = false + + // Now check the affected container + if !g.shouldSkip(context.Background(), "bbbbbb123456", "other-container", nil) { + t.Error("should skip affected container with scope=affected") + } +} + +func TestShouldSkip_BackupRunning(t *testing.T) { + cfg := &config.Config{ + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "docker-volume-backup.stop-during-backup", + BackupContainer: "my-backup", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + // Backup container running + dock.runningContainers = []container.Summary{ + {Names: []string{"/my-backup"}}, + } + + labels := map[string]string{ + "docker-volume-backup.stop-during-backup": "true", + } + + g := newTestGuardian(cfg, dock, notif, clk) + if !g.shouldSkip(context.Background(), "abcdef123456", "test-container", labels) { + t.Error("should skip backup-managed container while backup is running") + } +} + +func TestShouldSkip_BackupNotRunning(t *testing.T) { + cfg := &config.Config{ + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "docker-volume-backup.stop-during-backup", + BackupContainer: "my-backup", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + // No backup container running + dock.runningContainers = []container.Summary{} + + labels := map[string]string{ + "docker-volume-backup.stop-during-backup": "true", + } + + g := newTestGuardian(cfg, dock, notif, clk) + if g.shouldSkip(context.Background(), "abcdef123456", "test-container", labels) { + t.Error("should not skip when backup is not running") + } +} diff --git a/internal/guardian/guards.go b/internal/guardian/guards.go index e397479..9fa2b25 100644 --- a/internal/guardian/guards.go +++ b/internal/guardian/guards.go @@ -6,6 +6,8 @@ import ( "os/exec" "strings" "time" + + "github.com/Will-Luck/Docker-Guardian/internal/metrics" ) // shouldSkip returns true if this container should be skipped due to @@ -20,18 +22,20 @@ func (g *Guardian) shouldSkip(ctx context.Context, containerID, containerName st if g.cfg.WatchtowerScope == "affected" { if g.isContainerInOrchestration(cleanName) { - now := time.Now().Format("02-01-2006 15:04:05") + now := g.clock.Now().Format("02-01-2006 15:04:05") fmt.Printf("%s Container %s (%s) affected by orchestration activity within %ds - skipping\n", now, cleanName, shortID, g.cfg.WatchtowerCooldown) - g.dispatcher.Skip(fmt.Sprintf("Container %s (%s) skipped - orchestration activity", cleanName, shortID)) + g.notifier.Skip(fmt.Sprintf("Container %s (%s) skipped - orchestration activity", cleanName, shortID)) + metrics.SkipsTotal.WithLabelValues(cleanName, "orchestration").Inc() return true } } else { if g.isOrchestratorActive() { - now := time.Now().Format("02-01-2006 15:04:05") + now := g.clock.Now().Format("02-01-2006 15:04:05") fmt.Printf("%s Container %s (%s) skipped - orchestration activity detected within %ds\n", now, cleanName, shortID, g.cfg.WatchtowerCooldown) - g.dispatcher.Skip(fmt.Sprintf("Container %s (%s) skipped - orchestration activity", cleanName, shortID)) + g.notifier.Skip(fmt.Sprintf("Container %s (%s) skipped - orchestration activity", cleanName, shortID)) + metrics.SkipsTotal.WithLabelValues(cleanName, "orchestration").Inc() return true } } @@ -41,12 +45,13 @@ func (g *Guardian) shouldSkip(ctx context.Context, containerID, containerName st if g.cfg.GracePeriod > 0 { finishedAt, err := g.docker.ContainerFinishedAt(ctx, containerID) if err == nil { - age := time.Since(finishedAt) + age := g.clock.Since(finishedAt) if age < time.Duration(g.cfg.GracePeriod)*time.Second { - now := time.Now().Format("02-01-2006 15:04:05") + now := g.clock.Now().Format("02-01-2006 15:04:05") fmt.Printf("%s Container %s (%s) stopped within grace period (%ds) - skipping\n", now, cleanName, shortID, g.cfg.GracePeriod) - g.dispatcher.Skip(fmt.Sprintf("Container %s (%s) skipped - grace period", cleanName, shortID)) + g.notifier.Skip(fmt.Sprintf("Container %s (%s) skipped - grace period", cleanName, shortID)) + metrics.SkipsTotal.WithLabelValues(cleanName, "grace").Inc() return true } } @@ -54,10 +59,11 @@ func (g *Guardian) shouldSkip(ctx context.Context, containerID, containerName st // Backup awareness if g.isBackupManaged(labels) && g.cachedBackupRunning(ctx) { - now := time.Now().Format("02-01-2006 15:04:05") + now := g.clock.Now().Format("02-01-2006 15:04:05") fmt.Printf("%s Container %s (%s) managed by backup (currently running) - skipping\n", now, cleanName, shortID) - g.dispatcher.Skip(fmt.Sprintf("Container %s (%s) skipped - backup running", cleanName, shortID)) + g.notifier.Skip(fmt.Sprintf("Container %s (%s) skipped - backup running", cleanName, shortID)) + metrics.SkipsTotal.WithLabelValues(cleanName, "backup").Inc() return true } @@ -73,7 +79,7 @@ func (g *Guardian) fetchOrchestrationEvents(ctx context.Context) { g.orchestratorCached = true g.orchestratorEvents = nil - now := time.Now() + now := g.clock.Now() since := now.Add(-time.Duration(g.cfg.WatchtowerCooldown) * time.Second) orchestrationOnly := g.cfg.WatchtowerEvents != "all" @@ -84,7 +90,7 @@ func (g *Guardian) fetchOrchestrationEvents(ctx context.Context) { g.orchestratorEvents = events if len(events) > 0 { - dateStr := time.Now().Format("02-01-2006 15:04:05") + dateStr := g.clock.Now().Format("02-01-2006 15:04:05") fmt.Printf("%s Orchestration activity detected: %d container event(s) within %ds cooldown\n", dateStr, len(events), g.cfg.WatchtowerCooldown) } diff --git a/internal/guardian/mock_test.go b/internal/guardian/mock_test.go new file mode 100644 index 0000000..717878d --- /dev/null +++ b/internal/guardian/mock_test.go @@ -0,0 +1,179 @@ +package guardian + +import ( + "context" + "sync" + "time" + + "github.com/moby/moby/api/types/container" + "github.com/moby/moby/api/types/events" +) + +// mockDocker implements docker.API for testing. +type mockDocker struct { + mu sync.Mutex + + unhealthyContainers []container.Summary + unhealthyErr error + + exitedContainers []container.Summary + exitedErr error + + runningContainers []container.Summary + runningErr error + + inspectResults map[string]container.InspectResponse + inspectErr map[string]error + + restartCalls []string + restartErr map[string]error + + startCalls []string + startErr map[string]error + + stopCalls []string + stopErr map[string]error + + statusResults map[string]string + statusErr map[string]error + + finishedAtResults map[string]time.Time + finishedAtErr map[string]error + + containerEvents []events.Message + containerEventsErr error +} + +func newMockDocker() *mockDocker { + return &mockDocker{ + inspectResults: make(map[string]container.InspectResponse), + inspectErr: make(map[string]error), + restartErr: make(map[string]error), + startErr: make(map[string]error), + stopErr: make(map[string]error), + statusResults: make(map[string]string), + statusErr: make(map[string]error), + finishedAtResults: make(map[string]time.Time), + finishedAtErr: make(map[string]error), + } +} + +func (m *mockDocker) UnhealthyContainers(_ context.Context, _ string, _ bool) ([]container.Summary, error) { + return m.unhealthyContainers, m.unhealthyErr +} + +func (m *mockDocker) ExitedContainers(_ context.Context) ([]container.Summary, error) { + return m.exitedContainers, m.exitedErr +} + +func (m *mockDocker) RunningContainers(_ context.Context) ([]container.Summary, error) { + return m.runningContainers, m.runningErr +} + +func (m *mockDocker) InspectContainer(_ context.Context, id string) (container.InspectResponse, error) { + if err, ok := m.inspectErr[id]; ok && err != nil { + return container.InspectResponse{}, err + } + return m.inspectResults[id], nil +} + +func (m *mockDocker) RestartContainer(_ context.Context, id string, _ int) error { + m.mu.Lock() + m.restartCalls = append(m.restartCalls, id) + m.mu.Unlock() + if err, ok := m.restartErr[id]; ok { + return err + } + return nil +} + +func (m *mockDocker) StartContainer(_ context.Context, id string) error { + m.mu.Lock() + m.startCalls = append(m.startCalls, id) + m.mu.Unlock() + if err, ok := m.startErr[id]; ok { + return err + } + return nil +} + +func (m *mockDocker) StopContainer(_ context.Context, id string, _ int) error { + m.mu.Lock() + m.stopCalls = append(m.stopCalls, id) + m.mu.Unlock() + if err, ok := m.stopErr[id]; ok { + return err + } + return nil +} + +func (m *mockDocker) ContainerStatus(_ context.Context, id string) (string, error) { + if err, ok := m.statusErr[id]; ok && err != nil { + return "", err + } + return m.statusResults[id], nil +} + +func (m *mockDocker) ContainerFinishedAt(_ context.Context, id string) (time.Time, error) { + if err, ok := m.finishedAtErr[id]; ok && err != nil { + return time.Time{}, err + } + return m.finishedAtResults[id], nil +} + +func (m *mockDocker) ContainerEvents(_ context.Context, _, _ time.Time, _ bool) ([]events.Message, error) { + return m.containerEvents, m.containerEventsErr +} + +func (m *mockDocker) Close() error { return nil } + +// mockNotifier implements notify.Notifier for testing. +type mockNotifier struct { + mu sync.Mutex + startups []string + actions []string + skips []string + closed bool +} + +func (m *mockNotifier) Startup(text string) { + m.mu.Lock() + m.startups = append(m.startups, text) + m.mu.Unlock() +} + +func (m *mockNotifier) Action(text string) { + m.mu.Lock() + m.actions = append(m.actions, text) + m.mu.Unlock() +} + +func (m *mockNotifier) Skip(text string) { + m.mu.Lock() + m.skips = append(m.skips, text) + m.mu.Unlock() +} + +func (m *mockNotifier) Close() { + m.mu.Lock() + m.closed = true + m.mu.Unlock() +} + +// mockClock implements clock.Clock for testing. +type mockClock struct { + now time.Time +} + +func newMockClock(t time.Time) *mockClock { + return &mockClock{now: t} +} + +func (c *mockClock) Now() time.Time { return c.now } +func (c *mockClock) After(d time.Duration) <-chan time.Time { + ch := make(chan time.Time, 1) + ch <- c.now.Add(d) + return ch +} +func (c *mockClock) Since(t time.Time) time.Duration { return c.now.Sub(t) } +func (c *mockClock) Advance(d time.Duration) { c.now = c.now.Add(d) } diff --git a/internal/guardian/tracker.go b/internal/guardian/tracker.go new file mode 100644 index 0000000..8c9b9b4 --- /dev/null +++ b/internal/guardian/tracker.go @@ -0,0 +1,204 @@ +package guardian + +import ( + "fmt" + "sync" + "time" + + "github.com/Will-Luck/Docker-Guardian/internal/clock" +) + +// TrackerConfig holds circuit breaker / backoff settings. +type TrackerConfig struct { + BackoffMultiplier float64 // multiplicative factor for each retry (default 2) + BackoffMax time.Duration // cap on backoff delay (default 300s) + BackoffResetAfter time.Duration // healthy for this long resets backoff (default 600s) + RestartBudget int // max restarts per window (0 = unlimited) + RestartWindow time.Duration // rolling window for budget (default 300s) +} + +// DefaultTrackerConfig returns sensible defaults. +func DefaultTrackerConfig() TrackerConfig { + return TrackerConfig{ + BackoffMultiplier: 2, + BackoffMax: 300 * time.Second, + BackoffResetAfter: 600 * time.Second, + RestartBudget: 5, + RestartWindow: 300 * time.Second, + } +} + +// ContainerHistory tracks restart history for a single container. +type ContainerHistory struct { + Restarts []time.Time // timestamps of recent restarts + BackoffUntil time.Time // next allowed restart time + BackoffDelay time.Duration // current backoff delay + CircuitOpen bool // true = budget exhausted +} + +// SkipReason describes why a restart was suppressed. +type SkipReason string + +const ( + SkipNone SkipReason = "" + SkipBackoff SkipReason = "backoff" + SkipCircuit SkipReason = "circuit" +) + +// RestartTracker implements per-container circuit breaker and exponential backoff. +type RestartTracker struct { + mu sync.Mutex + history map[string]*ContainerHistory + cfg TrackerConfig + clock clock.Clock +} + +// NewRestartTracker creates a tracker with the given config. +func NewRestartTracker(cfg TrackerConfig, clk clock.Clock) *RestartTracker { + return &RestartTracker{ + history: make(map[string]*ContainerHistory), + cfg: cfg, + clock: clk, + } +} + +// ShouldRestart checks if a restart is allowed for the given container. +// Returns (allowed, skipReason). +func (rt *RestartTracker) ShouldRestart(id string) (bool, SkipReason) { + rt.mu.Lock() + defer rt.mu.Unlock() + + h := rt.getOrCreate(id) + now := rt.clock.Now() + + // Prune old restarts outside the window + rt.pruneOld(h) + + // Check circuit breaker (budget exhausted) + if h.CircuitOpen { + return false, SkipCircuit + } + + // Check backoff + if now.Before(h.BackoffUntil) { + return false, SkipBackoff + } + + // Check budget + if rt.cfg.RestartBudget > 0 && len(h.Restarts) >= rt.cfg.RestartBudget { + h.CircuitOpen = true + return false, SkipCircuit + } + + return true, SkipNone +} + +// RecordRestart records that a restart was performed for the given container. +// Advances the backoff delay for next time. +func (rt *RestartTracker) RecordRestart(id string) { + rt.mu.Lock() + defer rt.mu.Unlock() + + h := rt.getOrCreate(id) + now := rt.clock.Now() + + h.Restarts = append(h.Restarts, now) + + // Calculate next backoff + if h.BackoffDelay == 0 { + h.BackoffDelay = 10 * time.Second // initial backoff + } else { + h.BackoffDelay = time.Duration(float64(h.BackoffDelay) * rt.cfg.BackoffMultiplier) + } + if h.BackoffDelay > rt.cfg.BackoffMax { + h.BackoffDelay = rt.cfg.BackoffMax + } + h.BackoffUntil = now.Add(h.BackoffDelay) +} + +// Reset clears backoff and restart history for a container (e.g. when it becomes healthy). +func (rt *RestartTracker) Reset(id string) { + rt.mu.Lock() + defer rt.mu.Unlock() + + delete(rt.history, id) +} + +// IsCircuitOpen returns true if the circuit is open for the given container. +func (rt *RestartTracker) IsCircuitOpen(id string) bool { + rt.mu.Lock() + defer rt.mu.Unlock() + + h, ok := rt.history[id] + if !ok { + return false + } + return h.CircuitOpen +} + +// BackoffRemaining returns the time remaining in backoff for a container. +func (rt *RestartTracker) BackoffRemaining(id string) time.Duration { + rt.mu.Lock() + defer rt.mu.Unlock() + + h, ok := rt.history[id] + if !ok { + return 0 + } + remaining := h.BackoffUntil.Sub(rt.clock.Now()) + if remaining < 0 { + return 0 + } + return remaining +} + +// CircuitOpenCount returns the number of containers with open circuits. +func (rt *RestartTracker) CircuitOpenCount() int { + rt.mu.Lock() + defer rt.mu.Unlock() + + count := 0 + for _, h := range rt.history { + if h.CircuitOpen { + count++ + } + } + return count +} + +// FormatSkipReason returns a human-readable string for a skip reason. +func (rt *RestartTracker) FormatSkipReason(id, name string, reason SkipReason) string { + switch reason { + case SkipBackoff: + remaining := rt.BackoffRemaining(id) + return fmt.Sprintf("Container %s in backoff (%.0fs remaining)", name, remaining.Seconds()) + case SkipCircuit: + return fmt.Sprintf("Container %s circuit open (restart budget exhausted)", name) + default: + return "" + } +} + +func (rt *RestartTracker) getOrCreate(id string) *ContainerHistory { + h, ok := rt.history[id] + if !ok { + h = &ContainerHistory{} + rt.history[id] = h + } + return h +} + +func (rt *RestartTracker) pruneOld(h *ContainerHistory) { + if rt.cfg.RestartWindow <= 0 { + return + } + cutoff := rt.clock.Now().Add(-rt.cfg.RestartWindow) + i := 0 + for _, t := range h.Restarts { + if t.After(cutoff) { + h.Restarts[i] = t + i++ + } + } + h.Restarts = h.Restarts[:i] +} diff --git a/internal/guardian/tracker_test.go b/internal/guardian/tracker_test.go new file mode 100644 index 0000000..dc5786a --- /dev/null +++ b/internal/guardian/tracker_test.go @@ -0,0 +1,224 @@ +package guardian + +import ( + "testing" + "time" +) + +func TestTracker_AllowsFirstRestart(t *testing.T) { + clk := newMockClock(time.Now()) + rt := NewRestartTracker(DefaultTrackerConfig(), clk) + + allowed, reason := rt.ShouldRestart("abc123") + if !allowed { + t.Errorf("first restart should be allowed, got reason=%s", reason) + } +} + +func TestTracker_BackoffAfterRestart(t *testing.T) { + clk := newMockClock(time.Now()) + rt := NewRestartTracker(DefaultTrackerConfig(), clk) + + rt.RecordRestart("abc123") + + // Immediately after, should be in backoff + allowed, reason := rt.ShouldRestart("abc123") + if allowed { + t.Error("should be in backoff after restart") + } + if reason != SkipBackoff { + t.Errorf("expected backoff reason, got %s", reason) + } + + // Advance past initial 10s backoff + clk.Advance(11 * time.Second) + + allowed, reason = rt.ShouldRestart("abc123") + if !allowed { + t.Errorf("should be allowed after backoff expires, got reason=%s", reason) + } +} + +func TestTracker_ExponentialBackoff(t *testing.T) { + clk := newMockClock(time.Now()) + rt := NewRestartTracker(DefaultTrackerConfig(), clk) + + // First restart: 10s backoff + rt.RecordRestart("abc123") + if rt.BackoffRemaining("abc123") > 10*time.Second || rt.BackoffRemaining("abc123") < 9*time.Second { + t.Errorf("expected ~10s backoff, got %v", rt.BackoffRemaining("abc123")) + } + + // Advance past first backoff + clk.Advance(11 * time.Second) + + // Second restart: 20s backoff + rt.RecordRestart("abc123") + remaining := rt.BackoffRemaining("abc123") + if remaining > 20*time.Second || remaining < 19*time.Second { + t.Errorf("expected ~20s backoff, got %v", remaining) + } + + // Advance past second backoff + clk.Advance(21 * time.Second) + + // Third restart: 40s backoff + rt.RecordRestart("abc123") + remaining = rt.BackoffRemaining("abc123") + if remaining > 40*time.Second || remaining < 39*time.Second { + t.Errorf("expected ~40s backoff, got %v", remaining) + } +} + +func TestTracker_BackoffMax(t *testing.T) { + clk := newMockClock(time.Now()) + cfg := DefaultTrackerConfig() + cfg.BackoffMax = 30 * time.Second + rt := NewRestartTracker(cfg, clk) + + // Restart 5 times, each time advancing past backoff + for i := 0; i < 5; i++ { + rt.RecordRestart("abc123") + clk.Advance(cfg.BackoffMax + time.Second) + } + + // Backoff should be capped at BackoffMax + rt.RecordRestart("abc123") + remaining := rt.BackoffRemaining("abc123") + if remaining > cfg.BackoffMax { + t.Errorf("backoff should be capped at %v, got %v", cfg.BackoffMax, remaining) + } +} + +func TestTracker_BudgetExhausted(t *testing.T) { + clk := newMockClock(time.Now()) + cfg := DefaultTrackerConfig() + cfg.RestartBudget = 3 + cfg.RestartWindow = 3600 * time.Second // large window so restarts aren't pruned + cfg.BackoffMax = 10 * time.Second // small max so we advance less + rt := NewRestartTracker(cfg, clk) + + // Record 3 restarts (budget = 3), advancing past backoff each time + for i := 0; i < 3; i++ { + rt.RecordRestart("abc123") + clk.Advance(cfg.BackoffMax + time.Second) + } + + // 4th restart should be denied — circuit open + allowed, reason := rt.ShouldRestart("abc123") + if allowed { + t.Error("should deny restart when budget exhausted") + } + if reason != SkipCircuit { + t.Errorf("expected circuit reason, got %s", reason) + } + if !rt.IsCircuitOpen("abc123") { + t.Error("circuit should be open") + } +} + +func TestTracker_BudgetUnlimited(t *testing.T) { + clk := newMockClock(time.Now()) + cfg := DefaultTrackerConfig() + cfg.RestartBudget = 0 // unlimited + rt := NewRestartTracker(cfg, clk) + + for i := 0; i < 20; i++ { + rt.RecordRestart("abc123") + clk.Advance(cfg.BackoffMax + time.Second) // advance past max backoff + } + + allowed, _ := rt.ShouldRestart("abc123") + if !allowed { + t.Error("should allow restarts when budget is unlimited") + } +} + +func TestTracker_Reset(t *testing.T) { + clk := newMockClock(time.Now()) + cfg := DefaultTrackerConfig() + cfg.RestartBudget = 2 + rt := NewRestartTracker(cfg, clk) + + // Exhaust budget + rt.RecordRestart("abc123") + rt.RecordRestart("abc123") + clk.Advance(11 * time.Second) + + allowed, _ := rt.ShouldRestart("abc123") + if allowed { + t.Error("should deny after budget exhausted") + } + + // Reset (container became healthy) + rt.Reset("abc123") + + allowed, _ = rt.ShouldRestart("abc123") + if !allowed { + t.Error("should allow after reset") + } +} + +func TestTracker_PruneOldRestarts(t *testing.T) { + clk := newMockClock(time.Now()) + cfg := DefaultTrackerConfig() + cfg.RestartBudget = 3 + cfg.RestartWindow = 60 * time.Second + rt := NewRestartTracker(cfg, clk) + + // Record 3 restarts + for i := 0; i < 3; i++ { + rt.RecordRestart("abc123") + clk.Advance(11 * time.Second) + } + + // Advance past the window so old restarts are pruned + clk.Advance(60 * time.Second) + + // Should be allowed now — old restarts pruned + allowed, _ := rt.ShouldRestart("abc123") + if !allowed { + t.Error("should allow after old restarts are pruned") + } +} + +func TestTracker_CircuitOpenCount(t *testing.T) { + clk := newMockClock(time.Now()) + cfg := DefaultTrackerConfig() + cfg.RestartBudget = 1 + rt := NewRestartTracker(cfg, clk) + + rt.RecordRestart("a") + clk.Advance(11 * time.Second) + rt.ShouldRestart("a") // triggers circuit open + + rt.RecordRestart("b") + clk.Advance(11 * time.Second) + rt.ShouldRestart("b") // triggers circuit open + + if rt.CircuitOpenCount() != 2 { + t.Errorf("expected 2 open circuits, got %d", rt.CircuitOpenCount()) + } +} + +func TestContainerAction(t *testing.T) { + tests := []struct { + labels map[string]string + expected string + }{ + {nil, "restart"}, + {map[string]string{}, "restart"}, + {map[string]string{"autoheal.action": "restart"}, "restart"}, + {map[string]string{"autoheal.action": "stop"}, "stop"}, + {map[string]string{"autoheal.action": "notify"}, "notify"}, + {map[string]string{"autoheal.action": "none"}, "none"}, + {map[string]string{"autoheal.action": "invalid"}, "restart"}, + } + + for _, tt := range tests { + got := containerAction(tt.labels) + if got != tt.expected { + t.Errorf("containerAction(%v) = %q, want %q", tt.labels, got, tt.expected) + } + } +} diff --git a/internal/guardian/unhealthy.go b/internal/guardian/unhealthy.go index 527fc0f..ce45b6c 100644 --- a/internal/guardian/unhealthy.go +++ b/internal/guardian/unhealthy.go @@ -6,9 +6,23 @@ import ( "strconv" "strings" "time" + + "github.com/Will-Luck/Docker-Guardian/internal/metrics" ) -// checkUnhealthy finds unhealthy containers and restarts them. +// containerAction returns the action to take for a container based on its labels. +// Possible values: "restart" (default), "stop", "notify", "none". +func containerAction(labels map[string]string) string { + if action, ok := labels["autoheal.action"]; ok { + switch action { + case "restart", "stop", "notify", "none": + return action + } + } + return "restart" +} + +// checkUnhealthy finds unhealthy containers and handles them based on action labels. func (g *Guardian) checkUnhealthy(ctx context.Context) { containers, err := g.docker.UnhealthyContainers(ctx, g.cfg.ContainerLabel, g.cfg.OnlyMonitorRunning) if err != nil { @@ -16,18 +30,31 @@ func (g *Guardian) checkUnhealthy(ctx context.Context) { return } + metrics.UnhealthyContainers.Set(float64(len(containers))) + metrics.CircuitOpenContainers.Set(float64(g.tracker.CircuitOpenCount())) + for _, c := range containers { // Skip containers opted out via label if c.Labels["autoheal"] == "False" { continue } + if len(c.Names) == 0 { + continue + } + id := c.ID shortID := id[:12] name := strings.TrimPrefix(c.Names[0], "/") + // Check per-container action label + action := containerAction(c.Labels) + if action == "none" { + continue + } + if string(c.State) == "restarting" { - now := time.Now().Format("02-01-2006 15:04:05") + now := g.clock.Now().Format("02-01-2006 15:04:05") fmt.Printf("%s Container %s (%s) found to be restarting - don't restart\n", now, name, shortID) continue } @@ -36,6 +63,24 @@ func (g *Guardian) checkUnhealthy(ctx context.Context) { continue } + // Handle notify-only action + if action == "notify" { + g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy (action=notify)", name, shortID)) + continue + } + + // Circuit breaker check (for restart and stop actions) + if allowed, reason := g.tracker.ShouldRestart(id); !allowed { + msg := g.tracker.FormatSkipReason(id, name, reason) + now := g.clock.Now().Format("02-01-2006 15:04:05") + fmt.Printf("%s %s\n", now, msg) + metrics.SkipsTotal.WithLabelValues(name, string(reason)).Inc() + if reason == SkipCircuit { + g.notifier.Action(fmt.Sprintf("[CRITICAL] %s", msg)) + } + continue + } + timeout := g.cfg.DefaultStopTimeout if v, ok := c.Labels["autoheal.stop.timeout"]; ok { if parsed, err := strconv.Atoi(v); err == nil { @@ -43,17 +88,39 @@ func (g *Guardian) checkUnhealthy(ctx context.Context) { } } - now := time.Now().Format("02-01-2006 15:04:05") + // Handle stop action (quarantine) + if action == "stop" { + now := g.clock.Now().Format("02-01-2006 15:04:05") + fmt.Printf("%s Container %s (%s) found to be unhealthy - Stopping container (action=stop)\n", now, name, shortID) + if err := g.docker.StopContainer(ctx, id, timeout); err != nil { + g.log.Error("failed to stop container", "container", name, "id", shortID, "error", err) + g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Failed to stop (quarantine)!", name, shortID)) + metrics.RestartsTotal.WithLabelValues(name, "failure").Inc() + } else { + g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Stopped (quarantined).", name, shortID)) + metrics.RestartsTotal.WithLabelValues(name, "success").Inc() + } + g.tracker.RecordRestart(id) + continue + } + + // Default: restart + now := g.clock.Now().Format("02-01-2006 15:04:05") fmt.Printf("%s Container %s (%s) found to be unhealthy - Restarting container now with %ds timeout\n", now, name, shortID, timeout) + start := time.Now() if err := g.docker.RestartContainer(ctx, id, timeout); err != nil { g.log.Error("failed to restart container", "container", name, "id", shortID, "error", err) - g.dispatcher.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Failed to restart the container!", name, shortID)) + g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Failed to restart the container!", name, shortID)) + metrics.RestartsTotal.WithLabelValues(name, "failure").Inc() } else { - g.dispatcher.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Successfully restarted the container!", name, shortID)) + g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Successfully restarted the container!", name, shortID)) + metrics.RestartsTotal.WithLabelValues(name, "success").Inc() } + metrics.RestartDuration.WithLabelValues(name).Observe(time.Since(start).Seconds()) + g.tracker.RecordRestart(id) g.runPostRestartScript(name, shortID, string(c.State), timeout) } } diff --git a/internal/guardian/unhealthy_test.go b/internal/guardian/unhealthy_test.go new file mode 100644 index 0000000..3e3cdc9 --- /dev/null +++ b/internal/guardian/unhealthy_test.go @@ -0,0 +1,243 @@ +package guardian + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/Will-Luck/Docker-Guardian/internal/config" + "github.com/Will-Luck/Docker-Guardian/internal/logging" + "github.com/moby/moby/api/types/container" +) + +func TestCheckUnhealthy_RestartsContainer(t *testing.T) { + cfg := &config.Config{ + ContainerLabel: "all", + DefaultStopTimeout: 10, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + dock.unhealthyContainers = []container.Summary{ + { + ID: "abcdef1234567890abcdef", + Names: []string{"/test-app"}, + State: "running", + Labels: map[string]string{}, + }, + } + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkUnhealthy(context.Background()) + + if len(dock.restartCalls) != 1 { + t.Fatalf("expected 1 restart call, got %d", len(dock.restartCalls)) + } + if dock.restartCalls[0] != "abcdef1234567890abcdef" { + t.Errorf("restarted wrong container: %s", dock.restartCalls[0]) + } + if len(notif.actions) != 1 || !strings.Contains(notif.actions[0], "Successfully restarted") { + t.Errorf("expected success notification, got %v", notif.actions) + } +} + +func TestCheckUnhealthy_SkipsOptedOut(t *testing.T) { + cfg := &config.Config{ + ContainerLabel: "all", + DefaultStopTimeout: 10, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + dock.unhealthyContainers = []container.Summary{ + { + ID: "abcdef1234567890abcdef", + Names: []string{"/opted-out"}, + State: "running", + Labels: map[string]string{"autoheal": "False"}, + }, + } + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkUnhealthy(context.Background()) + + if len(dock.restartCalls) != 0 { + t.Error("should not restart opted-out container") + } +} + +func TestCheckUnhealthy_SkipsEmptyNames(t *testing.T) { + cfg := &config.Config{ + ContainerLabel: "all", + DefaultStopTimeout: 10, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + dock.unhealthyContainers = []container.Summary{ + { + ID: "abcdef1234567890abcdef", + Names: []string{}, + State: "running", + Labels: map[string]string{}, + }, + } + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + // Should not panic + g.checkUnhealthy(context.Background()) + + if len(dock.restartCalls) != 0 { + t.Error("should not restart container with no names") + } +} + +func TestCheckUnhealthy_SkipsRestarting(t *testing.T) { + cfg := &config.Config{ + ContainerLabel: "all", + DefaultStopTimeout: 10, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + dock.unhealthyContainers = []container.Summary{ + { + ID: "abcdef1234567890abcdef", + Names: []string{"/restarting-app"}, + State: "restarting", + Labels: map[string]string{}, + }, + } + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkUnhealthy(context.Background()) + + if len(dock.restartCalls) != 0 { + t.Error("should not restart container already in restarting state") + } +} + +func TestCheckUnhealthy_RestartFailure(t *testing.T) { + cfg := &config.Config{ + ContainerLabel: "all", + DefaultStopTimeout: 10, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + dock.unhealthyContainers = []container.Summary{ + { + ID: "abcdef1234567890abcdef", + Names: []string{"/fail-app"}, + State: "running", + Labels: map[string]string{}, + }, + } + dock.restartErr["abcdef1234567890abcdef"] = errors.New("restart failed") + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkUnhealthy(context.Background()) + + if len(notif.actions) != 1 || !strings.Contains(notif.actions[0], "Failed") { + t.Errorf("expected failure notification, got %v", notif.actions) + } +} + +func TestCheckUnhealthy_CustomTimeout(t *testing.T) { + cfg := &config.Config{ + ContainerLabel: "all", + DefaultStopTimeout: 10, + WatchtowerCooldown: 0, + GracePeriod: 0, + BackupLabel: "", + } + dock := newMockDocker() + notif := &mockNotifier{} + clk := newMockClock(time.Now()) + + dock.unhealthyContainers = []container.Summary{ + { + ID: "abcdef1234567890abcdef", + Names: []string{"/custom-timeout"}, + State: "running", + Labels: map[string]string{"autoheal.stop.timeout": "30"}, + }, + } + + g := &Guardian{ + cfg: cfg, + docker: dock, + notifier: notif, + log: logging.New(false), + clock: clk, + tracker: NewRestartTracker(DefaultTrackerConfig(), clk), + } + + g.checkUnhealthy(context.Background()) + + if len(dock.restartCalls) != 1 { + t.Fatalf("expected 1 restart, got %d", len(dock.restartCalls)) + } +} diff --git a/internal/metrics/prometheus.go b/internal/metrics/prometheus.go index 150c230..fee15c5 100644 --- a/internal/metrics/prometheus.go +++ b/internal/metrics/prometheus.go @@ -1,4 +1,92 @@ package metrics -// Prometheus metrics endpoint — placeholder for v2.0.0. -// Will expose /metrics with counters for restarts, starts, skips, and notification outcomes. +import ( + "fmt" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +// Metrics holds all Prometheus metric collectors. +var ( + RestartsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "docker_guardian_restarts_total", + Help: "Total container restarts by result.", + }, []string{"container", "result"}) + + SkipsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "docker_guardian_skips_total", + Help: "Total skipped containers by reason.", + }, []string{"container", "reason"}) + + NotificationsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "docker_guardian_notifications_total", + Help: "Total notification sends by service and result.", + }, []string{"service", "result"}) + + EventsProcessedTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "docker_guardian_events_processed_total", + Help: "Total Docker events processed by action.", + }, []string{"action"}) + + UnhealthyContainers = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "docker_guardian_unhealthy_containers", + Help: "Current number of unhealthy containers.", + }) + + CircuitOpenContainers = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "docker_guardian_circuit_open_containers", + Help: "Number of containers with open circuit breakers.", + }) + + EventStreamConnected = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "docker_guardian_event_stream_connected", + Help: "1 if connected to Docker event stream, 0 otherwise.", + }) + + RestartDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "docker_guardian_restart_duration_seconds", + Help: "Time taken to restart a container.", + Buckets: prometheus.DefBuckets, + }, []string{"container"}) + + EventProcessingDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "docker_guardian_event_processing_duration_seconds", + Help: "Time taken to process a Docker event.", + Buckets: prometheus.DefBuckets, + }) +) + +func init() { + prometheus.MustRegister( + RestartsTotal, + SkipsTotal, + NotificationsTotal, + EventsProcessedTotal, + UnhealthyContainers, + CircuitOpenContainers, + EventStreamConnected, + RestartDuration, + EventProcessingDuration, + ) +} + +// Serve starts the Prometheus metrics HTTP server on the given port. +// Returns immediately; the server runs in the background. +// If port is 0, metrics are disabled and this is a no-op. +func Serve(port int) { + if port == 0 { + return + } + + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.Handler()) + + go func() { + addr := fmt.Sprintf(":%d", port) + if err := http.ListenAndServe(addr, mux); err != nil { //nolint:gosec // Metrics endpoint, intentionally unauthenticated + fmt.Printf("metrics server error: %v\n", err) + } + }() +} diff --git a/internal/notify/notifier.go b/internal/notify/notifier.go index 813c068..ba591ea 100644 --- a/internal/notify/notifier.go +++ b/internal/notify/notifier.go @@ -8,18 +8,33 @@ import ( "net/smtp" "net/url" "strings" + "sync" "time" "github.com/Will-Luck/Docker-Guardian/internal/config" "github.com/Will-Luck/Docker-Guardian/internal/logging" + "github.com/Will-Luck/Docker-Guardian/internal/metrics" ) +// Notifier is the interface for sending notifications from the guardian. +type Notifier interface { + Startup(text string) + Action(text string) + Skip(text string) + Close() +} + // Dispatcher sends notifications to all configured services. type Dispatcher struct { cfg *config.Config log *logging.Logger client *http.Client resolved []string + wg sync.WaitGroup + + // Rate limiting: per container+event key → last notification time + rateMu sync.Mutex + rateLimit map[string]time.Time } // NewDispatcher creates a notification dispatcher from config. @@ -30,7 +45,22 @@ func NewDispatcher(cfg *config.Config, log *logging.Logger) *Dispatcher { client: &http.Client{ Timeout: time.Duration(cfg.CurlTimeout) * time.Second, }, - resolved: cfg.ResolvedNotifyEvents(), + resolved: cfg.ResolvedNotifyEvents(), + rateLimit: make(map[string]time.Time), + } +} + +// Close waits for in-flight notification goroutines to finish, with a 10-second timeout. +func (d *Dispatcher) Close() { + done := make(chan struct{}) + go func() { + d.wg.Wait() + close(done) + }() + select { + case <-done: + case <-time.After(10 * time.Second): + d.log.Warn("notification shutdown timed out after 10s, some notifications may have been lost") } } @@ -82,17 +112,38 @@ func (d *Dispatcher) hasEvent(event string) bool { return false } +// isRateLimited checks if a notification for this key is rate-limited. +// Returns true if the notification should be suppressed. +func (d *Dispatcher) isRateLimited(key string) bool { + if d.cfg.NotifyRateLimit <= 0 { + return false + } + + d.rateMu.Lock() + defer d.rateMu.Unlock() + + window := time.Duration(d.cfg.NotifyRateLimit) * time.Second + if last, ok := d.rateLimit[key]; ok { + if time.Since(last) < window { + return true + } + } + d.rateLimit[key] = time.Now() + return false +} + // Startup sends a startup notification. func (d *Dispatcher) Startup(text string) { if !d.hasEvent("startup") { return } - d.dispatch(text) + d.dispatch(text, false) } // Action sends an action notification (success or failure). +// Action events use retry on failure. func (d *Dispatcher) Action(text string) { - if strings.Contains(text, "Failed") { + if strings.Contains(text, "Failed") || strings.Contains(text, "[CRITICAL]") { if !d.hasEvent("actions") && !d.hasEvent("failures") { return } @@ -101,7 +152,17 @@ func (d *Dispatcher) Action(text string) { return } } - d.dispatch(text) + + // Rate limit using the first ~50 chars as a key (contains container name) + key := text + if len(key) > 50 { + key = key[:50] + } + if d.isRateLimited(key) { + return + } + + d.dispatch(text, true) } // Skip sends a skip notification. @@ -109,10 +170,10 @@ func (d *Dispatcher) Skip(text string) { if !d.hasEvent("skips") { return } - d.dispatch(text) + d.dispatch(text, false) } -func (d *Dispatcher) dispatch(text string) { +func (d *Dispatcher) dispatch(text string, retry bool) { if d.hasEvent("debug") { now := time.Now().Format("2006-01-02T15:04:05-0700") services := d.ConfiguredServices() @@ -124,97 +185,201 @@ func (d *Dispatcher) dispatch(text string) { } if d.cfg.WebhookURL != "" { - go d.sendJSON(d.cfg.WebhookURL, map[string]string{d.cfg.WebhookJSONKey: text}) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("webhook", retry, func() error { + return d.sendJSON(d.cfg.WebhookURL, map[string]string{d.cfg.WebhookJSONKey: text}) + }) + }() } if d.cfg.AppriseURL != "" { - go d.sendJSON(d.cfg.AppriseURL, map[string]string{"title": "Docker-Guardian", "body": text}) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("apprise", retry, func() error { + return d.sendJSON(d.cfg.AppriseURL, map[string]string{"title": "Docker-Guardian", "body": text}) + }) + }() } if d.cfg.GotifyURL != "" { - go d.sendJSON(d.cfg.GotifyURL+"/message?token="+d.cfg.GotifyToken, - map[string]any{"title": "Docker-Guardian", "message": text, "priority": 5}) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("gotify", retry, func() error { + return d.sendJSON(d.cfg.GotifyURL+"/message?token="+d.cfg.GotifyToken, + map[string]any{"title": "Docker-Guardian", "message": text, "priority": 5}) + }) + }() } if d.cfg.DiscordWebhook != "" { - go d.sendJSON(d.cfg.DiscordWebhook, map[string]any{ - "embeds": []map[string]any{{"title": "Docker-Guardian", "description": text, "color": 3066993}}, - }) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("discord", retry, func() error { + return d.sendJSON(d.cfg.DiscordWebhook, map[string]any{ + "embeds": []map[string]any{{"title": "Docker-Guardian", "description": text, "color": 3066993}}, + }) + }) + }() } if d.cfg.SlackWebhook != "" { - go d.sendJSON(d.cfg.SlackWebhook, map[string]string{"text": "*Docker-Guardian*\n" + text}) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("slack", retry, func() error { + return d.sendJSON(d.cfg.SlackWebhook, map[string]string{"text": "*Docker-Guardian*\n" + text}) + }) + }() } if d.cfg.TelegramToken != "" { - go d.sendJSON("https://api.telegram.org/bot"+d.cfg.TelegramToken+"/sendMessage", - map[string]string{"chat_id": d.cfg.TelegramChatID, "text": "Docker-Guardian: " + text}) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("telegram", retry, func() error { + return d.sendJSON("https://api.telegram.org/bot"+d.cfg.TelegramToken+"/sendMessage", + map[string]string{"chat_id": d.cfg.TelegramChatID, "text": "Docker-Guardian: " + text}) + }) + }() } if d.cfg.PushoverToken != "" { - go d.sendForm("https://api.pushover.net/1/messages.json", map[string]string{ - "token": d.cfg.PushoverToken, "user": d.cfg.PushoverUser, - "title": "Docker-Guardian", "message": text, - }) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("pushover", retry, func() error { + return d.sendForm("https://api.pushover.net/1/messages.json", map[string]string{ + "token": d.cfg.PushoverToken, "user": d.cfg.PushoverUser, + "title": "Docker-Guardian", "message": text, + }) + }) + }() } if d.cfg.PushbulletToken != "" { - go d.sendJSONWithHeader("https://api.pushbullet.com/v2/pushes", - "Access-Token", d.cfg.PushbulletToken, - map[string]string{"type": "note", "title": "Docker-Guardian", "body": text}) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("pushbullet", retry, func() error { + return d.sendJSONWithHeader("https://api.pushbullet.com/v2/pushes", + "Access-Token", d.cfg.PushbulletToken, + map[string]string{"type": "note", "title": "Docker-Guardian", "body": text}) + }) + }() } if d.cfg.LunaSeaWebhook != "" { - go d.sendJSON(d.cfg.LunaSeaWebhook, map[string]string{"title": "Docker-Guardian", "body": text}) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("lunasea", retry, func() error { + return d.sendJSON(d.cfg.LunaSeaWebhook, map[string]string{"title": "Docker-Guardian", "body": text}) + }) + }() } if d.cfg.EmailSMTP != "" { - go d.sendEmail(text) + d.wg.Add(1) + go func() { + defer d.wg.Done() + d.sendWithRetry("email", retry, func() error { + return d.sendEmail(text) + }) + }() + } +} + +// sendWithRetry retries a send function up to 3 times with exponential backoff. +// Only retries if retry=true. Tracks metrics per service. +func (d *Dispatcher) sendWithRetry(service string, retry bool, fn func() error) { + maxAttempts := 1 + if retry { + maxAttempts = 3 + } + + delays := []time.Duration{time.Second, 2 * time.Second, 4 * time.Second} + + for attempt := 0; attempt < maxAttempts; attempt++ { + if err := fn(); err == nil { + metrics.NotificationsTotal.WithLabelValues(service, "success").Inc() + return + } + // Only retry if we have attempts left + if attempt < maxAttempts-1 { + time.Sleep(delays[attempt]) + } } + metrics.NotificationsTotal.WithLabelValues(service, "failure").Inc() } -func (d *Dispatcher) sendJSON(url string, payload any) { +func (d *Dispatcher) sendJSON(targetURL string, payload any) error { body, err := json.Marshal(payload) if err != nil { d.log.Error("failed to marshal notification payload", "error", err) - return + return err } - resp, err := d.client.Post(url, "application/json", bytes.NewReader(body)) + resp, err := d.client.Post(targetURL, "application/json", bytes.NewReader(body)) if err != nil { - d.log.Debug("notification send failed", "url", url, "error", err) - return + d.log.Warn("notification send failed", "url", targetURL, "error", err) + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + d.log.Warn("notification returned non-2xx status", "url", targetURL, "status", resp.StatusCode) + return fmt.Errorf("HTTP %d", resp.StatusCode) } - resp.Body.Close() + return nil } -func (d *Dispatcher) sendJSONWithHeader(url, headerKey, headerVal string, payload any) { +func (d *Dispatcher) sendJSONWithHeader(targetURL, headerKey, headerVal string, payload any) error { body, err := json.Marshal(payload) if err != nil { - return + d.log.Warn("failed to marshal notification payload", "error", err) + return err } - req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(body)) + req, err := http.NewRequest(http.MethodPost, targetURL, bytes.NewReader(body)) if err != nil { - return + d.log.Warn("failed to create notification request", "error", err) + return err } req.Header.Set("Content-Type", "application/json") req.Header.Set(headerKey, headerVal) resp, err := d.client.Do(req) if err != nil { - return + d.log.Warn("notification send failed", "url", targetURL, "error", err) + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + d.log.Warn("notification returned non-2xx status", "url", targetURL, "status", resp.StatusCode) + return fmt.Errorf("HTTP %d", resp.StatusCode) } - resp.Body.Close() + return nil } -func (d *Dispatcher) sendForm(endpoint string, fields map[string]string) { +func (d *Dispatcher) sendForm(endpoint string, fields map[string]string) error { vals := url.Values{} for k, v := range fields { vals.Set(k, v) } resp, err := d.client.Post(endpoint, "application/x-www-form-urlencoded", strings.NewReader(vals.Encode())) if err != nil { - return + d.log.Warn("notification send failed", "url", endpoint, "error", err) + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + d.log.Warn("notification returned non-2xx status", "url", endpoint, "status", resp.StatusCode) + return fmt.Errorf("HTTP %d", resp.StatusCode) } - resp.Body.Close() + return nil } -func (d *Dispatcher) sendEmail(text string) { +func (d *Dispatcher) sendEmail(text string) error { msg := fmt.Sprintf("From: %s\r\nTo: %s\r\nSubject: Docker-Guardian Alert\r\n\r\n%s", d.cfg.EmailFrom, d.cfg.EmailTo, text) auth := smtp.PlainAuth("", d.cfg.EmailUser, d.cfg.EmailPass, strings.Split(d.cfg.EmailSMTP, ":")[0]) err := smtp.SendMail(d.cfg.EmailSMTP, auth, d.cfg.EmailFrom, []string{d.cfg.EmailTo}, []byte(msg)) if err != nil { - d.log.Debug("email notification failed", "error", err) + d.log.Warn("email notification failed", "error", err) + return err } + return nil } diff --git a/tests/test-all.sh b/tests/test-all.sh index 4aaf477..b692945 100755 --- a/tests/test-all.sh +++ b/tests/test-all.sh @@ -37,6 +37,9 @@ run_test "Backup Awareness" "test-backup.sh" run_test "Grace Period" "test-grace.sh" run_test "Watchtower Awareness" "test-watchtower.sh" run_test "Notifications" "test-notifications.sh" +run_test "Opt-Out Label" "test-opt-out.sh" +run_test "Circuit Breaker" "test-circuit-breaker.sh" +run_test "Custom Label Filter" "test-custom-label.sh" echo "=============================================" echo " Final Results: ${TOTAL_PASS} suites passed, ${TOTAL_FAIL} suites failed" diff --git a/tests/test-circuit-breaker.sh b/tests/test-circuit-breaker.sh new file mode 100644 index 0000000..f9fae35 --- /dev/null +++ b/tests/test-circuit-breaker.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +# Test: Circuit breaker stops restarting after budget is exhausted +# Uses a container that always fails its healthcheck. After the restart budget +# is hit, Guardian should stop restarting and log a CRITICAL circuit-open message. + +set -euo pipefail + +PASS=0 +FAIL=0 +GUARDIAN_IMAGE="${GUARDIAN_IMAGE:-docker-guardian}" + +cleanup() { + echo "Cleaning up..." + docker rm -f dg-test-cb-target dg-test-guardian-cb 2>/dev/null || true +} +trap cleanup EXIT + +echo "=== Test: Circuit Breaker ===" + +cleanup + +# Start a container that always fails its healthcheck +docker run -d \ + --name dg-test-cb-target \ + --health-cmd="exit 1" \ + --health-interval=2s \ + --health-retries=1 \ + --health-start-period=0s \ + alpine:3.20 sh -c 'sleep 3600' + +# Wait for unhealthy +echo "Waiting for container to become unhealthy..." +for i in $(seq 1 15); do + HC_STATE=$(docker inspect -f '{{.State.Health.Status}}' dg-test-cb-target 2>/dev/null || echo "unknown") + if [ "$HC_STATE" = "unhealthy" ]; then + echo "Container is unhealthy after ${i}s" + break + fi + sleep 1 +done + +if [ "$HC_STATE" != "unhealthy" ]; then + echo "FAIL: Container did not become unhealthy within 15s" + FAIL=$((FAIL + 1)) + echo "" + echo "=== Circuit Breaker Test Results: ${PASS} passed, ${FAIL} failed ===" + exit "$FAIL" +fi + +echo "PASS: Container reached unhealthy state" +PASS=$((PASS + 1)) + +# Start Guardian with a very low restart budget (2) and short window/backoff +# Budget=2 means after 2 restarts the circuit opens +docker run -d \ + --name dg-test-guardian-cb \ + -e AUTOHEAL_CONTAINER_LABEL=all \ + -e AUTOHEAL_INTERVAL=3 \ + -e AUTOHEAL_GRACE_PERIOD=0 \ + -e AUTOHEAL_WATCHTOWER_COOLDOWN=0 \ + -e AUTOHEAL_RESTART_BUDGET=2 \ + -e AUTOHEAL_RESTART_WINDOW=600 \ + -e AUTOHEAL_BACKOFF_MULTIPLIER=1 \ + -e AUTOHEAL_BACKOFF_MAX=1 \ + -e AUTOHEAL_BACKOFF_RESET_AFTER=600 \ + -e NOTIFY_EVENTS=actions \ + -v /var/run/docker.sock:/var/run/docker.sock \ + "$GUARDIAN_IMAGE" + +# ── Test 1: Guardian restarts the container at least once ────────── + +echo "" +echo "--- Test 1: Guardian restarts container at least once ---" + +STARTED_AT_BEFORE=$(docker inspect -f '{{.State.StartedAt}}' dg-test-cb-target 2>/dev/null) + +RESTARTED=false +for i in $(seq 1 30); do + STARTED_NOW=$(docker inspect -f '{{.State.StartedAt}}' dg-test-cb-target 2>/dev/null || echo "unknown") + if [ "$STARTED_NOW" != "$STARTED_AT_BEFORE" ] && [ "$STARTED_NOW" != "unknown" ]; then + echo "PASS: Container was restarted at least once (took ${i}s)" + PASS=$((PASS + 1)) + RESTARTED=true + break + fi + sleep 1 +done + +if [ "$RESTARTED" = false ]; then + echo "FAIL: Container was not restarted within 30s" + FAIL=$((FAIL + 1)) + docker logs dg-test-guardian-cb 2>&1 | tail -30 + echo "" + echo "=== Circuit Breaker Test Results: ${PASS} passed, ${FAIL} failed ===" + exit "$FAIL" +fi + +# ── Test 2: After budget exhausted, Guardian logs circuit open ───── + +echo "" +echo "--- Test 2: Circuit breaker opens after budget exhausted ---" + +# Wait long enough for Guardian to hit the budget (2 restarts) and see the circuit message +# With backoff_max=1 and budget=2, this should happen within ~30s +echo "Waiting up to 60s for circuit breaker to open..." +CIRCUIT_OPEN=false +for i in $(seq 1 60); do + if docker logs dg-test-guardian-cb 2>&1 | grep -q "circuit open\|budget exhausted"; then + echo "PASS: Guardian logged circuit breaker open (took ${i}s)" + PASS=$((PASS + 1)) + CIRCUIT_OPEN=true + break + fi + sleep 1 +done + +if [ "$CIRCUIT_OPEN" = false ]; then + echo "FAIL: Guardian did not log circuit breaker open within 60s" + FAIL=$((FAIL + 1)) + echo "Guardian logs:" + docker logs dg-test-guardian-cb 2>&1 | tail -30 +fi + +# ── Test 3: Verify CRITICAL notification was logged ──────────────── + +echo "" +echo "--- Test 3: CRITICAL notification in logs ---" + +if docker logs dg-test-guardian-cb 2>&1 | grep -q "\[CRITICAL\]"; then + echo "PASS: CRITICAL notification logged for circuit breaker" + PASS=$((PASS + 1)) +else + echo "FAIL: No CRITICAL notification found in logs" + FAIL=$((FAIL + 1)) + docker logs dg-test-guardian-cb 2>&1 | tail -20 +fi + +# ── Test 4: Guardian itself is still running ─────────────────────── + +echo "" +echo "--- Test 4: Guardian is still healthy after circuit opens ---" + +GUARDIAN_STATE=$(docker inspect -f '{{.State.Status}}' dg-test-guardian-cb 2>/dev/null || echo "unknown") +if [ "$GUARDIAN_STATE" = "running" ]; then + echo "PASS: Guardian still running after circuit breaker activation" + PASS=$((PASS + 1)) +else + echo "FAIL: Guardian stopped unexpectedly (state=$GUARDIAN_STATE)" + FAIL=$((FAIL + 1)) +fi + +echo "" +echo "=== Circuit Breaker Test Results: ${PASS} passed, ${FAIL} failed ===" +exit "$FAIL" diff --git a/tests/test-custom-label.sh b/tests/test-custom-label.sh new file mode 100644 index 0000000..dd4a5a2 --- /dev/null +++ b/tests/test-custom-label.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +# Test: Custom label filtering +# Verifies that AUTOHEAL_CONTAINER_LABEL restricts monitoring to only +# containers with that label, ignoring unhealthy containers without it. + +set -euo pipefail + +PASS=0 +FAIL=0 +GUARDIAN_IMAGE="${GUARDIAN_IMAGE:-docker-guardian}" + +cleanup() { + echo "Cleaning up..." + docker rm -f dg-test-label-monitored dg-test-label-ignored dg-test-guardian-label 2>/dev/null || true +} +trap cleanup EXIT + +echo "=== Test: Custom Label Filtering ===" + +cleanup + +# ── Start two unhealthy containers ───────────────────────────────── +# One with our custom label, one without + +echo "Starting container WITH custom label (my-monitor=true)..." +docker run -d \ + --name dg-test-label-monitored \ + --label my-monitor=true \ + --health-cmd="exit 1" \ + --health-interval=2s \ + --health-retries=1 \ + --health-start-period=0s \ + alpine:3.20 sh -c 'sleep 3600' + +echo "Starting container WITHOUT custom label..." +docker run -d \ + --name dg-test-label-ignored \ + --health-cmd="exit 1" \ + --health-interval=2s \ + --health-retries=1 \ + --health-start-period=0s \ + alpine:3.20 sh -c 'sleep 3600' + +# Wait for both to become unhealthy +echo "Waiting for both containers to become unhealthy..." +for i in $(seq 1 15); do + S1=$(docker inspect -f '{{.State.Health.Status}}' dg-test-label-monitored 2>/dev/null || echo "unknown") + S2=$(docker inspect -f '{{.State.Health.Status}}' dg-test-label-ignored 2>/dev/null || echo "unknown") + if [ "$S1" = "unhealthy" ] && [ "$S2" = "unhealthy" ]; then + echo "Both containers unhealthy after ${i}s" + break + fi + sleep 1 +done + +if [ "$S1" != "unhealthy" ] || [ "$S2" != "unhealthy" ]; then + echo "FAIL: One or both containers did not become unhealthy within 15s" + FAIL=$((FAIL + 1)) + echo "" + echo "=== Custom Label Test Results: ${PASS} passed, ${FAIL} failed ===" + exit "$FAIL" +fi + +STARTED_MONITORED=$(docker inspect -f '{{.State.StartedAt}}' dg-test-label-monitored 2>/dev/null) +STARTED_IGNORED=$(docker inspect -f '{{.State.StartedAt}}' dg-test-label-ignored 2>/dev/null) + +# Start Guardian with custom label filter +echo "Starting Guardian with AUTOHEAL_CONTAINER_LABEL=my-monitor..." +docker run -d \ + --name dg-test-guardian-label \ + -e AUTOHEAL_CONTAINER_LABEL=my-monitor \ + -e AUTOHEAL_INTERVAL=3 \ + -e AUTOHEAL_GRACE_PERIOD=0 \ + -e AUTOHEAL_WATCHTOWER_COOLDOWN=0 \ + -v /var/run/docker.sock:/var/run/docker.sock \ + "$GUARDIAN_IMAGE" + +# ── Test 1: Labelled container IS restarted ──────────────────────── + +echo "" +echo "--- Test 1: Labelled container is restarted ---" + +RESTARTED=false +for i in $(seq 1 20); do + STARTED_NOW=$(docker inspect -f '{{.State.StartedAt}}' dg-test-label-monitored 2>/dev/null || echo "unknown") + if [ "$STARTED_NOW" != "$STARTED_MONITORED" ] && [ "$STARTED_NOW" != "unknown" ]; then + echo "PASS: Labelled container was restarted (took ${i}s)" + PASS=$((PASS + 1)) + RESTARTED=true + break + fi + sleep 1 +done + +if [ "$RESTARTED" = false ]; then + echo "FAIL: Labelled container was not restarted within 20s" + FAIL=$((FAIL + 1)) + docker logs dg-test-guardian-label 2>&1 | tail -20 +fi + +# ── Test 2: Unlabelled container is NOT restarted ────────────────── + +echo "" +echo "--- Test 2: Unlabelled container is ignored ---" + +# Give it some extra time to be sure +sleep 10 + +STARTED_IGN_NOW=$(docker inspect -f '{{.State.StartedAt}}' dg-test-label-ignored 2>/dev/null) +if [ "$STARTED_IGN_NOW" = "$STARTED_IGNORED" ]; then + echo "PASS: Unlabelled container was NOT restarted" + PASS=$((PASS + 1)) +else + echo "FAIL: Unlabelled container was restarted despite missing label" + FAIL=$((FAIL + 1)) + docker logs dg-test-guardian-label 2>&1 | tail -20 +fi + +# ── Test 3: Guardian logs confirm label filter ───────────────────── + +echo "" +echo "--- Test 3: Guardian logs show label filter ---" + +if docker logs dg-test-guardian-label 2>&1 | grep -q "AUTOHEAL_CONTAINER_LABEL=my-monitor"; then + echo "PASS: Guardian logs confirm custom label filter" + PASS=$((PASS + 1)) +else + echo "FAIL: Guardian logs do not show custom label" + FAIL=$((FAIL + 1)) + docker logs dg-test-guardian-label 2>&1 | head -10 +fi + +echo "" +echo "=== Custom Label Test Results: ${PASS} passed, ${FAIL} failed ===" +exit "$FAIL" diff --git a/tests/test-opt-out.sh b/tests/test-opt-out.sh new file mode 100644 index 0000000..0a955ae --- /dev/null +++ b/tests/test-opt-out.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# Test: Opt-out via autoheal=False label +# Verifies that containers with autoheal=False are NOT restarted by Guardian, +# even when unhealthy. + +set -euo pipefail + +PASS=0 +FAIL=0 +GUARDIAN_IMAGE="${GUARDIAN_IMAGE:-docker-guardian}" + +cleanup() { + echo "Cleaning up..." + docker rm -f dg-test-optout-target dg-test-optout-monitored dg-test-guardian-optout 2>/dev/null || true +} +trap cleanup EXIT + +echo "=== Test: Opt-Out via autoheal=False ===" + +cleanup + +# ── Test 1: Container with autoheal=False is NOT restarted ───────── + +echo "" +echo "--- Test 1: Container with autoheal=False is skipped ---" + +# Start an unhealthy container with autoheal=False +docker run -d \ + --name dg-test-optout-target \ + --label autoheal=False \ + --health-cmd="exit 1" \ + --health-interval=2s \ + --health-retries=1 \ + --health-start-period=0s \ + alpine:3.20 sh -c 'sleep 3600' + +# Wait for it to become unhealthy +echo "Waiting for opt-out container to become unhealthy..." +for i in $(seq 1 15); do + HC_STATE=$(docker inspect -f '{{.State.Health.Status}}' dg-test-optout-target 2>/dev/null || echo "unknown") + if [ "$HC_STATE" = "unhealthy" ]; then + break + fi + sleep 1 +done + +if [ "$HC_STATE" != "unhealthy" ]; then + echo "FAIL: Container did not become unhealthy within 15s" + FAIL=$((FAIL + 1)) + echo "" + echo "=== Opt-Out Test Results: ${PASS} passed, ${FAIL} failed ===" + exit "$FAIL" +fi + +STARTED_AT_BEFORE=$(docker inspect -f '{{.State.StartedAt}}' dg-test-optout-target 2>/dev/null) + +# Start Guardian +docker run -d \ + --name dg-test-guardian-optout \ + -e AUTOHEAL_CONTAINER_LABEL=all \ + -e AUTOHEAL_INTERVAL=3 \ + -e AUTOHEAL_GRACE_PERIOD=0 \ + -e AUTOHEAL_WATCHTOWER_COOLDOWN=0 \ + -v /var/run/docker.sock:/var/run/docker.sock \ + "$GUARDIAN_IMAGE" + +# Wait sufficient time — Guardian should NOT restart the opted-out container +echo "Waiting 15s to confirm Guardian does NOT restart the opted-out container..." +sleep 15 + +STARTED_AT_AFTER=$(docker inspect -f '{{.State.StartedAt}}' dg-test-optout-target 2>/dev/null) + +if [ "$STARTED_AT_BEFORE" = "$STARTED_AT_AFTER" ]; then + echo "PASS: Opted-out container was NOT restarted" + PASS=$((PASS + 1)) +else + echo "FAIL: Opted-out container was restarted despite autoheal=False" + FAIL=$((FAIL + 1)) + docker logs dg-test-guardian-optout 2>&1 | tail -20 +fi + +# ── Test 2: A monitored container IS still restarted ─────────────── + +echo "" +echo "--- Test 2: Monitored container (no opt-out) is still restarted ---" + +docker run -d \ + --name dg-test-optout-monitored \ + --health-cmd="exit 1" \ + --health-interval=2s \ + --health-retries=1 \ + --health-start-period=0s \ + alpine:3.20 sh -c 'sleep 3600' + +# Wait for unhealthy +for i in $(seq 1 15); do + HC_STATE=$(docker inspect -f '{{.State.Health.Status}}' dg-test-optout-monitored 2>/dev/null || echo "unknown") + if [ "$HC_STATE" = "unhealthy" ]; then + break + fi + sleep 1 +done + +STARTED_AT_MON=$(docker inspect -f '{{.State.StartedAt}}' dg-test-optout-monitored 2>/dev/null) + +echo "Waiting for Guardian to restart the monitored container (up to 20s)..." +RESTARTED=false +for i in $(seq 1 20); do + STARTED_NOW=$(docker inspect -f '{{.State.StartedAt}}' dg-test-optout-monitored 2>/dev/null || echo "unknown") + if [ "$STARTED_NOW" != "$STARTED_AT_MON" ] && [ "$STARTED_NOW" != "unknown" ]; then + echo "PASS: Monitored container was restarted (took ${i}s)" + PASS=$((PASS + 1)) + RESTARTED=true + break + fi + sleep 1 +done + +if [ "$RESTARTED" = false ]; then + echo "FAIL: Monitored container was NOT restarted within 20s" + FAIL=$((FAIL + 1)) + docker logs dg-test-guardian-optout 2>&1 | tail -20 +fi + +echo "" +echo "=== Opt-Out Test Results: ${PASS} passed, ${FAIL} failed ===" +exit "$FAIL"