From faed59fd9bbed27e4910b13ca694e911f689b80b Mon Sep 17 00:00:00 2001
From: Will-Luck <Will-Luck@users.noreply.github.com>
Date: Sun, 8 Feb 2026 20:38:29 +0000
Subject: [PATCH] v2.1.0: Circuit breaker, Prometheus metrics, event-driven
 architecture

Major enhancements to the Go rewrite:

- Circuit breaker with exponential backoff and restart budgets to prevent
  restart storms. Per-container action labels (restart/stop/notify/none).
- Prometheus /metrics endpoint with counters, gauges, and histograms for
  restarts, skips, notifications, and event processing.
- Event-driven Docker watcher with auto-reconnect (polling fallback for
  tests). Debouncing and real-time orchestration tracking.
- Notification rate limiting and retry with exponential backoff (3 attempts).
- Testability interfaces (docker.API, clock.Clock, notify.Notifier) with
  mock implementations and 25+ unit tests (60% guardian coverage).
- Config validation, WaitGroup goroutine management, defensive guards.
- CI: coverage reporting, govulncheck, Trivy scanning, 3 new acceptance
  tests (opt-out, circuit-breaker, custom-label), failure artifact capture.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/build.yml          |  73 +++++++-
 Dockerfile                           |   7 +
 cmd/guardian/main.go                 |   9 +
 go.mod                               |  11 +-
 go.sum                               |  21 +++
 internal/clock/clock.go              |  17 ++
 internal/config/config.go            |  78 ++++++++-
 internal/docker/containers.go        |   6 +
 internal/docker/interface.go         |  28 +++
 internal/docker/watcher.go           | 129 ++++++++++++++
 internal/guardian/dependency.go      |   6 +-
 internal/guardian/dependency_test.go | 222 ++++++++++++++++++++++++
 internal/guardian/guardian.go        | 207 ++++++++++++++++++++--
 internal/guardian/guardian_test.go   | 187 ++++++++++++++++++++
 internal/guardian/guards.go          |  28 +--
 internal/guardian/mock_test.go       | 179 +++++++++++++++++++
 internal/guardian/tracker.go         | 204 ++++++++++++++++++++++
 internal/guardian/tracker_test.go    | 224 ++++++++++++++++++++++++
 internal/guardian/unhealthy.go       |  77 ++++++++-
 internal/guardian/unhealthy_test.go  | 243 ++++++++++++++++++++++++++
 internal/metrics/prometheus.go       |  92 +++++++++-
 internal/notify/notifier.go          | 249 ++++++++++++++++++++++-----
 tests/test-all.sh                    |   3 +
 tests/test-circuit-breaker.sh        | 154 +++++++++++++++++
 tests/test-custom-label.sh           | 135 +++++++++++++++
 tests/test-opt-out.sh                | 127 ++++++++++++++
 26 files changed, 2635 insertions(+), 81 deletions(-)
 create mode 100644 internal/clock/clock.go
 create mode 100644 internal/docker/interface.go
 create mode 100644 internal/docker/watcher.go
 create mode 100644 internal/guardian/dependency_test.go
 create mode 100644 internal/guardian/guardian_test.go
 create mode 100644 internal/guardian/mock_test.go
 create mode 100644 internal/guardian/tracker.go
 create mode 100644 internal/guardian/tracker_test.go
 create mode 100644 internal/guardian/unhealthy_test.go
 create mode 100644 tests/test-circuit-breaker.sh
 create mode 100644 tests/test-custom-label.sh
 create mode 100644 tests/test-opt-out.sh

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index fa2f791..4654383 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -13,6 +13,7 @@ env:
 jobs:
   go:
     runs-on: ubuntu-latest
+    timeout-minutes: 10
     steps:
       - uses: actions/checkout@v4
 
@@ -29,12 +30,49 @@ jobs:
         with:
           version: latest
 
-      - name: Unit tests
-        run: go test -race -count=1 ./...
+      - name: Unit tests with coverage
+        run: go test -race -count=1 -coverprofile=coverage.out ./...
+
+      - name: Upload coverage artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage
+          path: coverage.out
+
+      - name: Display coverage summary
+        run: go tool cover -func=coverage.out | tail -1
+
+  security:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.24'
+
+      - name: Run govulncheck
+        run: |
+          go install golang.org/x/vuln/cmd/govulncheck@latest
+          govulncheck ./...
+
+      - name: Build image for scanning
+        run: docker build -t docker-guardian:scan .
+
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@master
+        with:
+          image-ref: docker-guardian:scan
+          format: table
+          exit-code: 0
+          severity: CRITICAL,HIGH
 
   test:
     needs: go
     runs-on: ubuntu-latest
+    timeout-minutes: 15
     steps:
       - uses: actions/checkout@v4
 
@@ -59,8 +97,37 @@ jobs:
       - name: Run notifications test
         run: GUARDIAN_IMAGE=docker-guardian bash tests/test-notifications.sh
 
+      - name: Run opt-out test
+        run: GUARDIAN_IMAGE=docker-guardian bash tests/test-opt-out.sh
+
+      - name: Run circuit breaker test
+        run: GUARDIAN_IMAGE=docker-guardian bash tests/test-circuit-breaker.sh
+
+      - name: Run custom label test
+        run: GUARDIAN_IMAGE=docker-guardian bash tests/test-custom-label.sh
+
+      - name: Capture docker logs on failure
+        if: failure()
+        run: |
+          echo "=== Docker containers ==="
+          docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" || true
+          echo ""
+          for c in $(docker ps -a --filter "name=dg-test" --format "{{.Names}}"); do
+            echo "=== Logs: $c ==="
+            docker logs "$c" 2>&1 | tail -50
+            echo ""
+          done
+
+      - name: Upload failure logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-failure-logs
+          path: /tmp/dg-test-logs/
+          if-no-files-found: ignore
+
   build:
-    needs: test
+    needs: [test, security]
     runs-on: ubuntu-latest
     if: github.event_name == 'push'
     permissions:
diff --git a/Dockerfile b/Dockerfile
index 90fc525..a689d80 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -26,6 +26,12 @@ ENV AUTOHEAL_CONTAINER_LABEL=autoheal \
     AUTOHEAL_WATCHTOWER_COOLDOWN=300 \
     AUTOHEAL_WATCHTOWER_SCOPE=all \
     AUTOHEAL_WATCHTOWER_EVENTS=orchestration \
+    AUTOHEAL_BACKOFF_MULTIPLIER=2 \
+    AUTOHEAL_BACKOFF_MAX=300 \
+    AUTOHEAL_BACKOFF_RESET_AFTER=600 \
+    AUTOHEAL_RESTART_BUDGET=5 \
+    AUTOHEAL_RESTART_WINDOW=300 \
+    METRICS_PORT=0 \
     DOCKER_SOCK=/var/run/docker.sock \
     CURL_TIMEOUT=30 \
     WEBHOOK_URL="" \
@@ -33,6 +39,7 @@ ENV AUTOHEAL_CONTAINER_LABEL=autoheal \
     APPRISE_URL="" \
     POST_RESTART_SCRIPT="" \
     NOTIFY_EVENTS="actions" \
+    NOTIFY_RATE_LIMIT=60 \
     NOTIFY_GOTIFY_URL="" \
     NOTIFY_GOTIFY_TOKEN="" \
     NOTIFY_DISCORD_WEBHOOK="" \
diff --git a/cmd/guardian/main.go b/cmd/guardian/main.go
index cc99773..e059316 100644
--- a/cmd/guardian/main.go
+++ b/cmd/guardian/main.go
@@ -13,6 +13,7 @@ import (
 	"github.com/Will-Luck/Docker-Guardian/internal/docker"
 	"github.com/Will-Luck/Docker-Guardian/internal/guardian"
 	"github.com/Will-Luck/Docker-Guardian/internal/logging"
+	"github.com/Will-Luck/Docker-Guardian/internal/metrics"
 	"github.com/Will-Luck/Docker-Guardian/internal/notify"
 )
 
@@ -24,6 +25,10 @@ func main() {
 	}
 
 	cfg := config.Load()
+	if err := cfg.Validate(); err != nil {
+		fmt.Fprintf(os.Stderr, "configuration error: %v\n", err)
+		os.Exit(1)
+	}
 	log := logging.New(cfg.LogJSON)
 
 	// Banner: plain stdout for acceptance test compatibility
@@ -48,6 +53,8 @@ func main() {
 	resolved := cfg.ResolvedNotifyEvents()
 	fmt.Printf("NOTIFY_EVENTS=%s (resolved: %s)\n", cfg.NotifyEvents, strings.Join(resolved, ","))
 
+	metrics.Serve(cfg.MetricsPort)
+
 	g := guardian.New(cfg, client, dispatcher, log)
 
 	if cfg.StartPeriod > 0 {
@@ -66,4 +73,6 @@ func main() {
 		log.Error("guardian exited with error", "error", err)
 		os.Exit(1)
 	}
+
+	dispatcher.Close()
 }
diff --git a/go.mod b/go.mod
index 649b185..3a86f12 100644
--- a/go.mod
+++ b/go.mod
@@ -11,6 +11,8 @@ require (
 
 require (
 	github.com/Microsoft/go-winio v0.6.2 // indirect
+	github.com/beorn7/perks v1.0.1 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/containerd/errdefs v1.0.0 // indirect
 	github.com/containerd/errdefs/pkg v0.3.0 // indirect
 	github.com/distribution/reference v0.6.0 // indirect
@@ -20,12 +22,19 @@ require (
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/moby/docker-image-spec v1.3.1 // indirect
+	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/opencontainers/go-digest v1.0.0 // indirect
 	github.com/opencontainers/image-spec v1.1.1 // indirect
+	github.com/prometheus/client_golang v1.23.2 // indirect
+	github.com/prometheus/client_model v0.6.2 // indirect
+	github.com/prometheus/common v0.66.1 // indirect
+	github.com/prometheus/procfs v0.16.1 // indirect
 	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
 	go.opentelemetry.io/otel v1.35.0 // indirect
 	go.opentelemetry.io/otel/metric v1.35.0 // indirect
 	go.opentelemetry.io/otel/trace v1.35.0 // indirect
-	golang.org/x/sys v0.33.0 // indirect
+	go.yaml.in/yaml/v2 v2.4.2 // indirect
+	golang.org/x/sys v0.35.0 // indirect
+	google.golang.org/protobuf v1.36.8 // indirect
 )
diff --git a/go.sum b/go.sum
index 5abb1bc..a1d9d94 100644
--- a/go.sum
+++ b/go.sum
@@ -1,5 +1,9 @@
 github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
 github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
 github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
 github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE=
@@ -29,12 +33,22 @@ github.com/moby/moby/api v1.53.0 h1:PihqG1ncw4W+8mZs69jlwGXdaYBeb5brF6BL7mPIS/w=
 github.com/moby/moby/api v1.53.0/go.mod h1:8mb+ReTlisw4pS6BRzCMts5M49W5M7bKt1cJy/YbAqc=
 github.com/moby/moby/client v0.2.2 h1:Pt4hRMCAIlyjL3cr8M5TrXCwKzguebPAc2do2ur7dEM=
 github.com/moby/moby/client v0.2.2/go.mod h1:2EkIPVNCqR05CMIzL1mfA07t0HvVUUOl85pasRz/GmQ=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
 github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
 github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
 github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
+github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
+github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
+github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
+github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
+github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
+github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
+github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
 github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
 github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
@@ -51,8 +65,15 @@ go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5J
 go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w=
 go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs=
 go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc=
+go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
+go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
 golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
 golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI=
+golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
+google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q=
diff --git a/internal/clock/clock.go b/internal/clock/clock.go
new file mode 100644
index 0000000..292687f
--- /dev/null
+++ b/internal/clock/clock.go
@@ -0,0 +1,17 @@
+package clock
+
+import "time"
+
+// Clock abstracts time operations for testability.
+type Clock interface {
+	Now() time.Time
+	After(d time.Duration) <-chan time.Time
+	Since(t time.Time) time.Duration
+}
+
+// Real uses the standard library time functions.
+type Real struct{}
+
+func (Real) Now() time.Time                         { return time.Now() }
+func (Real) After(d time.Duration) <-chan time.Time { return time.After(d) }
+func (Real) Since(t time.Time) time.Duration        { return time.Since(t) }
diff --git a/internal/config/config.go b/internal/config/config.go
index 98bcdce..1bf5262 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -1,7 +1,9 @@
 package config
 
 import (
+	"errors"
 	"fmt"
+	"net/url"
 	"os"
 	"strconv"
 	"strings"
@@ -31,11 +33,19 @@ type Config struct {
 	WatchtowerScope      string // "all" or "affected"
 	WatchtowerEvents     string // "orchestration" or "all"
 
+	// Circuit breaker / backoff
+	BackoffMultiplier float64
+	BackoffMax        int // seconds
+	BackoffResetAfter int // seconds
+	RestartBudget     int
+	RestartWindow     int // seconds
+
 	// Post-restart script
 	PostRestartScript string
 
 	// Notification events
-	NotifyEvents string
+	NotifyEvents    string
+	NotifyRateLimit int // seconds (0 = unlimited)
 
 	// Notification services
 	WebhookURL     string
@@ -63,6 +73,9 @@ type Config struct {
 	EmailUser string
 	EmailPass string
 
+	// Metrics
+	MetricsPort int
+
 	// Logging
 	LogJSON bool
 }
@@ -89,8 +102,15 @@ func Load() *Config {
 		WatchtowerScope:      envStr("AUTOHEAL_WATCHTOWER_SCOPE", "all"),
 		WatchtowerEvents:     envStr("AUTOHEAL_WATCHTOWER_EVENTS", "orchestration"),
 
+		BackoffMultiplier: envFloat("AUTOHEAL_BACKOFF_MULTIPLIER", 2),
+		BackoffMax:        envInt("AUTOHEAL_BACKOFF_MAX", 300),
+		BackoffResetAfter: envInt("AUTOHEAL_BACKOFF_RESET_AFTER", 600),
+		RestartBudget:     envInt("AUTOHEAL_RESTART_BUDGET", 5),
+		RestartWindow:     envInt("AUTOHEAL_RESTART_WINDOW", 300),
+
 		PostRestartScript: envStr("POST_RESTART_SCRIPT", ""),
 		NotifyEvents:      envStr("NOTIFY_EVENTS", "actions"),
+		NotifyRateLimit:   envInt("NOTIFY_RATE_LIMIT", 60),
 
 		WebhookURL:     envStr("WEBHOOK_URL", ""),
 		WebhookJSONKey: envStr("WEBHOOK_JSON_KEY", "text"),
@@ -117,6 +137,8 @@ func Load() *Config {
 		EmailUser: envStr("NOTIFY_EMAIL_USER", ""),
 		EmailPass: envStr("NOTIFY_EMAIL_PASS", ""),
 
+		MetricsPort: envInt("METRICS_PORT", 0),
+
 		LogJSON: envBool("LOG_JSON", false),
 	}
 }
@@ -137,6 +159,11 @@ func (c *Config) PrintBanner() {
 	fmt.Println("AUTOHEAL_WATCHTOWER_COOLDOWN=" + strconv.Itoa(c.WatchtowerCooldown))
 	fmt.Println("AUTOHEAL_WATCHTOWER_SCOPE=" + c.WatchtowerScope)
 	fmt.Println("AUTOHEAL_WATCHTOWER_EVENTS=" + c.WatchtowerEvents)
+	fmt.Printf("AUTOHEAL_BACKOFF_MULTIPLIER=%g\n", c.BackoffMultiplier)
+	fmt.Println("AUTOHEAL_BACKOFF_MAX=" + strconv.Itoa(c.BackoffMax))
+	fmt.Println("AUTOHEAL_BACKOFF_RESET_AFTER=" + strconv.Itoa(c.BackoffResetAfter))
+	fmt.Println("AUTOHEAL_RESTART_BUDGET=" + strconv.Itoa(c.RestartBudget))
+	fmt.Println("AUTOHEAL_RESTART_WINDOW=" + strconv.Itoa(c.RestartWindow))
 }
 
 // ResolvedNotifyEvents returns the normalised event categories.
@@ -170,6 +197,43 @@ func (c *Config) ResolvedNotifyEvents() []string {
 	return result
 }
 
+// Validate checks configuration for invalid or dangerous values.
+func (c *Config) Validate() error {
+	var errs []error
+	if c.Interval <= 0 {
+		errs = append(errs, fmt.Errorf("AUTOHEAL_INTERVAL must be > 0, got %d", c.Interval))
+	}
+	if c.GracePeriod < 0 {
+		errs = append(errs, fmt.Errorf("AUTOHEAL_GRACE_PERIOD must be >= 0, got %d", c.GracePeriod))
+	}
+	if c.DefaultStopTimeout < 0 {
+		errs = append(errs, fmt.Errorf("AUTOHEAL_DEFAULT_STOP_TIMEOUT must be >= 0, got %d", c.DefaultStopTimeout))
+	}
+	if c.WatchtowerScope != "all" && c.WatchtowerScope != "affected" {
+		errs = append(errs, fmt.Errorf("AUTOHEAL_WATCHTOWER_SCOPE must be \"all\" or \"affected\", got %q", c.WatchtowerScope))
+	}
+	if c.WatchtowerEvents != "orchestration" && c.WatchtowerEvents != "all" {
+		errs = append(errs, fmt.Errorf("AUTOHEAL_WATCHTOWER_EVENTS must be \"orchestration\" or \"all\", got %q", c.WatchtowerEvents))
+	}
+	for _, u := range []struct {
+		name, val string
+	}{
+		{"WEBHOOK_URL", c.WebhookURL},
+		{"APPRISE_URL", c.AppriseURL},
+		{"NOTIFY_GOTIFY_URL", c.GotifyURL},
+		{"NOTIFY_DISCORD_WEBHOOK", c.DiscordWebhook},
+		{"NOTIFY_SLACK_WEBHOOK", c.SlackWebhook},
+		{"NOTIFY_LUNASEA_WEBHOOK", c.LunaSeaWebhook},
+	} {
+		if u.val != "" {
+			if _, err := url.Parse(u.val); err != nil {
+				errs = append(errs, fmt.Errorf("%s is not a valid URL: %w", u.name, err))
+			}
+		}
+	}
+	return errors.Join(errs...)
+}
+
 func envStr(key, def string) string {
 	if v := os.Getenv(key); v != "" {
 		return v
@@ -189,6 +253,18 @@ func envInt(key string, def int) int {
 	return n
 }
 
+func envFloat(key string, def float64) float64 {
+	v := os.Getenv(key)
+	if v == "" {
+		return def
+	}
+	f, err := strconv.ParseFloat(v, 64)
+	if err != nil {
+		return def
+	}
+	return f
+}
+
 func envBool(key string, def bool) bool {
 	v := os.Getenv(key)
 	if v == "" {
diff --git a/internal/docker/containers.go b/internal/docker/containers.go
index b0c78f3..0017bfe 100644
--- a/internal/docker/containers.go
+++ b/internal/docker/containers.go
@@ -73,6 +73,12 @@ func (c *Client) StartContainer(ctx context.Context, id string) error {
 	return err
 }
 
+// StopContainer stops a running container with the given timeout.
+func (c *Client) StopContainer(ctx context.Context, id string, timeout int) error {
+	_, err := c.api.ContainerStop(ctx, id, client.ContainerStopOptions{Timeout: &timeout})
+	return err
+}
+
 // ContainerStatus returns the current status string of a container.
 func (c *Client) ContainerStatus(ctx context.Context, id string) (string, error) {
 	info, err := c.api.ContainerInspect(ctx, id, client.ContainerInspectOptions{})
diff --git a/internal/docker/interface.go b/internal/docker/interface.go
new file mode 100644
index 0000000..731a735
--- /dev/null
+++ b/internal/docker/interface.go
@@ -0,0 +1,28 @@
+package docker
+
+import (
+	"context"
+	"time"
+
+	"github.com/moby/moby/api/types/container"
+	"github.com/moby/moby/api/types/events"
+)
+
+// API defines the subset of Docker operations used by Guardian.
+// Implemented by Client for production, and by mocks for testing.
+type API interface {
+	UnhealthyContainers(ctx context.Context, label string, onlyRunning bool) ([]container.Summary, error)
+	ExitedContainers(ctx context.Context) ([]container.Summary, error)
+	RunningContainers(ctx context.Context) ([]container.Summary, error)
+	InspectContainer(ctx context.Context, id string) (container.InspectResponse, error)
+	RestartContainer(ctx context.Context, id string, timeout int) error
+	StartContainer(ctx context.Context, id string) error
+	StopContainer(ctx context.Context, id string, timeout int) error
+	ContainerStatus(ctx context.Context, id string) (string, error)
+	ContainerFinishedAt(ctx context.Context, id string) (time.Time, error)
+	ContainerEvents(ctx context.Context, since, until time.Time, orchestrationOnly bool) ([]events.Message, error)
+	Close() error
+}
+
+// Verify Client implements API at compile time.
+var _ API = (*Client)(nil)
diff --git a/internal/docker/watcher.go b/internal/docker/watcher.go
new file mode 100644
index 0000000..60fcecd
--- /dev/null
+++ b/internal/docker/watcher.go
@@ -0,0 +1,129 @@
+package docker
+
+import (
+	"context"
+	"time"
+
+	"github.com/moby/moby/api/types/events"
+	"github.com/moby/moby/client"
+)
+
+// ContainerEvent represents a processed Docker container event.
+type ContainerEvent struct {
+	ContainerID   string
+	ContainerName string
+	Action        string // "health_status", "die", "start", "destroy", "create"
+	HealthStatus  string // "unhealthy", "healthy" (only for health_status events)
+	Timestamp     time.Time
+}
+
+// WatcherAPI defines the interface for watching Docker events.
+type WatcherAPI interface {
+	Watch(ctx context.Context) <-chan ContainerEvent
+}
+
+// Watcher subscribes to the Docker event stream and emits ContainerEvents.
+type Watcher struct {
+	api            *client.Client
+	reconnectMax   time.Duration
+	livenessWindow time.Duration
+}
+
+// NewWatcher creates a Watcher connected to the Docker event stream.
+func NewWatcher(c *Client) *Watcher {
+	return &Watcher{
+		api:            c.api,
+		reconnectMax:   30 * time.Second,
+		livenessWindow: 60 * time.Second,
+	}
+}
+
+// Watch starts watching Docker events. It reconnects automatically on disconnect.
+// Returns a channel of ContainerEvents. The channel is closed when ctx is cancelled.
+func (w *Watcher) Watch(ctx context.Context) <-chan ContainerEvent {
+	ch := make(chan ContainerEvent, 64)
+
+	go func() {
+		defer close(ch)
+		backoff := time.Second
+
+		for {
+			if ctx.Err() != nil {
+				return
+			}
+
+			w.streamEvents(ctx, ch)
+
+			// If we get here, stream disconnected. Reconnect with backoff.
+			if ctx.Err() != nil {
+				return
+			}
+
+			select {
+			case <-time.After(backoff):
+			case <-ctx.Done():
+				return
+			}
+
+			// Exponential backoff: 1s → 2s → 4s → 8s → ... → 30s max
+			backoff *= 2
+			if backoff > w.reconnectMax {
+				backoff = w.reconnectMax
+			}
+		}
+	}()
+
+	return ch
+}
+
+func (w *Watcher) streamEvents(ctx context.Context, ch chan<- ContainerEvent) {
+	opts := client.EventsListOptions{
+		Filters: make(client.Filters).
+			Add("type", "container").
+			Add("event", "health_status", "die", "start", "destroy", "create"),
+	}
+
+	result := w.api.Events(ctx, opts)
+
+	// Reset backoff on successful connection
+	for {
+		select {
+		case msg, ok := <-result.Messages:
+			if !ok {
+				return // stream closed
+			}
+			evt := parseEvent(msg)
+			if evt != nil {
+				select {
+				case ch <- *evt:
+				case <-ctx.Done():
+					return
+				}
+			}
+		case <-result.Err:
+			return // stream error — caller will reconnect
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+func parseEvent(msg events.Message) *ContainerEvent {
+	evt := &ContainerEvent{
+		ContainerID:   msg.Actor.ID,
+		ContainerName: msg.Actor.Attributes["name"],
+		Action:        string(msg.Action),
+		Timestamp:     time.Unix(msg.Time, msg.TimeNano%1e9),
+	}
+
+	// Docker sends health_status events as "health_status: unhealthy" or "health_status: healthy"
+	if action := string(msg.Action); len(action) > 15 && action[:14] == "health_status:" {
+		evt.Action = "health_status"
+		evt.HealthStatus = action[15:] // skip "health_status: "
+	} else if action == "health_status" {
+		// Some Docker versions use attributes
+		evt.HealthStatus = msg.Actor.Attributes["health_status"]
+	}
+
+	return evt
+}
diff --git a/internal/guardian/dependency.go b/internal/guardian/dependency.go
index f952385..cbd824b 100644
--- a/internal/guardian/dependency.go
+++ b/internal/guardian/dependency.go
@@ -46,7 +46,7 @@ func (g *Guardian) checkDependencyOrphans(ctx context.Context) {
 			continue
 		}
 
-		now := time.Now().Format("02-01-2006 15:04:05")
+		now := g.clock.Now().Format("02-01-2006 15:04:05")
 		fmt.Printf("%s Container %s (%s) exited (code %d, orphaned dependent) - parent %s is running\n",
 			now, name, shortID, exitCode, parentID[:12])
 
@@ -77,10 +77,10 @@ func (g *Guardian) checkDependencyOrphans(ctx context.Context) {
 		fmt.Printf("%s Starting orphaned dependent %s (%s)...\n", now, name, shortID)
 		if err := g.docker.StartContainer(ctx, c.ID); err != nil {
 			g.log.Error("failed to start container", "container", name, "id", shortID, "error", err)
-			g.dispatcher.Action(fmt.Sprintf("Container %s (%s) orphaned (parent running). Failed to start!", name, shortID))
+			g.notifier.Action(fmt.Sprintf("Container %s (%s) orphaned (parent running). Failed to start!", name, shortID))
 		} else {
 			fmt.Printf("%s Successfully started %s (%s)\n", now, name, shortID)
-			g.dispatcher.Action(fmt.Sprintf("Container %s (%s) orphaned (parent running). Successfully started!", name, shortID))
+			g.notifier.Action(fmt.Sprintf("Container %s (%s) orphaned (parent running). Successfully started!", name, shortID))
 		}
 
 		g.runPostRestartScript(name, shortID, "orphaned", 0)
diff --git a/internal/guardian/dependency_test.go b/internal/guardian/dependency_test.go
new file mode 100644
index 0000000..e6fc513
--- /dev/null
+++ b/internal/guardian/dependency_test.go
@@ -0,0 +1,222 @@
+package guardian
+
+import (
+	"context"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/Will-Luck/Docker-Guardian/internal/config"
+	"github.com/Will-Luck/Docker-Guardian/internal/logging"
+	"github.com/moby/moby/api/types/container"
+)
+
+func TestCheckDependencyOrphans_StartsOrphan(t *testing.T) {
+	cfg := &config.Config{
+		MonitorDependencies:  true,
+		DependencyStartDelay: 0,
+		WatchtowerCooldown:   0,
+		GracePeriod:          0,
+		BackupLabel:          "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	parentID := "parent1234567890abcdef"
+
+	dock.exitedContainers = []container.Summary{
+		{ID: "orphan01234567890abcdef"},
+	}
+	dock.inspectResults["orphan01234567890abcdef"] = container.InspectResponse{
+		Name: "/orphan-app",
+		HostConfig: &container.HostConfig{
+			NetworkMode: container.NetworkMode("container:" + parentID),
+		},
+		Config: &container.Config{
+			Labels: map[string]string{},
+		},
+		State: &container.State{
+			ExitCode: 1,
+		},
+	}
+	dock.statusResults[parentID] = "running"
+	dock.statusResults["orphan01234567890abcdef"] = "exited"
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkDependencyOrphans(context.Background())
+
+	if len(dock.startCalls) != 1 {
+		t.Fatalf("expected 1 start call, got %d", len(dock.startCalls))
+	}
+	if dock.startCalls[0] != "orphan01234567890abcdef" {
+		t.Errorf("started wrong container: %s", dock.startCalls[0])
+	}
+	if len(notif.actions) != 1 || !strings.Contains(notif.actions[0], "Successfully started") {
+		t.Errorf("expected success notification, got %v", notif.actions)
+	}
+}
+
+func TestCheckDependencyOrphans_SkipsNonDependents(t *testing.T) {
+	cfg := &config.Config{
+		MonitorDependencies:  true,
+		DependencyStartDelay: 0,
+		WatchtowerCooldown:   0,
+		GracePeriod:          0,
+		BackupLabel:          "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	dock.exitedContainers = []container.Summary{
+		{ID: "standalone1234567890ab"},
+	}
+	dock.inspectResults["standalone1234567890ab"] = container.InspectResponse{
+		Name: "/standalone",
+		HostConfig: &container.HostConfig{
+			NetworkMode: "bridge",
+		},
+		Config: &container.Config{},
+		State:  &container.State{},
+	}
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkDependencyOrphans(context.Background())
+
+	if len(dock.startCalls) != 0 {
+		t.Error("should not start non-dependent container")
+	}
+}
+
+func TestCheckDependencyOrphans_SkipsWhenParentNotRunning(t *testing.T) {
+	cfg := &config.Config{
+		MonitorDependencies:  true,
+		DependencyStartDelay: 0,
+		WatchtowerCooldown:   0,
+		GracePeriod:          0,
+		BackupLabel:          "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	parentID := "parent1234567890abcdef"
+
+	dock.exitedContainers = []container.Summary{
+		{ID: "orphan01234567890abcdef"},
+	}
+	dock.inspectResults["orphan01234567890abcdef"] = container.InspectResponse{
+		Name: "/orphan-app",
+		HostConfig: &container.HostConfig{
+			NetworkMode: container.NetworkMode("container:" + parentID),
+		},
+		Config: &container.Config{},
+		State:  &container.State{},
+	}
+	dock.statusResults[parentID] = "exited" // Parent not running
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkDependencyOrphans(context.Background())
+
+	if len(dock.startCalls) != 0 {
+		t.Error("should not start orphan when parent is not running")
+	}
+}
+
+func TestCheckDependencyOrphans_Disabled(t *testing.T) {
+	cfg := &config.Config{
+		MonitorDependencies: false,
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkDependencyOrphans(context.Background())
+
+	if len(dock.startCalls) != 0 {
+		t.Error("should not make any calls when dependencies disabled")
+	}
+}
+
+func TestCheckDependencyOrphans_SkipsAutoRecovered(t *testing.T) {
+	cfg := &config.Config{
+		MonitorDependencies:  true,
+		DependencyStartDelay: 0,
+		WatchtowerCooldown:   0,
+		GracePeriod:          0,
+		BackupLabel:          "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	parentID := "parent1234567890abcdef"
+
+	dock.exitedContainers = []container.Summary{
+		{ID: "orphan01234567890abcdef"},
+	}
+	dock.inspectResults["orphan01234567890abcdef"] = container.InspectResponse{
+		Name: "/orphan-app",
+		HostConfig: &container.HostConfig{
+			NetworkMode: container.NetworkMode("container:" + parentID),
+		},
+		Config: &container.Config{
+			Labels: map[string]string{},
+		},
+		State: &container.State{
+			ExitCode: 0,
+		},
+	}
+	dock.statusResults[parentID] = "running"
+	// Container has auto-recovered by the time we re-check
+	dock.statusResults["orphan01234567890abcdef"] = "running"
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkDependencyOrphans(context.Background())
+
+	if len(dock.startCalls) != 0 {
+		t.Error("should not start auto-recovered container")
+	}
+}
diff --git a/internal/guardian/guardian.go b/internal/guardian/guardian.go
index 3d0e6a5..0e74e97 100644
--- a/internal/guardian/guardian.go
+++ b/internal/guardian/guardian.go
@@ -2,8 +2,10 @@ package guardian
 
 import (
 	"context"
+	"sync"
 	"time"
 
+	"github.com/Will-Luck/Docker-Guardian/internal/clock"
 	"github.com/Will-Luck/Docker-Guardian/internal/config"
 	"github.com/Will-Luck/Docker-Guardian/internal/docker"
 	"github.com/Will-Luck/Docker-Guardian/internal/logging"
@@ -11,32 +13,95 @@ import (
 	"github.com/moby/moby/api/types/events"
 )
 
-// Guardian orchestrates the main monitoring loop.
+// Guardian orchestrates container health monitoring.
 type Guardian struct {
-	cfg        *config.Config
-	docker     *docker.Client
-	dispatcher *notify.Dispatcher
-	log        *logging.Logger
-	cycle      int
+	cfg      *config.Config
+	docker   docker.API
+	notifier notify.Notifier
+	log      *logging.Logger
+	clock    clock.Clock
 
-	// Per-cycle caches (reset at start of each cycle)
+	// Circuit breaker
+	tracker *RestartTracker
+
+	// Event debouncing
+	debounceMu     sync.Mutex
+	debounceTimers map[string]*time.Timer
+
+	// Orchestration tracking (event-driven replacement for per-cycle cache)
+	orchestrationMu     sync.Mutex
+	orchestrationEvents map[string]time.Time // container name → latest event time
+
+	// Per-cycle caches (used during full scans)
 	orchestratorEvents []events.Message
 	orchestratorCached bool
 	backupRunning      *bool
+	cycle              int
 }
 
 // New creates a Guardian instance.
-func New(cfg *config.Config, client *docker.Client, dispatcher *notify.Dispatcher, log *logging.Logger) *Guardian {
+func New(cfg *config.Config, client docker.API, notifier notify.Notifier, log *logging.Logger) *Guardian {
+	clk := clock.Real{}
+	tcfg := TrackerConfig{
+		BackoffMultiplier: cfg.BackoffMultiplier,
+		BackoffMax:        time.Duration(cfg.BackoffMax) * time.Second,
+		BackoffResetAfter: time.Duration(cfg.BackoffResetAfter) * time.Second,
+		RestartBudget:     cfg.RestartBudget,
+		RestartWindow:     time.Duration(cfg.RestartWindow) * time.Second,
+	}
 	return &Guardian{
-		cfg:        cfg,
-		docker:     client,
-		dispatcher: dispatcher,
-		log:        log,
+		cfg:                 cfg,
+		docker:              client,
+		notifier:            notifier,
+		log:                 log,
+		clock:               clk,
+		tracker:             NewRestartTracker(tcfg, clk),
+		debounceTimers:      make(map[string]*time.Timer),
+		orchestrationEvents: make(map[string]time.Time),
 	}
 }
 
-// Run starts the main monitoring loop, returning when the context is cancelled.
+// NewWithClock creates a Guardian with a custom clock (for testing).
+func NewWithClock(cfg *config.Config, client docker.API, notifier notify.Notifier, log *logging.Logger, clk clock.Clock) *Guardian {
+	g := New(cfg, client, notifier, log)
+	g.clock = clk
+	g.tracker.clock = clk
+	return g
+}
+
+// Run starts the event-driven monitoring loop.
+// If a Watcher is available (via docker.Client), it uses the event stream.
+// Otherwise, it falls back to the polling loop for compatibility.
 func (g *Guardian) Run(ctx context.Context) error {
+	// Check if we can get a watcher
+	if client, ok := g.docker.(*docker.Client); ok {
+		return g.runEventDriven(ctx, client)
+	}
+	// Fallback to polling (for tests with mock docker)
+	return g.runPolling(ctx)
+}
+
+func (g *Guardian) runEventDriven(ctx context.Context, client *docker.Client) error {
+	watcher := docker.NewWatcher(client)
+	eventCh := watcher.Watch(ctx)
+
+	// Initial full scan on startup
+	g.fullScan(ctx)
+
+	for {
+		select {
+		case evt, ok := <-eventCh:
+			if !ok {
+				return nil // watcher closed (context cancelled)
+			}
+			g.handleEvent(ctx, evt)
+		case <-ctx.Done():
+			return nil
+		}
+	}
+}
+
+func (g *Guardian) runPolling(ctx context.Context) error {
 	for {
 		g.cycle++
 		g.orchestratorCached = false
@@ -52,3 +117,119 @@ func (g *Guardian) Run(ctx context.Context) error {
 		}
 	}
 }
+
+// fullScan does a complete check of all containers.
+// Called on startup and after event stream reconnection.
+func (g *Guardian) fullScan(ctx context.Context) {
+	g.cycle++
+	g.orchestratorCached = false
+	g.backupRunning = nil
+
+	g.checkUnhealthy(ctx)
+	g.checkDependencyOrphans(ctx)
+}
+
+// handleEvent processes a single Docker event with debouncing.
+func (g *Guardian) handleEvent(ctx context.Context, evt docker.ContainerEvent) {
+	switch evt.Action {
+	case "health_status":
+		if evt.HealthStatus == "unhealthy" {
+			g.debounce(ctx, evt.ContainerID, func() {
+				g.checkContainerByID(ctx, evt.ContainerID)
+			})
+		} else if evt.HealthStatus == "healthy" {
+			g.tracker.Reset(evt.ContainerID)
+		}
+
+	case "die":
+		g.debounce(ctx, "dep:"+evt.ContainerID, func() {
+			g.checkOrphanedDependents(ctx, evt.ContainerID)
+		})
+
+	case "create", "destroy":
+		g.recordOrchestrationActivity(evt)
+
+	case "start":
+		// No action needed — tracked for potential future use
+	}
+}
+
+// debounce ensures only one action per container within the debounce window.
+func (g *Guardian) debounce(ctx context.Context, key string, fn func()) {
+	debounceWindow := time.Duration(g.cfg.Interval) * time.Second
+	if debounceWindow <= 0 {
+		debounceWindow = 5 * time.Second
+	}
+
+	g.debounceMu.Lock()
+	if timer, ok := g.debounceTimers[key]; ok {
+		timer.Stop()
+	}
+	g.debounceTimers[key] = time.AfterFunc(debounceWindow, func() {
+		if ctx.Err() == nil {
+			fn()
+		}
+		g.debounceMu.Lock()
+		delete(g.debounceTimers, key)
+		g.debounceMu.Unlock()
+	})
+	g.debounceMu.Unlock()
+}
+
+// checkContainerByID inspects and potentially restarts a single container.
+func (g *Guardian) checkContainerByID(ctx context.Context, containerID string) {
+	// Use the regular unhealthy check — it re-queries and filters
+	g.backupRunning = nil
+	g.orchestratorCached = false
+	g.checkUnhealthy(ctx)
+}
+
+// checkOrphanedDependents checks if any dependents of the given container need starting.
+func (g *Guardian) checkOrphanedDependents(ctx context.Context, _ string) {
+	g.backupRunning = nil
+	g.orchestratorCached = false
+	g.checkDependencyOrphans(ctx)
+}
+
+// recordOrchestrationActivity records a create/destroy event for orchestration tracking.
+func (g *Guardian) recordOrchestrationActivity(evt docker.ContainerEvent) {
+	g.orchestrationMu.Lock()
+	g.orchestrationEvents[evt.ContainerName] = evt.Timestamp
+	g.orchestrationMu.Unlock()
+
+	// Clean up old entries
+	go g.pruneOrchestrationEvents()
+}
+
+func (g *Guardian) pruneOrchestrationEvents() {
+	cutoff := g.clock.Now().Add(-time.Duration(g.cfg.WatchtowerCooldown) * time.Second)
+	g.orchestrationMu.Lock()
+	for name, ts := range g.orchestrationEvents {
+		if ts.Before(cutoff) {
+			delete(g.orchestrationEvents, name)
+		}
+	}
+	g.orchestrationMu.Unlock()
+}
+
+// EventStreamConnected returns whether we're using event-driven mode.
+// Used by metrics.
+func (g *Guardian) EventStreamConnected() bool {
+	_, ok := g.docker.(*docker.Client)
+	return ok
+}
+
+// Tracker returns the restart tracker (for metrics).
+func (g *Guardian) Tracker() *RestartTracker {
+	return g.tracker
+}
+
+// UnhealthyCount returns the count from the last check (for metrics).
+// This is a simple accessor — the real metric instrumentation happens in Phase 5.
+func (g *Guardian) UnhealthyCount(ctx context.Context) int {
+	containers, err := g.docker.UnhealthyContainers(ctx, g.cfg.ContainerLabel, g.cfg.OnlyMonitorRunning)
+	if err != nil {
+		return 0
+	}
+	return len(containers)
+}
diff --git a/internal/guardian/guardian_test.go b/internal/guardian/guardian_test.go
new file mode 100644
index 0000000..ba62aaa
--- /dev/null
+++ b/internal/guardian/guardian_test.go
@@ -0,0 +1,187 @@
+package guardian
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/Will-Luck/Docker-Guardian/internal/config"
+	"github.com/Will-Luck/Docker-Guardian/internal/logging"
+	"github.com/moby/moby/api/types/container"
+	"github.com/moby/moby/api/types/events"
+)
+
+func newTestGuardian(cfg *config.Config, dock *mockDocker, notif *mockNotifier, clk *mockClock) *Guardian {
+	return &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+}
+
+func TestShouldSkip_NoGuards(t *testing.T) {
+	cfg := &config.Config{
+		WatchtowerCooldown: 0,
+		GracePeriod:        0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	g := newTestGuardian(cfg, dock, notif, clk)
+	if g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) {
+		t.Error("should not skip when no guards are configured")
+	}
+}
+
+func TestShouldSkip_GracePeriod(t *testing.T) {
+	now := time.Date(2025, 1, 1, 12, 0, 0, 0, time.UTC)
+	clk := newMockClock(now)
+
+	cfg := &config.Config{
+		GracePeriod:        60,
+		WatchtowerCooldown: 0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+
+	// Container stopped 30s ago — within grace period
+	dock.finishedAtResults["abcdef123456"] = now.Add(-30 * time.Second)
+
+	g := newTestGuardian(cfg, dock, notif, clk)
+	if !g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) {
+		t.Error("should skip container within grace period")
+	}
+	if len(notif.skips) != 1 {
+		t.Errorf("expected 1 skip notification, got %d", len(notif.skips))
+	}
+}
+
+func TestShouldSkip_GracePeriodExpired(t *testing.T) {
+	now := time.Date(2025, 1, 1, 12, 0, 0, 0, time.UTC)
+	clk := newMockClock(now)
+
+	cfg := &config.Config{
+		GracePeriod:        60,
+		WatchtowerCooldown: 0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+
+	// Container stopped 90s ago — outside grace period
+	dock.finishedAtResults["abcdef123456"] = now.Add(-90 * time.Second)
+
+	g := newTestGuardian(cfg, dock, notif, clk)
+	if g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) {
+		t.Error("should not skip container outside grace period")
+	}
+}
+
+func TestShouldSkip_OrchestrationAll(t *testing.T) {
+	cfg := &config.Config{
+		WatchtowerCooldown: 300,
+		WatchtowerScope:    "all",
+		WatchtowerEvents:   "orchestration",
+		GracePeriod:        0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	dock.containerEvents = []events.Message{{Action: "create"}}
+
+	g := newTestGuardian(cfg, dock, notif, clk)
+	if !g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) {
+		t.Error("should skip when orchestration events exist with scope=all")
+	}
+}
+
+func TestShouldSkip_OrchestrationAffected(t *testing.T) {
+	cfg := &config.Config{
+		WatchtowerCooldown: 300,
+		WatchtowerScope:    "affected",
+		WatchtowerEvents:   "orchestration",
+		GracePeriod:        0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	// Event for a different container
+	dock.containerEvents = []events.Message{
+		{Action: "create", Actor: events.Actor{Attributes: map[string]string{"name": "other-container"}}},
+	}
+
+	g := newTestGuardian(cfg, dock, notif, clk)
+
+	// test-container should NOT be skipped — it's not in the events
+	if g.shouldSkip(context.Background(), "abcdef123456", "test-container", nil) {
+		t.Error("should not skip unaffected container with scope=affected")
+	}
+
+	// Reset cache for next check
+	g.orchestratorCached = false
+
+	// Now check the affected container
+	if !g.shouldSkip(context.Background(), "bbbbbb123456", "other-container", nil) {
+		t.Error("should skip affected container with scope=affected")
+	}
+}
+
+func TestShouldSkip_BackupRunning(t *testing.T) {
+	cfg := &config.Config{
+		WatchtowerCooldown: 0,
+		GracePeriod:        0,
+		BackupLabel:        "docker-volume-backup.stop-during-backup",
+		BackupContainer:    "my-backup",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	// Backup container running
+	dock.runningContainers = []container.Summary{
+		{Names: []string{"/my-backup"}},
+	}
+
+	labels := map[string]string{
+		"docker-volume-backup.stop-during-backup": "true",
+	}
+
+	g := newTestGuardian(cfg, dock, notif, clk)
+	if !g.shouldSkip(context.Background(), "abcdef123456", "test-container", labels) {
+		t.Error("should skip backup-managed container while backup is running")
+	}
+}
+
+func TestShouldSkip_BackupNotRunning(t *testing.T) {
+	cfg := &config.Config{
+		WatchtowerCooldown: 0,
+		GracePeriod:        0,
+		BackupLabel:        "docker-volume-backup.stop-during-backup",
+		BackupContainer:    "my-backup",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	// No backup container running
+	dock.runningContainers = []container.Summary{}
+
+	labels := map[string]string{
+		"docker-volume-backup.stop-during-backup": "true",
+	}
+
+	g := newTestGuardian(cfg, dock, notif, clk)
+	if g.shouldSkip(context.Background(), "abcdef123456", "test-container", labels) {
+		t.Error("should not skip when backup is not running")
+	}
+}
diff --git a/internal/guardian/guards.go b/internal/guardian/guards.go
index e397479..9fa2b25 100644
--- a/internal/guardian/guards.go
+++ b/internal/guardian/guards.go
@@ -6,6 +6,8 @@ import (
 	"os/exec"
 	"strings"
 	"time"
+
+	"github.com/Will-Luck/Docker-Guardian/internal/metrics"
 )
 
 // shouldSkip returns true if this container should be skipped due to
@@ -20,18 +22,20 @@ func (g *Guardian) shouldSkip(ctx context.Context, containerID, containerName st
 
 		if g.cfg.WatchtowerScope == "affected" {
 			if g.isContainerInOrchestration(cleanName) {
-				now := time.Now().Format("02-01-2006 15:04:05")
+				now := g.clock.Now().Format("02-01-2006 15:04:05")
 				fmt.Printf("%s Container %s (%s) affected by orchestration activity within %ds - skipping\n",
 					now, cleanName, shortID, g.cfg.WatchtowerCooldown)
-				g.dispatcher.Skip(fmt.Sprintf("Container %s (%s) skipped - orchestration activity", cleanName, shortID))
+				g.notifier.Skip(fmt.Sprintf("Container %s (%s) skipped - orchestration activity", cleanName, shortID))
+				metrics.SkipsTotal.WithLabelValues(cleanName, "orchestration").Inc()
 				return true
 			}
 		} else {
 			if g.isOrchestratorActive() {
-				now := time.Now().Format("02-01-2006 15:04:05")
+				now := g.clock.Now().Format("02-01-2006 15:04:05")
 				fmt.Printf("%s Container %s (%s) skipped - orchestration activity detected within %ds\n",
 					now, cleanName, shortID, g.cfg.WatchtowerCooldown)
-				g.dispatcher.Skip(fmt.Sprintf("Container %s (%s) skipped - orchestration activity", cleanName, shortID))
+				g.notifier.Skip(fmt.Sprintf("Container %s (%s) skipped - orchestration activity", cleanName, shortID))
+				metrics.SkipsTotal.WithLabelValues(cleanName, "orchestration").Inc()
 				return true
 			}
 		}
@@ -41,12 +45,13 @@ func (g *Guardian) shouldSkip(ctx context.Context, containerID, containerName st
 	if g.cfg.GracePeriod > 0 {
 		finishedAt, err := g.docker.ContainerFinishedAt(ctx, containerID)
 		if err == nil {
-			age := time.Since(finishedAt)
+			age := g.clock.Since(finishedAt)
 			if age < time.Duration(g.cfg.GracePeriod)*time.Second {
-				now := time.Now().Format("02-01-2006 15:04:05")
+				now := g.clock.Now().Format("02-01-2006 15:04:05")
 				fmt.Printf("%s Container %s (%s) stopped within grace period (%ds) - skipping\n",
 					now, cleanName, shortID, g.cfg.GracePeriod)
-				g.dispatcher.Skip(fmt.Sprintf("Container %s (%s) skipped - grace period", cleanName, shortID))
+				g.notifier.Skip(fmt.Sprintf("Container %s (%s) skipped - grace period", cleanName, shortID))
+				metrics.SkipsTotal.WithLabelValues(cleanName, "grace").Inc()
 				return true
 			}
 		}
@@ -54,10 +59,11 @@ func (g *Guardian) shouldSkip(ctx context.Context, containerID, containerName st
 
 	// Backup awareness
 	if g.isBackupManaged(labels) && g.cachedBackupRunning(ctx) {
-		now := time.Now().Format("02-01-2006 15:04:05")
+		now := g.clock.Now().Format("02-01-2006 15:04:05")
 		fmt.Printf("%s Container %s (%s) managed by backup (currently running) - skipping\n",
 			now, cleanName, shortID)
-		g.dispatcher.Skip(fmt.Sprintf("Container %s (%s) skipped - backup running", cleanName, shortID))
+		g.notifier.Skip(fmt.Sprintf("Container %s (%s) skipped - backup running", cleanName, shortID))
+		metrics.SkipsTotal.WithLabelValues(cleanName, "backup").Inc()
 		return true
 	}
 
@@ -73,7 +79,7 @@ func (g *Guardian) fetchOrchestrationEvents(ctx context.Context) {
 	g.orchestratorCached = true
 	g.orchestratorEvents = nil
 
-	now := time.Now()
+	now := g.clock.Now()
 	since := now.Add(-time.Duration(g.cfg.WatchtowerCooldown) * time.Second)
 	orchestrationOnly := g.cfg.WatchtowerEvents != "all"
 
@@ -84,7 +90,7 @@ func (g *Guardian) fetchOrchestrationEvents(ctx context.Context) {
 	g.orchestratorEvents = events
 
 	if len(events) > 0 {
-		dateStr := time.Now().Format("02-01-2006 15:04:05")
+		dateStr := g.clock.Now().Format("02-01-2006 15:04:05")
 		fmt.Printf("%s Orchestration activity detected: %d container event(s) within %ds cooldown\n",
 			dateStr, len(events), g.cfg.WatchtowerCooldown)
 	}
diff --git a/internal/guardian/mock_test.go b/internal/guardian/mock_test.go
new file mode 100644
index 0000000..717878d
--- /dev/null
+++ b/internal/guardian/mock_test.go
@@ -0,0 +1,179 @@
+package guardian
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"github.com/moby/moby/api/types/container"
+	"github.com/moby/moby/api/types/events"
+)
+
+// mockDocker implements docker.API for testing.
+type mockDocker struct {
+	mu sync.Mutex
+
+	unhealthyContainers []container.Summary
+	unhealthyErr        error
+
+	exitedContainers []container.Summary
+	exitedErr        error
+
+	runningContainers []container.Summary
+	runningErr        error
+
+	inspectResults map[string]container.InspectResponse
+	inspectErr     map[string]error
+
+	restartCalls []string
+	restartErr   map[string]error
+
+	startCalls []string
+	startErr   map[string]error
+
+	stopCalls []string
+	stopErr   map[string]error
+
+	statusResults map[string]string
+	statusErr     map[string]error
+
+	finishedAtResults map[string]time.Time
+	finishedAtErr     map[string]error
+
+	containerEvents    []events.Message
+	containerEventsErr error
+}
+
+func newMockDocker() *mockDocker {
+	return &mockDocker{
+		inspectResults:    make(map[string]container.InspectResponse),
+		inspectErr:        make(map[string]error),
+		restartErr:        make(map[string]error),
+		startErr:          make(map[string]error),
+		stopErr:           make(map[string]error),
+		statusResults:     make(map[string]string),
+		statusErr:         make(map[string]error),
+		finishedAtResults: make(map[string]time.Time),
+		finishedAtErr:     make(map[string]error),
+	}
+}
+
+func (m *mockDocker) UnhealthyContainers(_ context.Context, _ string, _ bool) ([]container.Summary, error) {
+	return m.unhealthyContainers, m.unhealthyErr
+}
+
+func (m *mockDocker) ExitedContainers(_ context.Context) ([]container.Summary, error) {
+	return m.exitedContainers, m.exitedErr
+}
+
+func (m *mockDocker) RunningContainers(_ context.Context) ([]container.Summary, error) {
+	return m.runningContainers, m.runningErr
+}
+
+func (m *mockDocker) InspectContainer(_ context.Context, id string) (container.InspectResponse, error) {
+	if err, ok := m.inspectErr[id]; ok && err != nil {
+		return container.InspectResponse{}, err
+	}
+	return m.inspectResults[id], nil
+}
+
+func (m *mockDocker) RestartContainer(_ context.Context, id string, _ int) error {
+	m.mu.Lock()
+	m.restartCalls = append(m.restartCalls, id)
+	m.mu.Unlock()
+	if err, ok := m.restartErr[id]; ok {
+		return err
+	}
+	return nil
+}
+
+func (m *mockDocker) StartContainer(_ context.Context, id string) error {
+	m.mu.Lock()
+	m.startCalls = append(m.startCalls, id)
+	m.mu.Unlock()
+	if err, ok := m.startErr[id]; ok {
+		return err
+	}
+	return nil
+}
+
+func (m *mockDocker) StopContainer(_ context.Context, id string, _ int) error {
+	m.mu.Lock()
+	m.stopCalls = append(m.stopCalls, id)
+	m.mu.Unlock()
+	if err, ok := m.stopErr[id]; ok {
+		return err
+	}
+	return nil
+}
+
+func (m *mockDocker) ContainerStatus(_ context.Context, id string) (string, error) {
+	if err, ok := m.statusErr[id]; ok && err != nil {
+		return "", err
+	}
+	return m.statusResults[id], nil
+}
+
+func (m *mockDocker) ContainerFinishedAt(_ context.Context, id string) (time.Time, error) {
+	if err, ok := m.finishedAtErr[id]; ok && err != nil {
+		return time.Time{}, err
+	}
+	return m.finishedAtResults[id], nil
+}
+
+func (m *mockDocker) ContainerEvents(_ context.Context, _, _ time.Time, _ bool) ([]events.Message, error) {
+	return m.containerEvents, m.containerEventsErr
+}
+
+func (m *mockDocker) Close() error { return nil }
+
+// mockNotifier implements notify.Notifier for testing.
+type mockNotifier struct {
+	mu       sync.Mutex
+	startups []string
+	actions  []string
+	skips    []string
+	closed   bool
+}
+
+func (m *mockNotifier) Startup(text string) {
+	m.mu.Lock()
+	m.startups = append(m.startups, text)
+	m.mu.Unlock()
+}
+
+func (m *mockNotifier) Action(text string) {
+	m.mu.Lock()
+	m.actions = append(m.actions, text)
+	m.mu.Unlock()
+}
+
+func (m *mockNotifier) Skip(text string) {
+	m.mu.Lock()
+	m.skips = append(m.skips, text)
+	m.mu.Unlock()
+}
+
+func (m *mockNotifier) Close() {
+	m.mu.Lock()
+	m.closed = true
+	m.mu.Unlock()
+}
+
+// mockClock implements clock.Clock for testing.
+type mockClock struct {
+	now time.Time
+}
+
+func newMockClock(t time.Time) *mockClock {
+	return &mockClock{now: t}
+}
+
+func (c *mockClock) Now() time.Time { return c.now }
+func (c *mockClock) After(d time.Duration) <-chan time.Time {
+	ch := make(chan time.Time, 1)
+	ch <- c.now.Add(d)
+	return ch
+}
+func (c *mockClock) Since(t time.Time) time.Duration { return c.now.Sub(t) }
+func (c *mockClock) Advance(d time.Duration)         { c.now = c.now.Add(d) }
diff --git a/internal/guardian/tracker.go b/internal/guardian/tracker.go
new file mode 100644
index 0000000..8c9b9b4
--- /dev/null
+++ b/internal/guardian/tracker.go
@@ -0,0 +1,204 @@
+package guardian
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/Will-Luck/Docker-Guardian/internal/clock"
+)
+
+// TrackerConfig holds circuit breaker / backoff settings.
+type TrackerConfig struct {
+	BackoffMultiplier float64       // multiplicative factor for each retry (default 2)
+	BackoffMax        time.Duration // cap on backoff delay (default 300s)
+	BackoffResetAfter time.Duration // healthy for this long resets backoff (default 600s)
+	RestartBudget     int           // max restarts per window (0 = unlimited)
+	RestartWindow     time.Duration // rolling window for budget (default 300s)
+}
+
+// DefaultTrackerConfig returns sensible defaults.
+func DefaultTrackerConfig() TrackerConfig {
+	return TrackerConfig{
+		BackoffMultiplier: 2,
+		BackoffMax:        300 * time.Second,
+		BackoffResetAfter: 600 * time.Second,
+		RestartBudget:     5,
+		RestartWindow:     300 * time.Second,
+	}
+}
+
+// ContainerHistory tracks restart history for a single container.
+type ContainerHistory struct {
+	Restarts     []time.Time   // timestamps of recent restarts
+	BackoffUntil time.Time     // next allowed restart time
+	BackoffDelay time.Duration // current backoff delay
+	CircuitOpen  bool          // true = budget exhausted
+}
+
+// SkipReason describes why a restart was suppressed.
+type SkipReason string
+
+const (
+	SkipNone    SkipReason = ""
+	SkipBackoff SkipReason = "backoff"
+	SkipCircuit SkipReason = "circuit"
+)
+
+// RestartTracker implements per-container circuit breaker and exponential backoff.
+type RestartTracker struct {
+	mu      sync.Mutex
+	history map[string]*ContainerHistory
+	cfg     TrackerConfig
+	clock   clock.Clock
+}
+
+// NewRestartTracker creates a tracker with the given config.
+func NewRestartTracker(cfg TrackerConfig, clk clock.Clock) *RestartTracker {
+	return &RestartTracker{
+		history: make(map[string]*ContainerHistory),
+		cfg:     cfg,
+		clock:   clk,
+	}
+}
+
+// ShouldRestart checks if a restart is allowed for the given container.
+// Returns (allowed, skipReason).
+func (rt *RestartTracker) ShouldRestart(id string) (bool, SkipReason) {
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+
+	h := rt.getOrCreate(id)
+	now := rt.clock.Now()
+
+	// Prune old restarts outside the window
+	rt.pruneOld(h)
+
+	// Check circuit breaker (budget exhausted)
+	if h.CircuitOpen {
+		return false, SkipCircuit
+	}
+
+	// Check backoff
+	if now.Before(h.BackoffUntil) {
+		return false, SkipBackoff
+	}
+
+	// Check budget
+	if rt.cfg.RestartBudget > 0 && len(h.Restarts) >= rt.cfg.RestartBudget {
+		h.CircuitOpen = true
+		return false, SkipCircuit
+	}
+
+	return true, SkipNone
+}
+
+// RecordRestart records that a restart was performed for the given container.
+// Advances the backoff delay for next time.
+func (rt *RestartTracker) RecordRestart(id string) {
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+
+	h := rt.getOrCreate(id)
+	now := rt.clock.Now()
+
+	h.Restarts = append(h.Restarts, now)
+
+	// Calculate next backoff
+	if h.BackoffDelay == 0 {
+		h.BackoffDelay = 10 * time.Second // initial backoff
+	} else {
+		h.BackoffDelay = time.Duration(float64(h.BackoffDelay) * rt.cfg.BackoffMultiplier)
+	}
+	if h.BackoffDelay > rt.cfg.BackoffMax {
+		h.BackoffDelay = rt.cfg.BackoffMax
+	}
+	h.BackoffUntil = now.Add(h.BackoffDelay)
+}
+
+// Reset clears backoff and restart history for a container (e.g. when it becomes healthy).
+func (rt *RestartTracker) Reset(id string) {
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+
+	delete(rt.history, id)
+}
+
+// IsCircuitOpen returns true if the circuit is open for the given container.
+func (rt *RestartTracker) IsCircuitOpen(id string) bool {
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+
+	h, ok := rt.history[id]
+	if !ok {
+		return false
+	}
+	return h.CircuitOpen
+}
+
+// BackoffRemaining returns the time remaining in backoff for a container.
+func (rt *RestartTracker) BackoffRemaining(id string) time.Duration {
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+
+	h, ok := rt.history[id]
+	if !ok {
+		return 0
+	}
+	remaining := h.BackoffUntil.Sub(rt.clock.Now())
+	if remaining < 0 {
+		return 0
+	}
+	return remaining
+}
+
+// CircuitOpenCount returns the number of containers with open circuits.
+func (rt *RestartTracker) CircuitOpenCount() int {
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+
+	count := 0
+	for _, h := range rt.history {
+		if h.CircuitOpen {
+			count++
+		}
+	}
+	return count
+}
+
+// FormatSkipReason returns a human-readable string for a skip reason.
+func (rt *RestartTracker) FormatSkipReason(id, name string, reason SkipReason) string {
+	switch reason {
+	case SkipBackoff:
+		remaining := rt.BackoffRemaining(id)
+		return fmt.Sprintf("Container %s in backoff (%.0fs remaining)", name, remaining.Seconds())
+	case SkipCircuit:
+		return fmt.Sprintf("Container %s circuit open (restart budget exhausted)", name)
+	default:
+		return ""
+	}
+}
+
+func (rt *RestartTracker) getOrCreate(id string) *ContainerHistory {
+	h, ok := rt.history[id]
+	if !ok {
+		h = &ContainerHistory{}
+		rt.history[id] = h
+	}
+	return h
+}
+
+func (rt *RestartTracker) pruneOld(h *ContainerHistory) {
+	if rt.cfg.RestartWindow <= 0 {
+		return
+	}
+	cutoff := rt.clock.Now().Add(-rt.cfg.RestartWindow)
+	i := 0
+	for _, t := range h.Restarts {
+		if t.After(cutoff) {
+			h.Restarts[i] = t
+			i++
+		}
+	}
+	h.Restarts = h.Restarts[:i]
+}
diff --git a/internal/guardian/tracker_test.go b/internal/guardian/tracker_test.go
new file mode 100644
index 0000000..dc5786a
--- /dev/null
+++ b/internal/guardian/tracker_test.go
@@ -0,0 +1,224 @@
+package guardian
+
+import (
+	"testing"
+	"time"
+)
+
+func TestTracker_AllowsFirstRestart(t *testing.T) {
+	clk := newMockClock(time.Now())
+	rt := NewRestartTracker(DefaultTrackerConfig(), clk)
+
+	allowed, reason := rt.ShouldRestart("abc123")
+	if !allowed {
+		t.Errorf("first restart should be allowed, got reason=%s", reason)
+	}
+}
+
+func TestTracker_BackoffAfterRestart(t *testing.T) {
+	clk := newMockClock(time.Now())
+	rt := NewRestartTracker(DefaultTrackerConfig(), clk)
+
+	rt.RecordRestart("abc123")
+
+	// Immediately after, should be in backoff
+	allowed, reason := rt.ShouldRestart("abc123")
+	if allowed {
+		t.Error("should be in backoff after restart")
+	}
+	if reason != SkipBackoff {
+		t.Errorf("expected backoff reason, got %s", reason)
+	}
+
+	// Advance past initial 10s backoff
+	clk.Advance(11 * time.Second)
+
+	allowed, reason = rt.ShouldRestart("abc123")
+	if !allowed {
+		t.Errorf("should be allowed after backoff expires, got reason=%s", reason)
+	}
+}
+
+func TestTracker_ExponentialBackoff(t *testing.T) {
+	clk := newMockClock(time.Now())
+	rt := NewRestartTracker(DefaultTrackerConfig(), clk)
+
+	// First restart: 10s backoff
+	rt.RecordRestart("abc123")
+	if rt.BackoffRemaining("abc123") > 10*time.Second || rt.BackoffRemaining("abc123") < 9*time.Second {
+		t.Errorf("expected ~10s backoff, got %v", rt.BackoffRemaining("abc123"))
+	}
+
+	// Advance past first backoff
+	clk.Advance(11 * time.Second)
+
+	// Second restart: 20s backoff
+	rt.RecordRestart("abc123")
+	remaining := rt.BackoffRemaining("abc123")
+	if remaining > 20*time.Second || remaining < 19*time.Second {
+		t.Errorf("expected ~20s backoff, got %v", remaining)
+	}
+
+	// Advance past second backoff
+	clk.Advance(21 * time.Second)
+
+	// Third restart: 40s backoff
+	rt.RecordRestart("abc123")
+	remaining = rt.BackoffRemaining("abc123")
+	if remaining > 40*time.Second || remaining < 39*time.Second {
+		t.Errorf("expected ~40s backoff, got %v", remaining)
+	}
+}
+
+func TestTracker_BackoffMax(t *testing.T) {
+	clk := newMockClock(time.Now())
+	cfg := DefaultTrackerConfig()
+	cfg.BackoffMax = 30 * time.Second
+	rt := NewRestartTracker(cfg, clk)
+
+	// Restart 5 times, each time advancing past backoff
+	for i := 0; i < 5; i++ {
+		rt.RecordRestart("abc123")
+		clk.Advance(cfg.BackoffMax + time.Second)
+	}
+
+	// Backoff should be capped at BackoffMax
+	rt.RecordRestart("abc123")
+	remaining := rt.BackoffRemaining("abc123")
+	if remaining > cfg.BackoffMax {
+		t.Errorf("backoff should be capped at %v, got %v", cfg.BackoffMax, remaining)
+	}
+}
+
+func TestTracker_BudgetExhausted(t *testing.T) {
+	clk := newMockClock(time.Now())
+	cfg := DefaultTrackerConfig()
+	cfg.RestartBudget = 3
+	cfg.RestartWindow = 3600 * time.Second // large window so restarts aren't pruned
+	cfg.BackoffMax = 10 * time.Second      // small max so we advance less
+	rt := NewRestartTracker(cfg, clk)
+
+	// Record 3 restarts (budget = 3), advancing past backoff each time
+	for i := 0; i < 3; i++ {
+		rt.RecordRestart("abc123")
+		clk.Advance(cfg.BackoffMax + time.Second)
+	}
+
+	// 4th restart should be denied — circuit open
+	allowed, reason := rt.ShouldRestart("abc123")
+	if allowed {
+		t.Error("should deny restart when budget exhausted")
+	}
+	if reason != SkipCircuit {
+		t.Errorf("expected circuit reason, got %s", reason)
+	}
+	if !rt.IsCircuitOpen("abc123") {
+		t.Error("circuit should be open")
+	}
+}
+
+func TestTracker_BudgetUnlimited(t *testing.T) {
+	clk := newMockClock(time.Now())
+	cfg := DefaultTrackerConfig()
+	cfg.RestartBudget = 0 // unlimited
+	rt := NewRestartTracker(cfg, clk)
+
+	for i := 0; i < 20; i++ {
+		rt.RecordRestart("abc123")
+		clk.Advance(cfg.BackoffMax + time.Second) // advance past max backoff
+	}
+
+	allowed, _ := rt.ShouldRestart("abc123")
+	if !allowed {
+		t.Error("should allow restarts when budget is unlimited")
+	}
+}
+
+func TestTracker_Reset(t *testing.T) {
+	clk := newMockClock(time.Now())
+	cfg := DefaultTrackerConfig()
+	cfg.RestartBudget = 2
+	rt := NewRestartTracker(cfg, clk)
+
+	// Exhaust budget
+	rt.RecordRestart("abc123")
+	rt.RecordRestart("abc123")
+	clk.Advance(11 * time.Second)
+
+	allowed, _ := rt.ShouldRestart("abc123")
+	if allowed {
+		t.Error("should deny after budget exhausted")
+	}
+
+	// Reset (container became healthy)
+	rt.Reset("abc123")
+
+	allowed, _ = rt.ShouldRestart("abc123")
+	if !allowed {
+		t.Error("should allow after reset")
+	}
+}
+
+func TestTracker_PruneOldRestarts(t *testing.T) {
+	clk := newMockClock(time.Now())
+	cfg := DefaultTrackerConfig()
+	cfg.RestartBudget = 3
+	cfg.RestartWindow = 60 * time.Second
+	rt := NewRestartTracker(cfg, clk)
+
+	// Record 3 restarts
+	for i := 0; i < 3; i++ {
+		rt.RecordRestart("abc123")
+		clk.Advance(11 * time.Second)
+	}
+
+	// Advance past the window so old restarts are pruned
+	clk.Advance(60 * time.Second)
+
+	// Should be allowed now — old restarts pruned
+	allowed, _ := rt.ShouldRestart("abc123")
+	if !allowed {
+		t.Error("should allow after old restarts are pruned")
+	}
+}
+
+func TestTracker_CircuitOpenCount(t *testing.T) {
+	clk := newMockClock(time.Now())
+	cfg := DefaultTrackerConfig()
+	cfg.RestartBudget = 1
+	rt := NewRestartTracker(cfg, clk)
+
+	rt.RecordRestart("a")
+	clk.Advance(11 * time.Second)
+	rt.ShouldRestart("a") // triggers circuit open
+
+	rt.RecordRestart("b")
+	clk.Advance(11 * time.Second)
+	rt.ShouldRestart("b") // triggers circuit open
+
+	if rt.CircuitOpenCount() != 2 {
+		t.Errorf("expected 2 open circuits, got %d", rt.CircuitOpenCount())
+	}
+}
+
+func TestContainerAction(t *testing.T) {
+	tests := []struct {
+		labels   map[string]string
+		expected string
+	}{
+		{nil, "restart"},
+		{map[string]string{}, "restart"},
+		{map[string]string{"autoheal.action": "restart"}, "restart"},
+		{map[string]string{"autoheal.action": "stop"}, "stop"},
+		{map[string]string{"autoheal.action": "notify"}, "notify"},
+		{map[string]string{"autoheal.action": "none"}, "none"},
+		{map[string]string{"autoheal.action": "invalid"}, "restart"},
+	}
+
+	for _, tt := range tests {
+		got := containerAction(tt.labels)
+		if got != tt.expected {
+			t.Errorf("containerAction(%v) = %q, want %q", tt.labels, got, tt.expected)
+		}
+	}
+}
diff --git a/internal/guardian/unhealthy.go b/internal/guardian/unhealthy.go
index 527fc0f..ce45b6c 100644
--- a/internal/guardian/unhealthy.go
+++ b/internal/guardian/unhealthy.go
@@ -6,9 +6,23 @@ import (
 	"strconv"
 	"strings"
 	"time"
+
+	"github.com/Will-Luck/Docker-Guardian/internal/metrics"
 )
 
-// checkUnhealthy finds unhealthy containers and restarts them.
+// containerAction returns the action to take for a container based on its labels.
+// Possible values: "restart" (default), "stop", "notify", "none".
+func containerAction(labels map[string]string) string {
+	if action, ok := labels["autoheal.action"]; ok {
+		switch action {
+		case "restart", "stop", "notify", "none":
+			return action
+		}
+	}
+	return "restart"
+}
+
+// checkUnhealthy finds unhealthy containers and handles them based on action labels.
 func (g *Guardian) checkUnhealthy(ctx context.Context) {
 	containers, err := g.docker.UnhealthyContainers(ctx, g.cfg.ContainerLabel, g.cfg.OnlyMonitorRunning)
 	if err != nil {
@@ -16,18 +30,31 @@ func (g *Guardian) checkUnhealthy(ctx context.Context) {
 		return
 	}
 
+	metrics.UnhealthyContainers.Set(float64(len(containers)))
+	metrics.CircuitOpenContainers.Set(float64(g.tracker.CircuitOpenCount()))
+
 	for _, c := range containers {
 		// Skip containers opted out via label
 		if c.Labels["autoheal"] == "False" {
 			continue
 		}
 
+		if len(c.Names) == 0 {
+			continue
+		}
+
 		id := c.ID
 		shortID := id[:12]
 		name := strings.TrimPrefix(c.Names[0], "/")
 
+		// Check per-container action label
+		action := containerAction(c.Labels)
+		if action == "none" {
+			continue
+		}
+
 		if string(c.State) == "restarting" {
-			now := time.Now().Format("02-01-2006 15:04:05")
+			now := g.clock.Now().Format("02-01-2006 15:04:05")
 			fmt.Printf("%s Container %s (%s) found to be restarting - don't restart\n", now, name, shortID)
 			continue
 		}
@@ -36,6 +63,24 @@ func (g *Guardian) checkUnhealthy(ctx context.Context) {
 			continue
 		}
 
+		// Handle notify-only action
+		if action == "notify" {
+			g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy (action=notify)", name, shortID))
+			continue
+		}
+
+		// Circuit breaker check (for restart and stop actions)
+		if allowed, reason := g.tracker.ShouldRestart(id); !allowed {
+			msg := g.tracker.FormatSkipReason(id, name, reason)
+			now := g.clock.Now().Format("02-01-2006 15:04:05")
+			fmt.Printf("%s %s\n", now, msg)
+			metrics.SkipsTotal.WithLabelValues(name, string(reason)).Inc()
+			if reason == SkipCircuit {
+				g.notifier.Action(fmt.Sprintf("[CRITICAL] %s", msg))
+			}
+			continue
+		}
+
 		timeout := g.cfg.DefaultStopTimeout
 		if v, ok := c.Labels["autoheal.stop.timeout"]; ok {
 			if parsed, err := strconv.Atoi(v); err == nil {
@@ -43,17 +88,39 @@ func (g *Guardian) checkUnhealthy(ctx context.Context) {
 			}
 		}
 
-		now := time.Now().Format("02-01-2006 15:04:05")
+		// Handle stop action (quarantine)
+		if action == "stop" {
+			now := g.clock.Now().Format("02-01-2006 15:04:05")
+			fmt.Printf("%s Container %s (%s) found to be unhealthy - Stopping container (action=stop)\n", now, name, shortID)
+			if err := g.docker.StopContainer(ctx, id, timeout); err != nil {
+				g.log.Error("failed to stop container", "container", name, "id", shortID, "error", err)
+				g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Failed to stop (quarantine)!", name, shortID))
+				metrics.RestartsTotal.WithLabelValues(name, "failure").Inc()
+			} else {
+				g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Stopped (quarantined).", name, shortID))
+				metrics.RestartsTotal.WithLabelValues(name, "success").Inc()
+			}
+			g.tracker.RecordRestart(id)
+			continue
+		}
+
+		// Default: restart
+		now := g.clock.Now().Format("02-01-2006 15:04:05")
 		fmt.Printf("%s Container %s (%s) found to be unhealthy - Restarting container now with %ds timeout\n",
 			now, name, shortID, timeout)
 
+		start := time.Now()
 		if err := g.docker.RestartContainer(ctx, id, timeout); err != nil {
 			g.log.Error("failed to restart container", "container", name, "id", shortID, "error", err)
-			g.dispatcher.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Failed to restart the container!", name, shortID))
+			g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Failed to restart the container!", name, shortID))
+			metrics.RestartsTotal.WithLabelValues(name, "failure").Inc()
 		} else {
-			g.dispatcher.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Successfully restarted the container!", name, shortID))
+			g.notifier.Action(fmt.Sprintf("Container %s (%s) found to be unhealthy. Successfully restarted the container!", name, shortID))
+			metrics.RestartsTotal.WithLabelValues(name, "success").Inc()
 		}
+		metrics.RestartDuration.WithLabelValues(name).Observe(time.Since(start).Seconds())
 
+		g.tracker.RecordRestart(id)
 		g.runPostRestartScript(name, shortID, string(c.State), timeout)
 	}
 }
diff --git a/internal/guardian/unhealthy_test.go b/internal/guardian/unhealthy_test.go
new file mode 100644
index 0000000..3e3cdc9
--- /dev/null
+++ b/internal/guardian/unhealthy_test.go
@@ -0,0 +1,243 @@
+package guardian
+
+import (
+	"context"
+	"errors"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/Will-Luck/Docker-Guardian/internal/config"
+	"github.com/Will-Luck/Docker-Guardian/internal/logging"
+	"github.com/moby/moby/api/types/container"
+)
+
+func TestCheckUnhealthy_RestartsContainer(t *testing.T) {
+	cfg := &config.Config{
+		ContainerLabel:     "all",
+		DefaultStopTimeout: 10,
+		WatchtowerCooldown: 0,
+		GracePeriod:        0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	dock.unhealthyContainers = []container.Summary{
+		{
+			ID:     "abcdef1234567890abcdef",
+			Names:  []string{"/test-app"},
+			State:  "running",
+			Labels: map[string]string{},
+		},
+	}
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkUnhealthy(context.Background())
+
+	if len(dock.restartCalls) != 1 {
+		t.Fatalf("expected 1 restart call, got %d", len(dock.restartCalls))
+	}
+	if dock.restartCalls[0] != "abcdef1234567890abcdef" {
+		t.Errorf("restarted wrong container: %s", dock.restartCalls[0])
+	}
+	if len(notif.actions) != 1 || !strings.Contains(notif.actions[0], "Successfully restarted") {
+		t.Errorf("expected success notification, got %v", notif.actions)
+	}
+}
+
+func TestCheckUnhealthy_SkipsOptedOut(t *testing.T) {
+	cfg := &config.Config{
+		ContainerLabel:     "all",
+		DefaultStopTimeout: 10,
+		WatchtowerCooldown: 0,
+		GracePeriod:        0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	dock.unhealthyContainers = []container.Summary{
+		{
+			ID:     "abcdef1234567890abcdef",
+			Names:  []string{"/opted-out"},
+			State:  "running",
+			Labels: map[string]string{"autoheal": "False"},
+		},
+	}
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkUnhealthy(context.Background())
+
+	if len(dock.restartCalls) != 0 {
+		t.Error("should not restart opted-out container")
+	}
+}
+
+func TestCheckUnhealthy_SkipsEmptyNames(t *testing.T) {
+	cfg := &config.Config{
+		ContainerLabel:     "all",
+		DefaultStopTimeout: 10,
+		WatchtowerCooldown: 0,
+		GracePeriod:        0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	dock.unhealthyContainers = []container.Summary{
+		{
+			ID:     "abcdef1234567890abcdef",
+			Names:  []string{},
+			State:  "running",
+			Labels: map[string]string{},
+		},
+	}
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	// Should not panic
+	g.checkUnhealthy(context.Background())
+
+	if len(dock.restartCalls) != 0 {
+		t.Error("should not restart container with no names")
+	}
+}
+
+func TestCheckUnhealthy_SkipsRestarting(t *testing.T) {
+	cfg := &config.Config{
+		ContainerLabel:     "all",
+		DefaultStopTimeout: 10,
+		WatchtowerCooldown: 0,
+		GracePeriod:        0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	dock.unhealthyContainers = []container.Summary{
+		{
+			ID:     "abcdef1234567890abcdef",
+			Names:  []string{"/restarting-app"},
+			State:  "restarting",
+			Labels: map[string]string{},
+		},
+	}
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkUnhealthy(context.Background())
+
+	if len(dock.restartCalls) != 0 {
+		t.Error("should not restart container already in restarting state")
+	}
+}
+
+func TestCheckUnhealthy_RestartFailure(t *testing.T) {
+	cfg := &config.Config{
+		ContainerLabel:     "all",
+		DefaultStopTimeout: 10,
+		WatchtowerCooldown: 0,
+		GracePeriod:        0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	dock.unhealthyContainers = []container.Summary{
+		{
+			ID:     "abcdef1234567890abcdef",
+			Names:  []string{"/fail-app"},
+			State:  "running",
+			Labels: map[string]string{},
+		},
+	}
+	dock.restartErr["abcdef1234567890abcdef"] = errors.New("restart failed")
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkUnhealthy(context.Background())
+
+	if len(notif.actions) != 1 || !strings.Contains(notif.actions[0], "Failed") {
+		t.Errorf("expected failure notification, got %v", notif.actions)
+	}
+}
+
+func TestCheckUnhealthy_CustomTimeout(t *testing.T) {
+	cfg := &config.Config{
+		ContainerLabel:     "all",
+		DefaultStopTimeout: 10,
+		WatchtowerCooldown: 0,
+		GracePeriod:        0,
+		BackupLabel:        "",
+	}
+	dock := newMockDocker()
+	notif := &mockNotifier{}
+	clk := newMockClock(time.Now())
+
+	dock.unhealthyContainers = []container.Summary{
+		{
+			ID:     "abcdef1234567890abcdef",
+			Names:  []string{"/custom-timeout"},
+			State:  "running",
+			Labels: map[string]string{"autoheal.stop.timeout": "30"},
+		},
+	}
+
+	g := &Guardian{
+		cfg:      cfg,
+		docker:   dock,
+		notifier: notif,
+		log:      logging.New(false),
+		clock:    clk,
+		tracker:  NewRestartTracker(DefaultTrackerConfig(), clk),
+	}
+
+	g.checkUnhealthy(context.Background())
+
+	if len(dock.restartCalls) != 1 {
+		t.Fatalf("expected 1 restart, got %d", len(dock.restartCalls))
+	}
+}
diff --git a/internal/metrics/prometheus.go b/internal/metrics/prometheus.go
index 150c230..fee15c5 100644
--- a/internal/metrics/prometheus.go
+++ b/internal/metrics/prometheus.go
@@ -1,4 +1,92 @@
 package metrics
 
-// Prometheus metrics endpoint — placeholder for v2.0.0.
-// Will expose /metrics with counters for restarts, starts, skips, and notification outcomes.
+import (
+	"fmt"
+	"net/http"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+)
+
+// Metrics holds all Prometheus metric collectors.
+var (
+	RestartsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "docker_guardian_restarts_total",
+		Help: "Total container restarts by result.",
+	}, []string{"container", "result"})
+
+	SkipsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "docker_guardian_skips_total",
+		Help: "Total skipped containers by reason.",
+	}, []string{"container", "reason"})
+
+	NotificationsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "docker_guardian_notifications_total",
+		Help: "Total notification sends by service and result.",
+	}, []string{"service", "result"})
+
+	EventsProcessedTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "docker_guardian_events_processed_total",
+		Help: "Total Docker events processed by action.",
+	}, []string{"action"})
+
+	UnhealthyContainers = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "docker_guardian_unhealthy_containers",
+		Help: "Current number of unhealthy containers.",
+	})
+
+	CircuitOpenContainers = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "docker_guardian_circuit_open_containers",
+		Help: "Number of containers with open circuit breakers.",
+	})
+
+	EventStreamConnected = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "docker_guardian_event_stream_connected",
+		Help: "1 if connected to Docker event stream, 0 otherwise.",
+	})
+
+	RestartDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:    "docker_guardian_restart_duration_seconds",
+		Help:    "Time taken to restart a container.",
+		Buckets: prometheus.DefBuckets,
+	}, []string{"container"})
+
+	EventProcessingDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+		Name:    "docker_guardian_event_processing_duration_seconds",
+		Help:    "Time taken to process a Docker event.",
+		Buckets: prometheus.DefBuckets,
+	})
+)
+
+func init() {
+	prometheus.MustRegister(
+		RestartsTotal,
+		SkipsTotal,
+		NotificationsTotal,
+		EventsProcessedTotal,
+		UnhealthyContainers,
+		CircuitOpenContainers,
+		EventStreamConnected,
+		RestartDuration,
+		EventProcessingDuration,
+	)
+}
+
+// Serve starts the Prometheus metrics HTTP server on the given port.
+// Returns immediately; the server runs in the background.
+// If port is 0, metrics are disabled and this is a no-op.
+func Serve(port int) {
+	if port == 0 {
+		return
+	}
+
+	mux := http.NewServeMux()
+	mux.Handle("/metrics", promhttp.Handler())
+
+	go func() {
+		addr := fmt.Sprintf(":%d", port)
+		if err := http.ListenAndServe(addr, mux); err != nil { //nolint:gosec // Metrics endpoint, intentionally unauthenticated
+			fmt.Printf("metrics server error: %v\n", err)
+		}
+	}()
+}
diff --git a/internal/notify/notifier.go b/internal/notify/notifier.go
index 813c068..ba591ea 100644
--- a/internal/notify/notifier.go
+++ b/internal/notify/notifier.go
@@ -8,18 +8,33 @@ import (
 	"net/smtp"
 	"net/url"
 	"strings"
+	"sync"
 	"time"
 
 	"github.com/Will-Luck/Docker-Guardian/internal/config"
 	"github.com/Will-Luck/Docker-Guardian/internal/logging"
+	"github.com/Will-Luck/Docker-Guardian/internal/metrics"
 )
 
+// Notifier is the interface for sending notifications from the guardian.
+type Notifier interface {
+	Startup(text string)
+	Action(text string)
+	Skip(text string)
+	Close()
+}
+
 // Dispatcher sends notifications to all configured services.
 type Dispatcher struct {
 	cfg      *config.Config
 	log      *logging.Logger
 	client   *http.Client
 	resolved []string
+	wg       sync.WaitGroup
+
+	// Rate limiting: per container+event key → last notification time
+	rateMu    sync.Mutex
+	rateLimit map[string]time.Time
 }
 
 // NewDispatcher creates a notification dispatcher from config.
@@ -30,7 +45,22 @@ func NewDispatcher(cfg *config.Config, log *logging.Logger) *Dispatcher {
 		client: &http.Client{
 			Timeout: time.Duration(cfg.CurlTimeout) * time.Second,
 		},
-		resolved: cfg.ResolvedNotifyEvents(),
+		resolved:  cfg.ResolvedNotifyEvents(),
+		rateLimit: make(map[string]time.Time),
+	}
+}
+
+// Close waits for in-flight notification goroutines to finish, with a 10-second timeout.
+func (d *Dispatcher) Close() {
+	done := make(chan struct{})
+	go func() {
+		d.wg.Wait()
+		close(done)
+	}()
+	select {
+	case <-done:
+	case <-time.After(10 * time.Second):
+		d.log.Warn("notification shutdown timed out after 10s, some notifications may have been lost")
 	}
 }
 
@@ -82,17 +112,38 @@ func (d *Dispatcher) hasEvent(event string) bool {
 	return false
 }
 
+// isRateLimited checks if a notification for this key is rate-limited.
+// Returns true if the notification should be suppressed.
+func (d *Dispatcher) isRateLimited(key string) bool {
+	if d.cfg.NotifyRateLimit <= 0 {
+		return false
+	}
+
+	d.rateMu.Lock()
+	defer d.rateMu.Unlock()
+
+	window := time.Duration(d.cfg.NotifyRateLimit) * time.Second
+	if last, ok := d.rateLimit[key]; ok {
+		if time.Since(last) < window {
+			return true
+		}
+	}
+	d.rateLimit[key] = time.Now()
+	return false
+}
+
 // Startup sends a startup notification.
 func (d *Dispatcher) Startup(text string) {
 	if !d.hasEvent("startup") {
 		return
 	}
-	d.dispatch(text)
+	d.dispatch(text, false)
 }
 
 // Action sends an action notification (success or failure).
+// Action events use retry on failure.
 func (d *Dispatcher) Action(text string) {
-	if strings.Contains(text, "Failed") {
+	if strings.Contains(text, "Failed") || strings.Contains(text, "[CRITICAL]") {
 		if !d.hasEvent("actions") && !d.hasEvent("failures") {
 			return
 		}
@@ -101,7 +152,17 @@ func (d *Dispatcher) Action(text string) {
 			return
 		}
 	}
-	d.dispatch(text)
+
+	// Rate limit using the first ~50 chars as a key (contains container name)
+	key := text
+	if len(key) > 50 {
+		key = key[:50]
+	}
+	if d.isRateLimited(key) {
+		return
+	}
+
+	d.dispatch(text, true)
 }
 
 // Skip sends a skip notification.
@@ -109,10 +170,10 @@ func (d *Dispatcher) Skip(text string) {
 	if !d.hasEvent("skips") {
 		return
 	}
-	d.dispatch(text)
+	d.dispatch(text, false)
 }
 
-func (d *Dispatcher) dispatch(text string) {
+func (d *Dispatcher) dispatch(text string, retry bool) {
 	if d.hasEvent("debug") {
 		now := time.Now().Format("2006-01-02T15:04:05-0700")
 		services := d.ConfiguredServices()
@@ -124,97 +185,201 @@ func (d *Dispatcher) dispatch(text string) {
 	}
 
 	if d.cfg.WebhookURL != "" {
-		go d.sendJSON(d.cfg.WebhookURL, map[string]string{d.cfg.WebhookJSONKey: text})
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("webhook", retry, func() error {
+				return d.sendJSON(d.cfg.WebhookURL, map[string]string{d.cfg.WebhookJSONKey: text})
+			})
+		}()
 	}
 	if d.cfg.AppriseURL != "" {
-		go d.sendJSON(d.cfg.AppriseURL, map[string]string{"title": "Docker-Guardian", "body": text})
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("apprise", retry, func() error {
+				return d.sendJSON(d.cfg.AppriseURL, map[string]string{"title": "Docker-Guardian", "body": text})
+			})
+		}()
 	}
 	if d.cfg.GotifyURL != "" {
-		go d.sendJSON(d.cfg.GotifyURL+"/message?token="+d.cfg.GotifyToken,
-			map[string]any{"title": "Docker-Guardian", "message": text, "priority": 5})
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("gotify", retry, func() error {
+				return d.sendJSON(d.cfg.GotifyURL+"/message?token="+d.cfg.GotifyToken,
+					map[string]any{"title": "Docker-Guardian", "message": text, "priority": 5})
+			})
+		}()
 	}
 	if d.cfg.DiscordWebhook != "" {
-		go d.sendJSON(d.cfg.DiscordWebhook, map[string]any{
-			"embeds": []map[string]any{{"title": "Docker-Guardian", "description": text, "color": 3066993}},
-		})
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("discord", retry, func() error {
+				return d.sendJSON(d.cfg.DiscordWebhook, map[string]any{
+					"embeds": []map[string]any{{"title": "Docker-Guardian", "description": text, "color": 3066993}},
+				})
+			})
+		}()
 	}
 	if d.cfg.SlackWebhook != "" {
-		go d.sendJSON(d.cfg.SlackWebhook, map[string]string{"text": "*Docker-Guardian*\n" + text})
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("slack", retry, func() error {
+				return d.sendJSON(d.cfg.SlackWebhook, map[string]string{"text": "*Docker-Guardian*\n" + text})
+			})
+		}()
 	}
 	if d.cfg.TelegramToken != "" {
-		go d.sendJSON("https://api.telegram.org/bot"+d.cfg.TelegramToken+"/sendMessage",
-			map[string]string{"chat_id": d.cfg.TelegramChatID, "text": "Docker-Guardian: " + text})
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("telegram", retry, func() error {
+				return d.sendJSON("https://api.telegram.org/bot"+d.cfg.TelegramToken+"/sendMessage",
+					map[string]string{"chat_id": d.cfg.TelegramChatID, "text": "Docker-Guardian: " + text})
+			})
+		}()
 	}
 	if d.cfg.PushoverToken != "" {
-		go d.sendForm("https://api.pushover.net/1/messages.json", map[string]string{
-			"token": d.cfg.PushoverToken, "user": d.cfg.PushoverUser,
-			"title": "Docker-Guardian", "message": text,
-		})
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("pushover", retry, func() error {
+				return d.sendForm("https://api.pushover.net/1/messages.json", map[string]string{
+					"token": d.cfg.PushoverToken, "user": d.cfg.PushoverUser,
+					"title": "Docker-Guardian", "message": text,
+				})
+			})
+		}()
 	}
 	if d.cfg.PushbulletToken != "" {
-		go d.sendJSONWithHeader("https://api.pushbullet.com/v2/pushes",
-			"Access-Token", d.cfg.PushbulletToken,
-			map[string]string{"type": "note", "title": "Docker-Guardian", "body": text})
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("pushbullet", retry, func() error {
+				return d.sendJSONWithHeader("https://api.pushbullet.com/v2/pushes",
+					"Access-Token", d.cfg.PushbulletToken,
+					map[string]string{"type": "note", "title": "Docker-Guardian", "body": text})
+			})
+		}()
 	}
 	if d.cfg.LunaSeaWebhook != "" {
-		go d.sendJSON(d.cfg.LunaSeaWebhook, map[string]string{"title": "Docker-Guardian", "body": text})
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("lunasea", retry, func() error {
+				return d.sendJSON(d.cfg.LunaSeaWebhook, map[string]string{"title": "Docker-Guardian", "body": text})
+			})
+		}()
 	}
 	if d.cfg.EmailSMTP != "" {
-		go d.sendEmail(text)
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.sendWithRetry("email", retry, func() error {
+				return d.sendEmail(text)
+			})
+		}()
+	}
+}
+
+// sendWithRetry retries a send function up to 3 times with exponential backoff.
+// Only retries if retry=true. Tracks metrics per service.
+func (d *Dispatcher) sendWithRetry(service string, retry bool, fn func() error) {
+	maxAttempts := 1
+	if retry {
+		maxAttempts = 3
+	}
+
+	delays := []time.Duration{time.Second, 2 * time.Second, 4 * time.Second}
+
+	for attempt := 0; attempt < maxAttempts; attempt++ {
+		if err := fn(); err == nil {
+			metrics.NotificationsTotal.WithLabelValues(service, "success").Inc()
+			return
+		}
+		// Only retry if we have attempts left
+		if attempt < maxAttempts-1 {
+			time.Sleep(delays[attempt])
+		}
 	}
+	metrics.NotificationsTotal.WithLabelValues(service, "failure").Inc()
 }
 
-func (d *Dispatcher) sendJSON(url string, payload any) {
+func (d *Dispatcher) sendJSON(targetURL string, payload any) error {
 	body, err := json.Marshal(payload)
 	if err != nil {
 		d.log.Error("failed to marshal notification payload", "error", err)
-		return
+		return err
 	}
-	resp, err := d.client.Post(url, "application/json", bytes.NewReader(body))
+	resp, err := d.client.Post(targetURL, "application/json", bytes.NewReader(body))
 	if err != nil {
-		d.log.Debug("notification send failed", "url", url, "error", err)
-		return
+		d.log.Warn("notification send failed", "url", targetURL, "error", err)
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		d.log.Warn("notification returned non-2xx status", "url", targetURL, "status", resp.StatusCode)
+		return fmt.Errorf("HTTP %d", resp.StatusCode)
 	}
-	resp.Body.Close()
+	return nil
 }
 
-func (d *Dispatcher) sendJSONWithHeader(url, headerKey, headerVal string, payload any) {
+func (d *Dispatcher) sendJSONWithHeader(targetURL, headerKey, headerVal string, payload any) error {
 	body, err := json.Marshal(payload)
 	if err != nil {
-		return
+		d.log.Warn("failed to marshal notification payload", "error", err)
+		return err
 	}
-	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(body))
+	req, err := http.NewRequest(http.MethodPost, targetURL, bytes.NewReader(body))
 	if err != nil {
-		return
+		d.log.Warn("failed to create notification request", "error", err)
+		return err
 	}
 	req.Header.Set("Content-Type", "application/json")
 	req.Header.Set(headerKey, headerVal)
 	resp, err := d.client.Do(req)
 	if err != nil {
-		return
+		d.log.Warn("notification send failed", "url", targetURL, "error", err)
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		d.log.Warn("notification returned non-2xx status", "url", targetURL, "status", resp.StatusCode)
+		return fmt.Errorf("HTTP %d", resp.StatusCode)
 	}
-	resp.Body.Close()
+	return nil
 }
 
-func (d *Dispatcher) sendForm(endpoint string, fields map[string]string) {
+func (d *Dispatcher) sendForm(endpoint string, fields map[string]string) error {
 	vals := url.Values{}
 	for k, v := range fields {
 		vals.Set(k, v)
 	}
 	resp, err := d.client.Post(endpoint, "application/x-www-form-urlencoded", strings.NewReader(vals.Encode()))
 	if err != nil {
-		return
+		d.log.Warn("notification send failed", "url", endpoint, "error", err)
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		d.log.Warn("notification returned non-2xx status", "url", endpoint, "status", resp.StatusCode)
+		return fmt.Errorf("HTTP %d", resp.StatusCode)
 	}
-	resp.Body.Close()
+	return nil
 }
 
-func (d *Dispatcher) sendEmail(text string) {
+func (d *Dispatcher) sendEmail(text string) error {
 	msg := fmt.Sprintf("From: %s\r\nTo: %s\r\nSubject: Docker-Guardian Alert\r\n\r\n%s",
 		d.cfg.EmailFrom, d.cfg.EmailTo, text)
 
 	auth := smtp.PlainAuth("", d.cfg.EmailUser, d.cfg.EmailPass, strings.Split(d.cfg.EmailSMTP, ":")[0])
 	err := smtp.SendMail(d.cfg.EmailSMTP, auth, d.cfg.EmailFrom, []string{d.cfg.EmailTo}, []byte(msg))
 	if err != nil {
-		d.log.Debug("email notification failed", "error", err)
+		d.log.Warn("email notification failed", "error", err)
+		return err
 	}
+	return nil
 }
diff --git a/tests/test-all.sh b/tests/test-all.sh
index 4aaf477..b692945 100755
--- a/tests/test-all.sh
+++ b/tests/test-all.sh
@@ -37,6 +37,9 @@ run_test "Backup Awareness" "test-backup.sh"
 run_test "Grace Period" "test-grace.sh"
 run_test "Watchtower Awareness" "test-watchtower.sh"
 run_test "Notifications" "test-notifications.sh"
+run_test "Opt-Out Label" "test-opt-out.sh"
+run_test "Circuit Breaker" "test-circuit-breaker.sh"
+run_test "Custom Label Filter" "test-custom-label.sh"
 
 echo "============================================="
 echo "  Final Results: ${TOTAL_PASS} suites passed, ${TOTAL_FAIL} suites failed"
diff --git a/tests/test-circuit-breaker.sh b/tests/test-circuit-breaker.sh
new file mode 100644
index 0000000..f9fae35
--- /dev/null
+++ b/tests/test-circuit-breaker.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+# Test: Circuit breaker stops restarting after budget is exhausted
+# Uses a container that always fails its healthcheck. After the restart budget
+# is hit, Guardian should stop restarting and log a CRITICAL circuit-open message.
+
+set -euo pipefail
+
+PASS=0
+FAIL=0
+GUARDIAN_IMAGE="${GUARDIAN_IMAGE:-docker-guardian}"
+
+cleanup() {
+  echo "Cleaning up..."
+  docker rm -f dg-test-cb-target dg-test-guardian-cb 2>/dev/null || true
+}
+trap cleanup EXIT
+
+echo "=== Test: Circuit Breaker ==="
+
+cleanup
+
+# Start a container that always fails its healthcheck
+docker run -d \
+  --name dg-test-cb-target \
+  --health-cmd="exit 1" \
+  --health-interval=2s \
+  --health-retries=1 \
+  --health-start-period=0s \
+  alpine:3.20 sh -c 'sleep 3600'
+
+# Wait for unhealthy
+echo "Waiting for container to become unhealthy..."
+for i in $(seq 1 15); do
+  HC_STATE=$(docker inspect -f '{{.State.Health.Status}}' dg-test-cb-target 2>/dev/null || echo "unknown")
+  if [ "$HC_STATE" = "unhealthy" ]; then
+    echo "Container is unhealthy after ${i}s"
+    break
+  fi
+  sleep 1
+done
+
+if [ "$HC_STATE" != "unhealthy" ]; then
+  echo "FAIL: Container did not become unhealthy within 15s"
+  FAIL=$((FAIL + 1))
+  echo ""
+  echo "=== Circuit Breaker Test Results: ${PASS} passed, ${FAIL} failed ==="
+  exit "$FAIL"
+fi
+
+echo "PASS: Container reached unhealthy state"
+PASS=$((PASS + 1))
+
+# Start Guardian with a very low restart budget (2) and short window/backoff
+# Budget=2 means after 2 restarts the circuit opens
+docker run -d \
+  --name dg-test-guardian-cb \
+  -e AUTOHEAL_CONTAINER_LABEL=all \
+  -e AUTOHEAL_INTERVAL=3 \
+  -e AUTOHEAL_GRACE_PERIOD=0 \
+  -e AUTOHEAL_WATCHTOWER_COOLDOWN=0 \
+  -e AUTOHEAL_RESTART_BUDGET=2 \
+  -e AUTOHEAL_RESTART_WINDOW=600 \
+  -e AUTOHEAL_BACKOFF_MULTIPLIER=1 \
+  -e AUTOHEAL_BACKOFF_MAX=1 \
+  -e AUTOHEAL_BACKOFF_RESET_AFTER=600 \
+  -e NOTIFY_EVENTS=actions \
+  -v /var/run/docker.sock:/var/run/docker.sock \
+  "$GUARDIAN_IMAGE"
+
+# ── Test 1: Guardian restarts the container at least once ──────────
+
+echo ""
+echo "--- Test 1: Guardian restarts container at least once ---"
+
+STARTED_AT_BEFORE=$(docker inspect -f '{{.State.StartedAt}}' dg-test-cb-target 2>/dev/null)
+
+RESTARTED=false
+for i in $(seq 1 30); do
+  STARTED_NOW=$(docker inspect -f '{{.State.StartedAt}}' dg-test-cb-target 2>/dev/null || echo "unknown")
+  if [ "$STARTED_NOW" != "$STARTED_AT_BEFORE" ] && [ "$STARTED_NOW" != "unknown" ]; then
+    echo "PASS: Container was restarted at least once (took ${i}s)"
+    PASS=$((PASS + 1))
+    RESTARTED=true
+    break
+  fi
+  sleep 1
+done
+
+if [ "$RESTARTED" = false ]; then
+  echo "FAIL: Container was not restarted within 30s"
+  FAIL=$((FAIL + 1))
+  docker logs dg-test-guardian-cb 2>&1 | tail -30
+  echo ""
+  echo "=== Circuit Breaker Test Results: ${PASS} passed, ${FAIL} failed ==="
+  exit "$FAIL"
+fi
+
+# ── Test 2: After budget exhausted, Guardian logs circuit open ─────
+
+echo ""
+echo "--- Test 2: Circuit breaker opens after budget exhausted ---"
+
+# Wait long enough for Guardian to hit the budget (2 restarts) and see the circuit message
+# With backoff_max=1 and budget=2, this should happen within ~30s
+echo "Waiting up to 60s for circuit breaker to open..."
+CIRCUIT_OPEN=false
+for i in $(seq 1 60); do
+  if docker logs dg-test-guardian-cb 2>&1 | grep -q "circuit open\|budget exhausted"; then
+    echo "PASS: Guardian logged circuit breaker open (took ${i}s)"
+    PASS=$((PASS + 1))
+    CIRCUIT_OPEN=true
+    break
+  fi
+  sleep 1
+done
+
+if [ "$CIRCUIT_OPEN" = false ]; then
+  echo "FAIL: Guardian did not log circuit breaker open within 60s"
+  FAIL=$((FAIL + 1))
+  echo "Guardian logs:"
+  docker logs dg-test-guardian-cb 2>&1 | tail -30
+fi
+
+# ── Test 3: Verify CRITICAL notification was logged ────────────────
+
+echo ""
+echo "--- Test 3: CRITICAL notification in logs ---"
+
+if docker logs dg-test-guardian-cb 2>&1 | grep -q "\[CRITICAL\]"; then
+  echo "PASS: CRITICAL notification logged for circuit breaker"
+  PASS=$((PASS + 1))
+else
+  echo "FAIL: No CRITICAL notification found in logs"
+  FAIL=$((FAIL + 1))
+  docker logs dg-test-guardian-cb 2>&1 | tail -20
+fi
+
+# ── Test 4: Guardian itself is still running ───────────────────────
+
+echo ""
+echo "--- Test 4: Guardian is still healthy after circuit opens ---"
+
+GUARDIAN_STATE=$(docker inspect -f '{{.State.Status}}' dg-test-guardian-cb 2>/dev/null || echo "unknown")
+if [ "$GUARDIAN_STATE" = "running" ]; then
+  echo "PASS: Guardian still running after circuit breaker activation"
+  PASS=$((PASS + 1))
+else
+  echo "FAIL: Guardian stopped unexpectedly (state=$GUARDIAN_STATE)"
+  FAIL=$((FAIL + 1))
+fi
+
+echo ""
+echo "=== Circuit Breaker Test Results: ${PASS} passed, ${FAIL} failed ==="
+exit "$FAIL"
diff --git a/tests/test-custom-label.sh b/tests/test-custom-label.sh
new file mode 100644
index 0000000..dd4a5a2
--- /dev/null
+++ b/tests/test-custom-label.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# Test: Custom label filtering
+# Verifies that AUTOHEAL_CONTAINER_LABEL restricts monitoring to only
+# containers with that label, ignoring unhealthy containers without it.
+
+set -euo pipefail
+
+PASS=0
+FAIL=0
+GUARDIAN_IMAGE="${GUARDIAN_IMAGE:-docker-guardian}"
+
+cleanup() {
+  echo "Cleaning up..."
+  docker rm -f dg-test-label-monitored dg-test-label-ignored dg-test-guardian-label 2>/dev/null || true
+}
+trap cleanup EXIT
+
+echo "=== Test: Custom Label Filtering ==="
+
+cleanup
+
+# ── Start two unhealthy containers ─────────────────────────────────
+# One with our custom label, one without
+
+echo "Starting container WITH custom label (my-monitor=true)..."
+docker run -d \
+  --name dg-test-label-monitored \
+  --label my-monitor=true \
+  --health-cmd="exit 1" \
+  --health-interval=2s \
+  --health-retries=1 \
+  --health-start-period=0s \
+  alpine:3.20 sh -c 'sleep 3600'
+
+echo "Starting container WITHOUT custom label..."
+docker run -d \
+  --name dg-test-label-ignored \
+  --health-cmd="exit 1" \
+  --health-interval=2s \
+  --health-retries=1 \
+  --health-start-period=0s \
+  alpine:3.20 sh -c 'sleep 3600'
+
+# Wait for both to become unhealthy
+echo "Waiting for both containers to become unhealthy..."
+for i in $(seq 1 15); do
+  S1=$(docker inspect -f '{{.State.Health.Status}}' dg-test-label-monitored 2>/dev/null || echo "unknown")
+  S2=$(docker inspect -f '{{.State.Health.Status}}' dg-test-label-ignored 2>/dev/null || echo "unknown")
+  if [ "$S1" = "unhealthy" ] && [ "$S2" = "unhealthy" ]; then
+    echo "Both containers unhealthy after ${i}s"
+    break
+  fi
+  sleep 1
+done
+
+if [ "$S1" != "unhealthy" ] || [ "$S2" != "unhealthy" ]; then
+  echo "FAIL: One or both containers did not become unhealthy within 15s"
+  FAIL=$((FAIL + 1))
+  echo ""
+  echo "=== Custom Label Test Results: ${PASS} passed, ${FAIL} failed ==="
+  exit "$FAIL"
+fi
+
+STARTED_MONITORED=$(docker inspect -f '{{.State.StartedAt}}' dg-test-label-monitored 2>/dev/null)
+STARTED_IGNORED=$(docker inspect -f '{{.State.StartedAt}}' dg-test-label-ignored 2>/dev/null)
+
+# Start Guardian with custom label filter
+echo "Starting Guardian with AUTOHEAL_CONTAINER_LABEL=my-monitor..."
+docker run -d \
+  --name dg-test-guardian-label \
+  -e AUTOHEAL_CONTAINER_LABEL=my-monitor \
+  -e AUTOHEAL_INTERVAL=3 \
+  -e AUTOHEAL_GRACE_PERIOD=0 \
+  -e AUTOHEAL_WATCHTOWER_COOLDOWN=0 \
+  -v /var/run/docker.sock:/var/run/docker.sock \
+  "$GUARDIAN_IMAGE"
+
+# ── Test 1: Labelled container IS restarted ────────────────────────
+
+echo ""
+echo "--- Test 1: Labelled container is restarted ---"
+
+RESTARTED=false
+for i in $(seq 1 20); do
+  STARTED_NOW=$(docker inspect -f '{{.State.StartedAt}}' dg-test-label-monitored 2>/dev/null || echo "unknown")
+  if [ "$STARTED_NOW" != "$STARTED_MONITORED" ] && [ "$STARTED_NOW" != "unknown" ]; then
+    echo "PASS: Labelled container was restarted (took ${i}s)"
+    PASS=$((PASS + 1))
+    RESTARTED=true
+    break
+  fi
+  sleep 1
+done
+
+if [ "$RESTARTED" = false ]; then
+  echo "FAIL: Labelled container was not restarted within 20s"
+  FAIL=$((FAIL + 1))
+  docker logs dg-test-guardian-label 2>&1 | tail -20
+fi
+
+# ── Test 2: Unlabelled container is NOT restarted ──────────────────
+
+echo ""
+echo "--- Test 2: Unlabelled container is ignored ---"
+
+# Give it some extra time to be sure
+sleep 10
+
+STARTED_IGN_NOW=$(docker inspect -f '{{.State.StartedAt}}' dg-test-label-ignored 2>/dev/null)
+if [ "$STARTED_IGN_NOW" = "$STARTED_IGNORED" ]; then
+  echo "PASS: Unlabelled container was NOT restarted"
+  PASS=$((PASS + 1))
+else
+  echo "FAIL: Unlabelled container was restarted despite missing label"
+  FAIL=$((FAIL + 1))
+  docker logs dg-test-guardian-label 2>&1 | tail -20
+fi
+
+# ── Test 3: Guardian logs confirm label filter ─────────────────────
+
+echo ""
+echo "--- Test 3: Guardian logs show label filter ---"
+
+if docker logs dg-test-guardian-label 2>&1 | grep -q "AUTOHEAL_CONTAINER_LABEL=my-monitor"; then
+  echo "PASS: Guardian logs confirm custom label filter"
+  PASS=$((PASS + 1))
+else
+  echo "FAIL: Guardian logs do not show custom label"
+  FAIL=$((FAIL + 1))
+  docker logs dg-test-guardian-label 2>&1 | head -10
+fi
+
+echo ""
+echo "=== Custom Label Test Results: ${PASS} passed, ${FAIL} failed ==="
+exit "$FAIL"
diff --git a/tests/test-opt-out.sh b/tests/test-opt-out.sh
new file mode 100644
index 0000000..0a955ae
--- /dev/null
+++ b/tests/test-opt-out.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+# Test: Opt-out via autoheal=False label
+# Verifies that containers with autoheal=False are NOT restarted by Guardian,
+# even when unhealthy.
+
+set -euo pipefail
+
+PASS=0
+FAIL=0
+GUARDIAN_IMAGE="${GUARDIAN_IMAGE:-docker-guardian}"
+
+cleanup() {
+  echo "Cleaning up..."
+  docker rm -f dg-test-optout-target dg-test-optout-monitored dg-test-guardian-optout 2>/dev/null || true
+}
+trap cleanup EXIT
+
+echo "=== Test: Opt-Out via autoheal=False ==="
+
+cleanup
+
+# ── Test 1: Container with autoheal=False is NOT restarted ─────────
+
+echo ""
+echo "--- Test 1: Container with autoheal=False is skipped ---"
+
+# Start an unhealthy container with autoheal=False
+docker run -d \
+  --name dg-test-optout-target \
+  --label autoheal=False \
+  --health-cmd="exit 1" \
+  --health-interval=2s \
+  --health-retries=1 \
+  --health-start-period=0s \
+  alpine:3.20 sh -c 'sleep 3600'
+
+# Wait for it to become unhealthy
+echo "Waiting for opt-out container to become unhealthy..."
+for i in $(seq 1 15); do
+  HC_STATE=$(docker inspect -f '{{.State.Health.Status}}' dg-test-optout-target 2>/dev/null || echo "unknown")
+  if [ "$HC_STATE" = "unhealthy" ]; then
+    break
+  fi
+  sleep 1
+done
+
+if [ "$HC_STATE" != "unhealthy" ]; then
+  echo "FAIL: Container did not become unhealthy within 15s"
+  FAIL=$((FAIL + 1))
+  echo ""
+  echo "=== Opt-Out Test Results: ${PASS} passed, ${FAIL} failed ==="
+  exit "$FAIL"
+fi
+
+STARTED_AT_BEFORE=$(docker inspect -f '{{.State.StartedAt}}' dg-test-optout-target 2>/dev/null)
+
+# Start Guardian
+docker run -d \
+  --name dg-test-guardian-optout \
+  -e AUTOHEAL_CONTAINER_LABEL=all \
+  -e AUTOHEAL_INTERVAL=3 \
+  -e AUTOHEAL_GRACE_PERIOD=0 \
+  -e AUTOHEAL_WATCHTOWER_COOLDOWN=0 \
+  -v /var/run/docker.sock:/var/run/docker.sock \
+  "$GUARDIAN_IMAGE"
+
+# Wait sufficient time — Guardian should NOT restart the opted-out container
+echo "Waiting 15s to confirm Guardian does NOT restart the opted-out container..."
+sleep 15
+
+STARTED_AT_AFTER=$(docker inspect -f '{{.State.StartedAt}}' dg-test-optout-target 2>/dev/null)
+
+if [ "$STARTED_AT_BEFORE" = "$STARTED_AT_AFTER" ]; then
+  echo "PASS: Opted-out container was NOT restarted"
+  PASS=$((PASS + 1))
+else
+  echo "FAIL: Opted-out container was restarted despite autoheal=False"
+  FAIL=$((FAIL + 1))
+  docker logs dg-test-guardian-optout 2>&1 | tail -20
+fi
+
+# ── Test 2: A monitored container IS still restarted ───────────────
+
+echo ""
+echo "--- Test 2: Monitored container (no opt-out) is still restarted ---"
+
+docker run -d \
+  --name dg-test-optout-monitored \
+  --health-cmd="exit 1" \
+  --health-interval=2s \
+  --health-retries=1 \
+  --health-start-period=0s \
+  alpine:3.20 sh -c 'sleep 3600'
+
+# Wait for unhealthy
+for i in $(seq 1 15); do
+  HC_STATE=$(docker inspect -f '{{.State.Health.Status}}' dg-test-optout-monitored 2>/dev/null || echo "unknown")
+  if [ "$HC_STATE" = "unhealthy" ]; then
+    break
+  fi
+  sleep 1
+done
+
+STARTED_AT_MON=$(docker inspect -f '{{.State.StartedAt}}' dg-test-optout-monitored 2>/dev/null)
+
+echo "Waiting for Guardian to restart the monitored container (up to 20s)..."
+RESTARTED=false
+for i in $(seq 1 20); do
+  STARTED_NOW=$(docker inspect -f '{{.State.StartedAt}}' dg-test-optout-monitored 2>/dev/null || echo "unknown")
+  if [ "$STARTED_NOW" != "$STARTED_AT_MON" ] && [ "$STARTED_NOW" != "unknown" ]; then
+    echo "PASS: Monitored container was restarted (took ${i}s)"
+    PASS=$((PASS + 1))
+    RESTARTED=true
+    break
+  fi
+  sleep 1
+done
+
+if [ "$RESTARTED" = false ]; then
+  echo "FAIL: Monitored container was NOT restarted within 20s"
+  FAIL=$((FAIL + 1))
+  docker logs dg-test-guardian-optout 2>&1 | tail -20
+fi
+
+echo ""
+echo "=== Opt-Out Test Results: ${PASS} passed, ${FAIL} failed ==="
+exit "$FAIL"