diff --git a/docs/config/index.md b/docs/config/index.md index a1d7aa87..f8f0e5c1 100644 --- a/docs/config/index.md +++ b/docs/config/index.md @@ -106,7 +106,6 @@ preserve = true # Keep volumes on container removal [network_isolation] enabled = true # Per-app isolated networks network_prefix = "gordon" # Network name prefix -dns_suffix = ".internal" # DNS suffix for services # Auto-route [auto_route] diff --git a/docs/config/network-isolation.md b/docs/config/network-isolation.md index 798fb374..a41bdb8a 100644 --- a/docs/config/network-isolation.md +++ b/docs/config/network-isolation.md @@ -8,7 +8,6 @@ Isolate applications in separate Docker networks for enhanced security. [network_isolation] enabled = true network_prefix = "gordon" -dns_suffix = ".internal" ``` ## Options @@ -17,7 +16,6 @@ dns_suffix = ".internal" |--------|------|---------|-------------| | `enabled` | bool | `false` | Enable per-app network isolation | | `network_prefix` | string | `"gordon"` | Prefix for created networks | -| `dns_suffix` | string | `".internal"` | DNS suffix for service discovery | ## How It Works @@ -99,19 +97,6 @@ db = connect("postgresql://postgres:5432/mydb") cache = connect("redis://redis:6379") ``` -## DNS Resolution - -The `dns_suffix` option adds a suffix for internal DNS resolution: - -```toml -[network_isolation] -dns_suffix = ".internal" -``` - -Services can be accessed as: -- `postgres` (short form) -- `postgres.internal` (with suffix) - ## Examples ### Basic Isolation @@ -137,7 +122,6 @@ Each app gets its own network with its own database. 
[network_isolation] enabled = true network_prefix = "prod" -dns_suffix = ".internal" [routes] "app.company.com" = "company-app:v2.1.0" diff --git a/docs/config/reference.md b/docs/config/reference.md index 2be23fcb..669434e9 100644 --- a/docs/config/reference.md +++ b/docs/config/reference.md @@ -105,7 +105,6 @@ enabled = false # Create routes from image labels a [network_isolation] enabled = false # Enable per-app Docker networks network_prefix = "gordon" # Prefix for created networks -dns_suffix = ".internal" # DNS suffix for internal resolution # ============================================================================= # VOLUMES @@ -215,7 +214,6 @@ keep_last = 3 # Keep N newest tags per repository | `auto_route.enabled` | `false` | Auto-route disabled | | `network_isolation.enabled` | `false` | Isolation disabled | | `network_isolation.network_prefix` | `"gordon"` | Network prefix | -| `network_isolation.dns_suffix` | `".internal"` | DNS suffix | | `volumes.auto_create` | `true` | Auto-create volumes | | `volumes.prefix` | `"gordon"` | Volume prefix | | `volumes.preserve` | `true` | Keep volumes | diff --git a/docs/reference/docker-labels.md b/docs/reference/docker-labels.md index f86ad315..a1bec497 100644 --- a/docs/reference/docker-labels.md +++ b/docs/reference/docker-labels.md @@ -41,7 +41,27 @@ Labels you can set in your Dockerfile: | Label | Example | Description | |-------|---------|-------------| -| `gordon.proxy.port` | `"3000"` | Port to proxy HTTP traffic to | +| `gordon.domains` | `"app.example.com,www.app.example.com"` | Comma-separated domains for auto-route | +| `gordon.port` | `"3000"` | Port to proxy HTTP traffic to | +| `gordon.proxy.port` | `"3000"` | Port to proxy HTTP traffic to (legacy alias for `gordon.port`) | +| `gordon.health` | `"/healthz"` | HTTP health check endpoint path for readiness probing | +| `gordon.env-file` | `"/app/.env.example"` | Path to env template file inside the image | + +### Health Check Label + +When 
`gordon.health` is set, Gordon performs HTTP GET requests to the specified +path during deployment and waits for a 2xx or 3xx response before routing traffic +to the new container: + +```dockerfile +FROM node:20-alpine +LABEL gordon.health="/api/health" +EXPOSE 3000 +CMD ["node", "server.js"] +``` + +Gordon probes `http://:3000/api/health` until it gets a successful +response or the `deploy.http_probe_timeout` is reached. ### Proxy Port Label diff --git a/internal/adapters/in/cli/controlplane.go b/internal/adapters/in/cli/controlplane.go index 3926c65e..296aa84e 100644 --- a/internal/adapters/in/cli/controlplane.go +++ b/internal/adapters/in/cli/controlplane.go @@ -34,6 +34,7 @@ type ControlPlane interface { GetStatus(ctx context.Context) (*remote.Status, error) Reload(ctx context.Context) error + DeployIntent(ctx context.Context, imageName string) error Deploy(ctx context.Context, deployDomain string) (*remote.DeployResult, error) Restart(ctx context.Context, restartDomain string, withAttachments bool) (*remote.RestartResult, error) ListTags(ctx context.Context, repository string) ([]string, error) diff --git a/internal/adapters/in/cli/controlplane_local.go b/internal/adapters/in/cli/controlplane_local.go index 60c87577..2c051381 100644 --- a/internal/adapters/in/cli/controlplane_local.go +++ b/internal/adapters/in/cli/controlplane_local.go @@ -18,6 +18,7 @@ type localControlPlane struct { containerSvc in.ContainerService backupSvc in.BackupService registrySvc in.RegistryService + deployCoord in.DeployCoordinator healthSvc in.HealthService logSvc in.LogService } @@ -27,12 +28,21 @@ func NewLocalControlPlane(kernel *app.Kernel) ControlPlane { return &localControlPlane{} } + registrySvc := kernel.Registry() + var deployCoord in.DeployCoordinator + if registrySvc != nil { + if coordinator, ok := any(registrySvc).(in.DeployCoordinator); ok { + deployCoord = coordinator + } + } + return &localControlPlane{ configSvc: kernel.Config(), secretSvc: kernel.Secrets(), 
containerSvc: kernel.Container(), backupSvc: kernel.Backup(), - registrySvc: kernel.Registry(), + registrySvc: registrySvc, + deployCoord: deployCoord, healthSvc: kernel.Health(), logSvc: kernel.Logs(), } @@ -223,6 +233,13 @@ func (l *localControlPlane) Reload(_ context.Context) error { return app.SendReloadSignal() } +func (l *localControlPlane) DeployIntent(_ context.Context, imageName string) error { + if l.deployCoord != nil { + l.deployCoord.SuppressDeployEvent(imageName) + } + return nil +} + func (l *localControlPlane) Deploy(ctx context.Context, deployDomain string) (*remote.DeployResult, error) { if l.containerSvc != nil && l.configSvc != nil { route, err := l.configSvc.GetRoute(ctx, deployDomain) diff --git a/internal/adapters/in/cli/controlplane_remote.go b/internal/adapters/in/cli/controlplane_remote.go index 9b35f90a..a37b123f 100644 --- a/internal/adapters/in/cli/controlplane_remote.go +++ b/internal/adapters/in/cli/controlplane_remote.go @@ -88,6 +88,10 @@ func (r *remoteControlPlane) Reload(ctx context.Context) error { return r.client.Reload(ctx) } +func (r *remoteControlPlane) DeployIntent(ctx context.Context, imageName string) error { + return r.client.DeployIntent(ctx, imageName) +} + func (r *remoteControlPlane) Deploy(ctx context.Context, deployDomain string) (*remote.DeployResult, error) { return r.client.Deploy(ctx, deployDomain) } diff --git a/internal/adapters/in/cli/push.go b/internal/adapters/in/cli/push.go index d0e2f76e..3a20e6c5 100644 --- a/internal/adapters/in/cli/push.go +++ b/internal/adapters/in/cli/push.go @@ -154,6 +154,15 @@ func runPush(ctx context.Context, imageArg, domainFlag, tag string, build bool, } fmt.Printf("Domain: %s\n", styles.Theme.Bold.Render(pushDomain)) + // Signal the server to suppress event-based deploys for this image. + // The CLI will trigger an explicit deploy after push completes. 
+ if !noDeploy { + if err := handle.plane.DeployIntent(ctx, imageName); err != nil { + // Non-fatal: worst case we get a redundant deploy via event + fmt.Fprintf(os.Stderr, "warning: failed to register deploy intent: %v\n", err) + } + } + if build { if err := buildAndPush(ctx, version, platform, dockerfile, buildArgs, versionRef, latestRef); err != nil { return err diff --git a/internal/adapters/in/cli/remote/client.go b/internal/adapters/in/cli/remote/client.go index 9ea2ca17..25c6a0c7 100644 --- a/internal/adapters/in/cli/remote/client.go +++ b/internal/adapters/in/cli/remote/client.go @@ -724,6 +724,20 @@ func (c *Client) Deploy(ctx context.Context, deployDomain string) (*DeployResult return &result, nil } +// DeployIntent tells the server that a CLI-managed push is about to happen, +// suppressing event-based deploys for this image. +func (c *Client) DeployIntent(ctx context.Context, imageName string) error { + imageName = strings.TrimSpace(imageName) + if imageName == "" { + return fmt.Errorf("image name cannot be empty") + } + resp, err := c.requestWithRetry(ctx, http.MethodPost, "/deploy-intent/"+url.PathEscape(imageName), nil) + if err != nil { + return err + } + return parseResponse(resp, nil) +} + // Restart API // RestartResult contains the result of a restart. diff --git a/internal/adapters/in/http/admin/handler.go b/internal/adapters/in/http/admin/handler.go index 012d7548..443f1468 100644 --- a/internal/adapters/in/http/admin/handler.go +++ b/internal/adapters/in/http/admin/handler.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "net/http" + "net/url" "strconv" "strings" "time" @@ -27,6 +28,11 @@ const maxAdminRequestSize = 1 << 20 // 1MB // maxLogLines is the maximum allowed number of log lines that can be requested. const maxLogLines = 10000 +type registryDeployService interface { + in.RegistryService + in.DeployCoordinator +} + // Handler implements the HTTP handler for the admin API. 
type Handler struct { configSvc in.ConfigService @@ -37,7 +43,7 @@ type Handler struct { healthSvc in.HealthService secretSvc in.SecretService logSvc in.LogService - registrySvc in.RegistryService + registrySvc registryDeployService eventBus out.EventPublisher log zerowrap.Logger } @@ -129,7 +135,7 @@ func NewHandler( healthSvc in.HealthService, secretSvc in.SecretService, logSvc in.LogService, - registrySvc in.RegistryService, + registrySvc registryDeployService, eventBus out.EventPublisher, log zerowrap.Logger, backupSvc in.BackupService, @@ -213,6 +219,7 @@ func (h *Handler) matchRoute(path string) (routeHandler, bool) { {"/routes/by-image", h.handleRoutesByImage}, {"/routes", h.handleRoutes}, {"/secrets", h.handleSecrets}, + {"/deploy-intent", h.handleDeployIntent}, {"/deploy", h.handleDeploy}, {"/restart", h.handleRestart}, {"/tags", h.handleTags}, @@ -228,6 +235,48 @@ func (h *Handler) matchRoute(path string) (routeHandler, bool) { return nil, false } +// handleDeployIntent handles /admin/deploy-intent/:image endpoint. +// It registers a deploy intent, suppressing event-based deploys for the image. 
+func (h *Handler) handleDeployIntent(w http.ResponseWriter, r *http.Request, path string) { + if r.Method != http.MethodPost { + h.sendError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + ctx := r.Context() + if !HasAccess(ctx, domain.AdminResourceConfig, domain.AdminActionWrite) { + h.sendError(w, http.StatusForbidden, "insufficient permissions for config:write") + return + } + + if h.registrySvc == nil { + h.sendError(w, http.StatusServiceUnavailable, "registry service unavailable") + return + } + + rawName := strings.TrimPrefix(path, "/deploy-intent/") + if rawName == "" || rawName == "/deploy-intent" { + h.sendError(w, http.StatusBadRequest, "image name required") + return + } + + imageName, err := url.PathUnescape(rawName) + if err != nil { + h.sendError(w, http.StatusBadRequest, "invalid image name encoding") + return + } + + log := zerowrap.FromCtx(ctx) + log.Info().Str("image", imageName).Msg("deploy intent registered, suppressing image.pushed events") + + h.registrySvc.SuppressDeployEvent(imageName) + + h.sendJSON(w, http.StatusOK, map[string]string{ + "status": "ok", + "image": imageName, + }) +} + // sendJSON sends a JSON response. func (h *Handler) sendJSON(w http.ResponseWriter, status int, data any) { w.Header().Set("Content-Type", "application/json") @@ -1078,6 +1127,15 @@ func (h *Handler) handleDeploy(w http.ResponseWriter, r *http.Request, path stri return } + // Clear deploy event suppression now that the explicit deploy has completed. + // This re-enables event-based deploys for future direct docker pushes. + if route.Image != "" && h.registrySvc != nil { + // Use the registry package's image name normaliser so digest-form refs + // and multi-segment paths are handled correctly. 
+ imageName := registry.ExtractImageName(route.Image) + h.registrySvc.ClearDeployEventSuppression(imageName) + } + log.Info().Str("domain", deployDomain).Str("container_id", container.ID).Msg("container deployed via admin API") h.sendJSON(w, http.StatusOK, dto.DeployResponse{ Status: "deployed", diff --git a/internal/app/run.go b/internal/app/run.go index 83b5174a..ce3712dc 100644 --- a/internal/app/run.go +++ b/internal/app/run.go @@ -1187,12 +1187,14 @@ func createContainerService(ctx context.Context, v *viper.Viper, cfg Config, svc VolumePreserve: v.GetBool("volumes.preserve"), NetworkIsolation: v.GetBool("network_isolation.enabled"), NetworkPrefix: v.GetString("network_isolation.network_prefix"), - DNSSuffix: v.GetString("network_isolation.dns_suffix"), NetworkGroups: svc.configSvc.GetNetworkGroups(), Attachments: svc.configSvc.GetAttachments(), ReadinessDelay: v.GetDuration("deploy.readiness_delay"), ReadinessMode: v.GetString("deploy.readiness_mode"), HealthTimeout: v.GetDuration("deploy.health_timeout"), + StabilizationDelay: v.GetDuration("deploy.stabilization_delay"), + TCPProbeTimeout: v.GetDuration("deploy.tcp_probe_timeout"), + HTTPProbeTimeout: v.GetDuration("deploy.http_probe_timeout"), DrainDelay: v.GetDuration("deploy.drain_delay"), DrainMode: v.GetString("deploy.drain_mode"), DrainTimeout: v.GetDuration("deploy.drain_timeout"), @@ -2346,7 +2348,6 @@ func loadConfig(v *viper.Viper, configPath string) error { v.SetDefault("auto_route.enabled", false) v.SetDefault("network_isolation.enabled", false) v.SetDefault("network_isolation.network_prefix", "gordon") - v.SetDefault("network_isolation.dns_suffix", ".internal") v.SetDefault("volumes.auto_create", true) v.SetDefault("volumes.prefix", "gordon") v.SetDefault("volumes.preserve", true) @@ -2374,6 +2375,9 @@ func loadConfig(v *viper.Viper, configPath string) error { v.SetDefault("deploy.readiness_delay", "5s") v.SetDefault("deploy.readiness_mode", "auto") v.SetDefault("deploy.health_timeout", "90s") + 
v.SetDefault("deploy.stabilization_delay", "2s") + v.SetDefault("deploy.tcp_probe_timeout", "30s") + v.SetDefault("deploy.http_probe_timeout", "60s") v.SetDefault("deploy.drain_mode", "auto") v.SetDefault("deploy.drain_timeout", "30s") diff --git a/internal/boundaries/in/deploy.go b/internal/boundaries/in/deploy.go new file mode 100644 index 00000000..d75c4966 --- /dev/null +++ b/internal/boundaries/in/deploy.go @@ -0,0 +1,10 @@ +package in + +// DeployCoordinator defines deploy coordination operations. +// +// These methods let the CLI/admin API suppress image-pushed deploy events +// while an explicit deploy flow is in progress. +type DeployCoordinator interface { + SuppressDeployEvent(imageName string) + ClearDeployEventSuppression(imageName string) +} diff --git a/internal/boundaries/in/mocks/mock_deploy_coordinator.go b/internal/boundaries/in/mocks/mock_deploy_coordinator.go new file mode 100644 index 00000000..77f7350d --- /dev/null +++ b/internal/boundaries/in/mocks/mock_deploy_coordinator.go @@ -0,0 +1,110 @@ +// Code generated by mockery; DO NOT EDIT. +// github.com/vektra/mockery +// template: testify + +package mocks + +import ( + mock "github.com/stretchr/testify/mock" +) + +// NewMockDeployCoordinator creates a new instance of MockDeployCoordinator. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewMockDeployCoordinator(t interface { + mock.TestingT + Cleanup(func()) +}) *MockDeployCoordinator { + mock := &MockDeployCoordinator{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} + +// MockDeployCoordinator is an autogenerated mock type for the DeployCoordinator type. 
+type MockDeployCoordinator struct { + mock.Mock +} + +type MockDeployCoordinator_Expecter struct { + mock *mock.Mock +} + +func (_m *MockDeployCoordinator) EXPECT() *MockDeployCoordinator_Expecter { + return &MockDeployCoordinator_Expecter{mock: &_m.Mock} +} + +// ClearDeployEventSuppression provides a mock function for the type MockDeployCoordinator. +func (_mock *MockDeployCoordinator) ClearDeployEventSuppression(imageName string) { + _mock.Called(imageName) +} + +// MockDeployCoordinator_ClearDeployEventSuppression_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ClearDeployEventSuppression'. +type MockDeployCoordinator_ClearDeployEventSuppression_Call struct { + *mock.Call +} + +// ClearDeployEventSuppression is a helper method to define mock.On call. +// - imageName string +func (_e *MockDeployCoordinator_Expecter) ClearDeployEventSuppression(imageName interface{}) *MockDeployCoordinator_ClearDeployEventSuppression_Call { + return &MockDeployCoordinator_ClearDeployEventSuppression_Call{Call: _e.mock.On("ClearDeployEventSuppression", imageName)} +} + +func (_c *MockDeployCoordinator_ClearDeployEventSuppression_Call) Run(run func(imageName string)) *MockDeployCoordinator_ClearDeployEventSuppression_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run(arg0) + }) + return _c +} + +func (_c *MockDeployCoordinator_ClearDeployEventSuppression_Call) Return() *MockDeployCoordinator_ClearDeployEventSuppression_Call { + _c.Call.Return() + return _c +} + +func (_c *MockDeployCoordinator_ClearDeployEventSuppression_Call) RunAndReturn(run func(imageName string)) *MockDeployCoordinator_ClearDeployEventSuppression_Call { + _c.Run(run) + return _c +} + +// SuppressDeployEvent provides a mock function for the type MockDeployCoordinator. 
+func (_mock *MockDeployCoordinator) SuppressDeployEvent(imageName string) { + _mock.Called(imageName) +} + +// MockDeployCoordinator_SuppressDeployEvent_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'SuppressDeployEvent'. +type MockDeployCoordinator_SuppressDeployEvent_Call struct { + *mock.Call +} + +// SuppressDeployEvent is a helper method to define mock.On call. +// - imageName string +func (_e *MockDeployCoordinator_Expecter) SuppressDeployEvent(imageName interface{}) *MockDeployCoordinator_SuppressDeployEvent_Call { + return &MockDeployCoordinator_SuppressDeployEvent_Call{Call: _e.mock.On("SuppressDeployEvent", imageName)} +} + +func (_c *MockDeployCoordinator_SuppressDeployEvent_Call) Run(run func(imageName string)) *MockDeployCoordinator_SuppressDeployEvent_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run(arg0) + }) + return _c +} + +func (_c *MockDeployCoordinator_SuppressDeployEvent_Call) Return() *MockDeployCoordinator_SuppressDeployEvent_Call { + _c.Call.Return() + return _c +} + +func (_c *MockDeployCoordinator_SuppressDeployEvent_Call) RunAndReturn(run func(imageName string)) *MockDeployCoordinator_SuppressDeployEvent_Call { + _c.Run(run) + return _c +} diff --git a/internal/boundaries/in/mocks/mock_registry_service.go b/internal/boundaries/in/mocks/mock_registry_service.go index 7ee610f4..52821cb6 100644 --- a/internal/boundaries/in/mocks/mock_registry_service.go +++ b/internal/boundaries/in/mocks/mock_registry_service.go @@ -231,6 +231,46 @@ func (_c *MockRegistryService_CancelUpload_Call) RunAndReturn(run func(ctx conte return _c } +// ClearDeployEventSuppression provides a mock function for the type MockRegistryService +func (_mock *MockRegistryService) ClearDeployEventSuppression(imageName string) { + _mock.Called(imageName) + return +} + +// MockRegistryService_ClearDeployEventSuppression_Call is a *mock.Call that shadows 
Run/Return methods with type explicit version for method 'ClearDeployEventSuppression' +type MockRegistryService_ClearDeployEventSuppression_Call struct { + *mock.Call +} + +// ClearDeployEventSuppression is a helper method to define mock.On call +// - imageName string +func (_e *MockRegistryService_Expecter) ClearDeployEventSuppression(imageName interface{}) *MockRegistryService_ClearDeployEventSuppression_Call { + return &MockRegistryService_ClearDeployEventSuppression_Call{Call: _e.mock.On("ClearDeployEventSuppression", imageName)} +} + +func (_c *MockRegistryService_ClearDeployEventSuppression_Call) Run(run func(imageName string)) *MockRegistryService_ClearDeployEventSuppression_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run( + arg0, + ) + }) + return _c +} + +func (_c *MockRegistryService_ClearDeployEventSuppression_Call) Return() *MockRegistryService_ClearDeployEventSuppression_Call { + _c.Call.Return() + return _c +} + +func (_c *MockRegistryService_ClearDeployEventSuppression_Call) RunAndReturn(run func(imageName string)) *MockRegistryService_ClearDeployEventSuppression_Call { + _c.Run(run) + return _c +} + // DeleteManifest provides a mock function for the type MockRegistryService func (_mock *MockRegistryService) DeleteManifest(ctx context.Context, name string, reference string) error { ret := _mock.Called(ctx, name, reference) @@ -895,3 +935,43 @@ func (_c *MockRegistryService_StartUpload_Call) RunAndReturn(run func(ctx contex _c.Call.Return(run) return _c } + +// SuppressDeployEvent provides a mock function for the type MockRegistryService +func (_mock *MockRegistryService) SuppressDeployEvent(imageName string) { + _mock.Called(imageName) + return +} + +// MockRegistryService_SuppressDeployEvent_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'SuppressDeployEvent' +type MockRegistryService_SuppressDeployEvent_Call struct { + 
*mock.Call +} + +// SuppressDeployEvent is a helper method to define mock.On call +// - imageName string +func (_e *MockRegistryService_Expecter) SuppressDeployEvent(imageName interface{}) *MockRegistryService_SuppressDeployEvent_Call { + return &MockRegistryService_SuppressDeployEvent_Call{Call: _e.mock.On("SuppressDeployEvent", imageName)} +} + +func (_c *MockRegistryService_SuppressDeployEvent_Call) Run(run func(imageName string)) *MockRegistryService_SuppressDeployEvent_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run( + arg0, + ) + }) + return _c +} + +func (_c *MockRegistryService_SuppressDeployEvent_Call) Return() *MockRegistryService_SuppressDeployEvent_Call { + _c.Call.Return() + return _c +} + +func (_c *MockRegistryService_SuppressDeployEvent_Call) RunAndReturn(run func(imageName string)) *MockRegistryService_SuppressDeployEvent_Call { + _c.Run(run) + return _c +} diff --git a/internal/usecase/container/readiness.go b/internal/usecase/container/readiness.go new file mode 100644 index 00000000..44dafe63 --- /dev/null +++ b/internal/usecase/container/readiness.go @@ -0,0 +1,137 @@ +package container + +import ( + "context" + "errors" + "fmt" + "net" + "net/http" + "time" + + "github.com/bnema/zerowrap" +) + +var errProbeTimeout = errors.New("probe timeout") + +func probeLoop( + ctx context.Context, + deadline time.Time, + baseSleep time.Duration, + defaultAttemptTimeout time.Duration, + work func(attemptCtx context.Context) (success bool, err error), +) error { + for { + if err := ctx.Err(); err != nil { + return err + } + if time.Now().After(deadline) { + return errProbeTimeout + } + + remaining := time.Until(deadline) + attemptTimeout := defaultAttemptTimeout + if remaining < attemptTimeout { + attemptTimeout = remaining + } + + attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout) + success, err := work(attemptCtx) + cancel() + if err != nil { + return err + } + if 
success { + return nil + } + + sleepInterval := baseSleep + if remProbe := time.Until(deadline); remProbe < sleepInterval { + sleepInterval = remProbe + } + if ctxDeadline, ok := ctx.Deadline(); ok { + if remCtx := time.Until(ctxDeadline); remCtx < sleepInterval { + sleepInterval = remCtx + } + } + if sleepInterval <= 0 { + if ctx.Err() != nil { + return ctx.Err() + } + return errProbeTimeout + } + + t := time.NewTimer(sleepInterval) + select { + case <-t.C: + case <-ctx.Done(): + t.Stop() + return ctx.Err() + } + } +} + +// tcpProbe attempts a TCP connection to addr, retrying every 500ms until +// success or timeout. This is the universal fallback readiness check — +// it verifies the process is at least accepting connections. +func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { + log := zerowrap.FromCtx(ctx) + deadline := time.Now().Add(timeout) + dialer := &net.Dialer{} + attempts := 0 + var lastErr error + err := probeLoop(ctx, deadline, 500*time.Millisecond, time.Second, func(attemptCtx context.Context) (bool, error) { + conn, err := dialer.DialContext(attemptCtx, "tcp", addr) + attempts++ + if err == nil { + conn.Close() + log.Debug().Str("addr", addr).Int("attempts", attempts).Msg("TCP probe connected") + return true, nil + } + + lastErr = err + if attempts <= 3 || attempts%10 == 0 { + log.Debug().Err(err).Str("addr", addr).Int("attempt", attempts).Msg("TCP probe attempt failed") + } + return false, nil + }) + if errors.Is(err, errProbeTimeout) { + return fmt.Errorf("TCP probe timeout after %s: %s not reachable (attempts=%d, last_error=%v)", timeout, addr, attempts, lastErr) + } + return err +} + +// httpProbe performs HTTP GET requests to url, retrying every 1s until a +// 2xx/3xx response or timeout. Used when gordon.health label is set. 
+func httpProbe(ctx context.Context, url string, timeout time.Duration) error { + client := &http.Client{ + CheckRedirect: func(req *http.Request, via []*http.Request) error { + // Don't follow redirects — treat 3xx as a successful response. + return http.ErrUseLastResponse + }, + } + deadline := time.Now().Add(timeout) + var lastStatus int + err := probeLoop(ctx, deadline, time.Second, 2*time.Second, func(attemptCtx context.Context) (bool, error) { + req, reqErr := http.NewRequestWithContext(attemptCtx, http.MethodGet, url, nil) + if reqErr != nil { + return false, reqErr + } + + resp, err := client.Do(req) + if err != nil { + return false, nil + } + + lastStatus = resp.StatusCode + resp.Body.Close() + if lastStatus >= 200 && lastStatus < 400 { + return true, nil + } + + return false, nil + }) + if errors.Is(err, errProbeTimeout) { + return fmt.Errorf("HTTP probe timeout after %s: last status %d from %s", timeout, lastStatus, url) + } + return err +} diff --git a/internal/usecase/container/readiness_test.go b/internal/usecase/container/readiness_test.go new file mode 100644 index 00000000..811ba9ff --- /dev/null +++ b/internal/usecase/container/readiness_test.go @@ -0,0 +1,167 @@ +package container + +import ( + "context" + "net" + "net/http" + "net/http/httptest" + "sync/atomic" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --- TCP Probe Tests --- + +func TestTCPProbe_Success(t *testing.T) { + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + defer ln.Close() + + // Accept connections in background so the probe succeeds + go func() { + for { + conn, err := ln.Accept() + if err != nil { + return + } + conn.Close() + } + }() + + ctx := testContext() + err = tcpProbe(ctx, ln.Addr().String(), 5*time.Second) + assert.NoError(t, err) +} + +func TestTCPProbe_Timeout(t *testing.T) { + // Reserve an ephemeral port and close it immediately to get a deterministic + // 
unreachable target, avoiding assumptions about privileged ports. + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + addr := ln.Addr().String() + ln.Close() + + ctx := testContext() + err = tcpProbe(ctx, addr, 200*time.Millisecond) + assert.Error(t, err) + assert.Contains(t, err.Error(), "TCP probe timeout") +} + +func TestTCPProbe_ContextCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(testContext()) + cancel() + err := tcpProbe(ctx, "127.0.0.1:1", 5*time.Second) + assert.Error(t, err) +} + +func TestTCPProbe_DelayedListener(t *testing.T) { + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + // Close initially so probe fails at first + addr := ln.Addr().String() + ln.Close() + + // Re-open after 1 second + go func() { + time.Sleep(time.Second) + lc := net.ListenConfig{ + Control: func(network, address string, c syscall.RawConn) error { + return c.Control(func(fd uintptr) { + _ = syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_REUSEADDR, 1) + }) + }, + } + newLn, err := lc.Listen(context.Background(), "tcp", addr) + if err != nil { + t.Logf("delayed listener failed to bind: %v", err) + return + } + defer newLn.Close() + for { + conn, err := newLn.Accept() + if err != nil { + return + } + conn.Close() + } + }() + + ctx := testContext() + err = tcpProbe(ctx, addr, 5*time.Second) + assert.NoError(t, err) +} + +// --- HTTP Probe Tests --- + +func TestHTTPProbe_Success(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/healthz" { + w.WriteHeader(200) + return + } + w.WriteHeader(404) + })) + defer srv.Close() + + ctx := testContext() + err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) + assert.NoError(t, err) +} + +func TestHTTPProbe_ServerError_ThenSuccess(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt32(&calls, 1) + 
if n < 3 { + w.WriteHeader(503) + return + } + w.WriteHeader(200) + })) + defer srv.Close() + + ctx := testContext() + err := httpProbe(ctx, srv.URL+"/healthz", 10*time.Second) + assert.NoError(t, err) + assert.GreaterOrEqual(t, atomic.LoadInt32(&calls), int32(3)) +} + +func TestHTTPProbe_Timeout(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(503) + })) + defer srv.Close() + + ctx := testContext() + err := httpProbe(ctx, srv.URL+"/healthz", 500*time.Millisecond) + assert.Error(t, err) + assert.Contains(t, err.Error(), "HTTP probe timeout") +} + +func TestHTTPProbe_ContextCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(503) + })) + defer srv.Close() + + ctx, cancel := context.WithCancel(testContext()) + cancel() + err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) + assert.Error(t, err) +} + +func TestHTTPProbe_RedirectIsSuccess(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, "/other", http.StatusFound) + })) + defer srv.Close() + + ctx := testContext() + // The probe should treat 3xx as success without following the redirect + err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) + assert.NoError(t, err) +} diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 0e5cf71f..7aebc1fd 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -38,7 +38,6 @@ type Config struct { VolumePreserve bool NetworkIsolation bool NetworkPrefix string - DNSSuffix string NetworkGroups map[string][]string Attachments map[string][]string ReadinessDelay time.Duration // Delay after container starts before considering it ready @@ -48,6 +47,9 @@ type Config struct { DrainDelayConfigured bool // True when deploy.drain_delay was explicitly configured DrainMode 
string // Drain strategy: auto, inflight, delay DrainTimeout time.Duration // Max wait for in-flight requests to drain + StabilizationDelay time.Duration // Post-switch monitoring window (default 2s) + TCPProbeTimeout time.Duration // TCP probe timeout (default 30s) + HTTPProbeTimeout time.Duration // HTTP probe timeout (default 60s) } var tracer = otel.Tracer("gordon.container") @@ -257,26 +259,9 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta log := zerowrap.FromCtx(ctx) // Record deploy metrics and trace status - defer func() { - if err != nil { - span.RecordError(err) - span.SetStatus(codes.Error, err.Error()) - } - if s.metrics == nil { - return - } - attrs := metric.WithAttributes( - attribute.String("domain", route.Domain), - attribute.String("image", route.Image), - ) - s.metrics.DeployTotal.Add(ctx, 1, attrs) - s.metrics.DeployDuration.Record(ctx, time.Since(deployStart).Seconds(), attrs) - if err != nil { - s.metrics.DeployErrors.Add(ctx, 1, attrs) - } - }() + defer s.recordDeployMetrics(ctx, span, route, deployStart, &err) - existing, hasExisting := s.getTrackedContainer(route.Domain) + existing, hasExisting := s.resolveExistingContainer(ctx, route.Domain) resources, err := s.prepareDeployResources(ctx, route, existing) if err != nil { @@ -287,9 +272,15 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta // the exact same image (by Docker image ID), return it immediately. // This prevents the double-deploy caused by the event-based deploy // (triggered by image.pushed) racing with the explicit CLI deploy call. 
- if hasExisting && existing.ImageID != "" { - if skip, container := s.skipRedundantDeploy(ctx, existing, resources.actualImageRef); skip { - return container, nil + if hasExisting { + existingForSkip := existing + if existingForSkip.ImageID == "" && existingForSkip.Image != "" && normalizeImageRef(existingForSkip.Image) == normalizeImageRef(route.Image) { + existingForSkip = s.containerForRedundantCheck(ctx, existingForSkip) + } + if existingForSkip.ImageID != "" { + if skip, container := s.skipRedundantDeploy(ctx, existingForSkip, resources.actualImageRef); skip { + return container, nil + } } } @@ -299,6 +290,22 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta } invalidated := s.activateDeployedContainer(ctx, route.Domain, newContainer) + + // Post-switch stabilization: verify new container stays running + if hasExisting { + stable, stabilizeErr := s.stabilizeNewContainer(ctx, route.Domain, newContainer, existing) + if stabilizeErr != nil { + // Both old and new containers are dead; assign to named return so + // the deferred recordDeployMetrics and span see the failure. + err = stabilizeErr + return nil, err + } + if !stable { + // Rollback performed — old container is restored + return existing, nil + } + } + s.finalizePreviousContainer(ctx, route.Domain, existing, hasExisting, invalidated, newContainer.ID) // Start container log collection (non-blocking, errors don't fail deployment) @@ -315,6 +322,29 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta return newContainer, nil } +// recordDeployMetrics records span error status and deploy metrics at the end of a Deploy call. +// It is called via defer and receives a pointer to the named return error so it can observe +// the final error value after all deferred functions have run. 
+func (s *Service) recordDeployMetrics(ctx context.Context, span trace.Span, route domain.Route, start time.Time, errPtr *error) { + err := *errPtr + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + if s.metrics == nil { + return + } + attrs := metric.WithAttributes( + attribute.String("domain", route.Domain), + attribute.String("image", route.Image), + ) + s.metrics.DeployTotal.Add(ctx, 1, attrs) + s.metrics.DeployDuration.Record(ctx, time.Since(start).Seconds(), attrs) + if err != nil { + s.metrics.DeployErrors.Add(ctx, 1, attrs) + } +} + // skipRedundantDeploy checks whether the existing container is already running // the same image (by Docker image ID) as the one we are about to deploy. // When a push triggers both an event-based deploy and an explicit CLI deploy, @@ -347,6 +377,26 @@ func (s *Service) skipRedundantDeploy(ctx context.Context, existing *domain.Cont return true, existing } +func (s *Service) containerForRedundantCheck(ctx context.Context, existing *domain.Container) *domain.Container { + if existing == nil || existing.ImageID != "" || existing.ID == "" { + return existing + } + + log := zerowrap.FromCtx(ctx) + inspected, err := s.runtime.InspectContainer(ctx, existing.ID) + if err != nil { + log.Debug().Err(err).Str("container_id", existing.ID).Msg("cannot inspect existing container for redundancy check") + return existing + } + if inspected == nil || inspected.ImageID == "" { + return existing + } + + existingCopy := *existing + existingCopy.ImageID = inspected.ImageID + return &existingCopy +} + type deployResources struct { networkName string actualImageRef string @@ -355,11 +405,68 @@ type deployResources struct { volumes map[string]string } -func (s *Service) getTrackedContainer(domainName string) (*domain.Container, bool) { +// resolveExistingContainer returns the currently running container for a domain. +// It first checks the in-memory map (fast path). 
If not found, it queries the +// runtime for a running container with matching name and managed label. This +// handles cases where Gordon restarted and in-memory state is stale. +// +// The slow path checks canonical, -new, and -next names because a previous +// deploy may have been interrupted (e.g. eventbus timeout, Gordon restart) +// leaving the active container under a temp name. Canonical is preferred when +// multiple matches exist. +func (s *Service) resolveExistingContainer(ctx context.Context, domainName string) (*domain.Container, bool) { + // Fast path: check in-memory state s.mu.RLock() container, ok := s.containers[domainName] s.mu.RUnlock() - return container, ok + if ok { + return container, true + } + + // Slow path: query runtime for running containers + log := zerowrap.FromCtx(ctx) + canonicalName := fmt.Sprintf("gordon-%s", domainName) + candidateNames := map[string]bool{ + canonicalName: true, + canonicalName + "-new": true, + canonicalName + "-next": true, + } + + running, err := s.runtime.ListContainers(ctx, false) + if err != nil { + log.Warn().Err(err).Msg("failed to list running containers for existing container resolution") + return nil, false + } + + // Prefer canonical name; fall back to any running managed temp container. + var best *domain.Container + for _, c := range running { + if !candidateNames[c.Name] || c.Labels[domain.LabelManaged] != "true" { + continue + } + if best == nil || c.Name == canonicalName { + best = c + } + } + + if best != nil { + log.Info(). + Str("container_id", best.ID). + Str("container_name", best.Name). 
+ Msg("resolved existing container from runtime (in-memory state was stale)") + + // Update in-memory state so subsequent lookups are fast + s.mu.Lock() + if _, alreadyTracked := s.containers[domainName]; !alreadyTracked { + s.managedCount++ + } + s.containers[domainName] = best + s.mu.Unlock() + + return best, true + } + + return nil, false } func (s *Service) prepareDeployResources(ctx context.Context, route domain.Route, existing *domain.Container) (*deployResources, error) { @@ -438,7 +545,7 @@ func (s *Service) createStartedContainer(ctx context.Context, route domain.Route return nil, log.WrapErr(err, "failed to start container") } if !domain.IsSkipReadiness(ctx) { - if err := s.waitForReady(ctx, newContainer.ID); err != nil { + if err := s.waitForReady(ctx, newContainer.ID, containerConfig); err != nil { s.cleanupFailedContainer(ctx, newContainer.ID) return nil, log.WrapErr(err, "container failed readiness check") } @@ -480,6 +587,85 @@ func (s *Service) activateDeployedContainer(ctx context.Context, domainName stri return false } +// stabilizeNewContainer monitors the new container briefly after traffic switch. +// If it crashes during this window, rolls back to old container. +// Returns true if stabilization succeeded, false if rollback was performed. +// Returns an error only when both new and old containers are dead. +func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, newContainer, oldContainer *domain.Container) (bool, error) { + log := zerowrap.FromCtx(ctx) + + if oldContainer == nil { + return true, nil + } + + s.mu.RLock() + delay := s.config.StabilizationDelay + s.mu.RUnlock() + if delay == 0 { + delay = 2 * time.Second + } + + // Brief stabilization: verify new container is still running after delay + select { + case <-time.After(delay): + case <-ctx.Done(): + return true, nil + } + + running, err := s.runtime.IsContainerRunning(ctx, newContainer.ID) + if err != nil || !running { + log.Error(). 
+ Str("new_container", newContainer.ID). + Str("old_container", oldContainer.ID). + Msg("new container crashed during stabilization, rolling back to old") + + // Verify old container is still running before restoring it. + // If both old and new are dead (e.g., OOM), returning a dead + // container as "existing" would leave the domain in a broken state. + oldRunning, oldErr := s.runtime.IsContainerRunning(ctx, oldContainer.ID) + if oldErr != nil || !oldRunning { + log.Error(). + Str("old_container", oldContainer.ID). + Msg("old container is also not running, cannot rollback") + + // Cleanup failed new container + if stopErr := s.runtime.StopContainer(ctx, newContainer.ID); stopErr != nil { + log.WrapErrWithFields(stopErr, "failed to stop failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + if removeErr := s.runtime.RemoveContainer(ctx, newContainer.ID, true); removeErr != nil { + log.WrapErrWithFields(removeErr, "failed to remove failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + + return false, fmt.Errorf("stabilization failed: new container crashed and old container %s is also not running", oldContainer.ID) + } + + // Rollback: restore old container as tracked + s.mu.Lock() + s.containers[domainName] = oldContainer + s.mu.Unlock() + + // Re-invalidate proxy cache to point back to old + s.mu.RLock() + inv := s.cacheInvalidator + s.mu.RUnlock() + if inv != nil { + inv.InvalidateTarget(ctx, domainName) + } + + // Cleanup failed new container + if stopErr := s.runtime.StopContainer(ctx, newContainer.ID); stopErr != nil { + log.WrapErrWithFields(stopErr, "failed to stop failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + if removeErr := s.runtime.RemoveContainer(ctx, newContainer.ID, true); removeErr != nil { + log.WrapErrWithFields(removeErr, "failed to remove failed new container during rollback", 
map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + + return false, nil + } + + return true, nil +} + func (s *Service) finalizePreviousContainer(ctx context.Context, domainName string, existing *domain.Container, hasExisting, invalidated bool, newContainerID string) { if !hasExisting { return @@ -1587,7 +1773,19 @@ func (s *Service) cleanupOrphanedContainers(ctx context.Context, domainName stri for _, c := range allContainers { if (c.Name == expectedName || c.Name == expectedNewName || c.Name == expectedNextName) && c.ID != skipContainerID { - log.Info().Str(zerowrap.FieldEntityID, c.ID).Str(zerowrap.FieldStatus, c.Status).Msg("found orphaned container, removing") + // Skip any running container regardless of name — a running + // temp container may be actively serving traffic while the + // new container stabilizes, or may be from a concurrent deploy. + if c.Status == "running" { + log.Debug(). + Str(zerowrap.FieldEntityID, c.ID). + Str("container_name", c.Name). + Str(zerowrap.FieldStatus, c.Status). + Msg("skipping running container during orphan cleanup") + continue + } + + log.Info().Str(zerowrap.FieldEntityID, c.ID).Str("container_name", c.Name).Str(zerowrap.FieldStatus, c.Status).Msg("found orphaned container, removing") if err := s.runtime.StopContainer(ctx, c.ID); err != nil { log.WrapErrWithFields(err, "failed to stop orphaned container", map[string]any{zerowrap.FieldEntityID: c.ID}) @@ -2065,7 +2263,7 @@ func (s *Service) findContainerByName(ctx context.Context, name string) *domain. 
return nil } -func (s *Service) waitForReady(ctx context.Context, containerID string) error { +func (s *Service) waitForReady(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig) error { if err := s.pollContainerRunning(ctx, containerID); err != nil { return err } @@ -2092,15 +2290,123 @@ func (s *Service) waitForReady(ctx context.Context, containerID string) error { } return s.waitForHealthy(ctx, containerID, cfg.HealthTimeout) default: // auto - _, hasHealthcheck, err := s.runtime.GetContainerHealthStatus(ctx, containerID) - if err != nil { - return err - } - if hasHealthcheck { - return s.waitForHealthy(ctx, containerID, cfg.HealthTimeout) - } - return s.waitForReadyByDelay(ctx, containerID) + return s.readinessCascade(ctx, containerID, containerConfig, cfg) + } +} + +// readinessCascade auto-detects the strongest available readiness signal: +// 1. Docker healthcheck (if present) → wait for healthy status +// 2. HTTP probe (if gordon.health label set) → GET until 2xx/3xx +// 3. TCP probe (if port info available) → connect until accepted +// 4. Delay fallback (last resort) → waitForReadyByDelay +func (s *Service) readinessCascade(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig, cfg Config) error { + log := zerowrap.FromCtx(ctx) + + // 1. Docker healthcheck + _, hasHealthcheck, err := s.runtime.GetContainerHealthStatus(ctx, containerID) + if err != nil { + return err + } + if hasHealthcheck { + log.Info().Msg("readiness cascade: using Docker healthcheck") + return s.waitForHealthy(ctx, containerID, cfg.HealthTimeout) + } + + // 2. HTTP probe via gordon.health label + if probed, probeErr := s.tryHTTPProbe(ctx, containerID, containerConfig, cfg); probed { + return probeErr + } + + // 3. TCP probe (if port info available) + if probed, probeErr := s.tryTCPProbe(ctx, containerID, containerConfig, cfg); probed { + return probeErr + } + + // 4. 
Delay fallback + log.Info().Msg("readiness cascade: using delay fallback") + return s.waitForReadyByDelay(ctx, containerID) +} + +// tryHTTPProbe attempts an HTTP probe if the gordon.health label is set. +// Returns (true, err) if the probe was attempted, (false, nil) if skipped. +func (s *Service) tryHTTPProbe(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig, cfg Config) (bool, error) { + log := zerowrap.FromCtx(ctx) + if containerConfig == nil { + return false, nil + } + healthPath, ok := containerConfig.Labels[domain.LabelHealth] + if !ok || healthPath == "" { + return false, nil + } + ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID) + if probeErr != nil || ip == "" || port <= 0 { + log.Debug().Err(probeErr).Msg("readiness cascade: HTTP probe skipped, could not resolve container endpoint") + return false, nil + } + url := fmt.Sprintf("http://%s:%d%s", ip, port, healthPath) + timeout := cfg.HTTPProbeTimeout + if timeout == 0 { + timeout = 60 * time.Second + } + log.Info().Str("url", url).Dur("timeout", timeout).Msg("readiness cascade: using HTTP probe") + return true, httpProbe(ctx, url, timeout) +} + +// tryTCPProbe attempts a TCP probe if port info is available. +// Returns (true, err) if the probe was attempted, (false, nil) if skipped. 
+func (s *Service) tryTCPProbe(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig, cfg Config) (bool, error) {
+	log := zerowrap.FromCtx(ctx)
+	if containerConfig == nil || len(containerConfig.Ports) == 0 {
+		return false, nil
+	}
+	ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID)
+	if probeErr != nil || ip == "" || port <= 0 {
+		log.Debug().Err(probeErr).Msg("readiness cascade: TCP probe skipped, could not resolve container endpoint")
+		return false, nil
+	}
+	addr := fmt.Sprintf("%s:%d", ip, port)
+	timeout := cfg.TCPProbeTimeout
+	if timeout == 0 {
+		timeout = 30 * time.Second
+	}
+	log.Info().Str("addr", addr).Dur("timeout", timeout).Msg("readiness cascade: using TCP probe")
+	return true, tcpProbe(ctx, addr, timeout)
+}
+
+// resolveContainerEndpoint returns a host-reachable address for probing.
+// In rootless podman/Docker setups, container internal IPs are not routable
+// from the host. We use the host port binding (127.0.0.1:<host_port>)
+// which is always reachable. Falls back to the container's internal IP
+// only if no host port mapping exists (e.g. host-network mode).
+func (s *Service) resolveContainerEndpoint(ctx context.Context, containerID string) (string, int, error) {
+	log := zerowrap.FromCtx(ctx)
+
+	// First, get the container's internal port from network info
+	_, internalPort, err := s.runtime.GetContainerNetworkInfo(ctx, containerID)
+	if err != nil {
+		return "", 0, err
+	}
+
+	// Try to resolve via host port binding (works in rootless podman/Docker)
+	hostPort, hostErr := s.runtime.GetContainerPort(ctx, containerID, internalPort)
+	if hostErr == nil && hostPort > 0 {
+		log.Debug().
+			Int("internal_port", internalPort).
+			Int("host_port", hostPort).
+ Msg("resolved container endpoint via host port binding") + return "127.0.0.1", hostPort, nil + } + + // Fallback: use internal IP (works in Docker rootful / host network) + ip, _, fallbackErr := s.runtime.GetContainerNetworkInfo(ctx, containerID) + if fallbackErr != nil { + return "", 0, fallbackErr } + log.Debug(). + Str("ip", ip). + Int("port", internalPort). + Msg("resolved container endpoint via internal IP (no host port binding)") + return ip, internalPort, nil } // waitForReadyByDelay waits using the legacy running+delay strategy. diff --git a/internal/usecase/container/service_readiness_test.go b/internal/usecase/container/service_readiness_test.go index 1e613376..a7c3e2b6 100644 --- a/internal/usecase/container/service_readiness_test.go +++ b/internal/usecase/container/service_readiness_test.go @@ -1,16 +1,31 @@ package container import ( + "net" "strings" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" "github.com/bnema/gordon/internal/boundaries/out/mocks" + "github.com/bnema/gordon/internal/domain" ) +// reserveEphemeralAddr binds on 127.0.0.1:0, captures the assigned port, closes +// the listener, and returns the address string. The port is guaranteed to be +// unreachable (no process listening) when the probe runs. 
+func reserveEphemeralAddr(t *testing.T) (ip string, port int) { + t.Helper() + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + addr := ln.Addr().(*net.TCPAddr) + ln.Close() + return addr.IP.String(), addr.Port +} + func TestService_WaitForReady_AutoFallsBackToDelayWhenNoHealthcheck(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) svc := NewService(runtime, nil, nil, nil, Config{ @@ -28,7 +43,166 @@ func TestService_WaitForReady_AutoFallsBackToDelayWhenNoHealthcheck(t *testing.T // Delay-mode fallback verification runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() - err := svc.waitForReady(ctx, containerID) + err := svc.waitForReady(ctx, containerID, nil) + assert.NoError(t, err) +} + +func TestService_WaitForReady_AutoCascadeUsesTCPWhenNoHealthcheckAndNoHealthLabel(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + }) + + ctx := testContext() + containerID := "container-1" + + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{}, + } + + // Reserve an ephemeral loopback port that is guaranteed to be unreachable. + probeIP, probePort := reserveEphemeralAddr(t) + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No Docker healthcheck + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() + // Cascade resolves container endpoint for TCP probe + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return(probeIP, probePort, nil).Once() + // Host port binding resolution — return the same loopback addr so probe hits it + runtime.EXPECT().GetContainerPort(mock.Anything, containerID, probePort).Return(probePort, nil).Once() + + // TCP probe will try to connect — which will fail since the port is closed. 
+ // Use a short health timeout so the test doesn't hang. + svc.mu.Lock() + svc.config.TCPProbeTimeout = 50 * time.Millisecond + svc.mu.Unlock() + + err := svc.waitForReady(ctx, containerID, containerConfig) + // Expect TCP probe timeout (no server listening) + assert.Error(t, err) + assert.Contains(t, err.Error(), "TCP probe timeout") +} + +func TestService_WaitForReady_AutoCascadeUsesHTTPProbeWhenHealthLabelSet(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + HTTPProbeTimeout: 50 * time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{ + domain.LabelHealth: "/healthz", + }, + } + + // Reserve an ephemeral loopback port that is guaranteed to be unreachable. + probeIP, probePort := reserveEphemeralAddr(t) + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No Docker healthcheck + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() + // Cascade resolves container endpoint for HTTP probe + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return(probeIP, probePort, nil).Once() + // Host port binding resolution — return the same loopback addr so probe hits it + runtime.EXPECT().GetContainerPort(mock.Anything, containerID, probePort).Return(probePort, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) + // Expect HTTP probe timeout (no server listening) + assert.Error(t, err) + assert.Contains(t, err.Error(), "HTTP probe timeout") +} + +func TestService_WaitForReady_AutoCascadeUsesHealthcheckWhenPresent(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + HealthTimeout: 50 * time.Millisecond, + }) + + ctx := 
testContext() + containerID := "container-1" + + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{ + domain.LabelHealth: "/healthz", + }, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // Docker healthcheck IS present — cascade detects it, then waitForHealthy polls it + // First call: cascade detection (hasHealthcheck=true) + // Second call: waitForHealthy loop (status=healthy) + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("starting", true, nil).Once() + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("healthy", true, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) + assert.NoError(t, err) +} + +func TestService_WaitForReady_AutoCascadeFallsToDelayWhenNoEndpoint(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + ReadinessDelay: time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + // Container has ports but network info is unavailable + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{}, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No Docker healthcheck + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() + // Network info fails — can't do TCP probe, fall through to delay + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return("", 0, assert.AnError).Once() + // Delay-mode fallback verification + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) + assert.NoError(t, err) +} + +func TestService_WaitForReady_ExplicitDelayModeIgnoresCascade(t 
*testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "delay", + ReadinessDelay: time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + // Even with health label and ports, "delay" mode skips cascade entirely + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{ + domain.LabelHealth: "/healthz", + }, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No GetContainerHealthStatus call — delay mode skips it + // Delay-mode verification + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) assert.NoError(t, err) } diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index fae2eeb8..c1a9feb9 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -25,6 +25,17 @@ func testContext() context.Context { return zerowrap.WithCtx(context.Background(), zerowrap.Default()) } +// testMinDelayConfig returns a Config with all timing delays set to 1ms, +// suitable for unit tests that don't want to wait for real timeouts. 
+func testMinDelayConfig() Config { + return Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } +} + func setupMetricsTest(t *testing.T) (*telemetry.Metrics, *sdkmetric.ManualReader) { t.Helper() @@ -184,10 +195,11 @@ func TestService_Deploy_Success(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - NetworkIsolation: false, - VolumeAutoCreate: false, - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + NetworkIsolation: false, + VolumeAutoCreate: false, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -198,6 +210,9 @@ func TestService_Deploy_Success(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Setup mocks - no orphaned containers runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) @@ -261,10 +276,11 @@ func TestService_Deploy_ReadinessRecoveryWindow_AllowsTransientFlap(t *testing.T eventBus := mocks.NewMockEventPublisher(t) config := Config{ - NetworkIsolation: false, - VolumeAutoCreate: false, - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + NetworkIsolation: false, + VolumeAutoCreate: false, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -275,6 +291,9 @@ func TestService_Deploy_ReadinessRecoveryWindow_AllowsTransientFlap(t *testing.T Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + 
runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) runtime.EXPECT().PullImage(mock.Anything, "myapp:latest").Return(nil) @@ -329,6 +348,9 @@ func TestService_Deploy_ImagePullFailure(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) runtime.EXPECT().PullImage(mock.Anything, "myapp:latest").Return(errors.New("image not found")) @@ -406,8 +428,10 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -445,8 +469,8 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - // Wait for ready - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) // Inspect after ready runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ @@ -460,7 +484,7 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { // Synchronous cache invalidation before old container cleanup 
cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() - // Now cleanup old container (after new one is ready + cache invalidated + drain delay) + // Now cleanup old container (after new one is ready + cache invalidated + stabilization + drain delay) runtime.EXPECT().StopContainer(mock.Anything, "old-container").Return(nil) runtime.EXPECT().RemoveContainer(mock.Anything, "old-container", true).Return(nil) @@ -471,10 +495,106 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "new-container", result.ID) +} + +// TestService_Deploy_ResolvesExistingFromRuntime_WhenMemoryStale verifies that +// when in-memory state has no tracked container for a domain, Deploy queries +// the runtime and discovers the running container. This prevents orphan cleanup +// from killing the real active container after a Gordon restart. +func TestService_Deploy_ResolvesExistingFromRuntime_WhenMemoryStale(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := testMinDelayConfig() + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // NO tracked container in memory — simulates Gordon restart + // (svc.containers is empty for "test.example.com") + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // resolveExistingContainer should query running containers from runtime + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{ + { + ID: "runtime-active-container", + Name: "gordon-test.example.com", + Status: "running", + Labels: map[string]string{ + domain.LabelDomain: "test.example.com", + domain.LabelManaged: "true", + }, + }, + }, nil) + + // Orphan cleanup lists ALL containers (including stopped) 
— the runtime-active + // container should be skipped because it is now recognized as the existing one + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ + { + ID: "runtime-active-container", + Name: "gordon-test.example.com", + Status: "running", + Labels: map[string]string{ + domain.LabelDomain: "test.example.com", + domain.LabelManaged: "true", + }, + }, + }, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container with -new suffix (zero-downtime: existing was resolved from runtime) + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Status: "running", + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // Synchronous cache invalidation + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() + + // AFTER new container is ready: stop and remove old 
runtime-discovered container + // This is the key assertion — StopContainer on the runtime-active container + // happens during finalizePreviousContainer, NOT during orphan cleanup + runtime.EXPECT().StopContainer(mock.Anything, "runtime-active-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "runtime-active-container", true).Return(nil) + + // Rename new container to canonical name + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.Equal(t, "new-container", result.ID) - // Ensure the service's tracked container entry now points to the new container. - tracked, ok := svc.Get(ctx, "test.example.com") - assert.True(t, ok) + // Verify new container is now tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) assert.Equal(t, "new-container", tracked.ID) } @@ -483,10 +603,7 @@ func TestService_Deploy_SkipRedundantDeploy_GetImageIDError(t *testing.T) { envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -519,7 +636,8 @@ func TestService_Deploy_SkipRedundantDeploy_GetImageIDError(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) 
runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -542,10 +660,11 @@ func TestService_Deploy_WithNetworkIsolation(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - NetworkIsolation: true, - NetworkPrefix: "gordon", - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + NetworkIsolation: true, + NetworkPrefix: "gordon", + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -555,6 +674,9 @@ func TestService_Deploy_WithNetworkIsolation(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil) @@ -594,10 +716,11 @@ func TestService_Deploy_WithVolumeAutoCreate(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - VolumeAutoCreate: true, - VolumePrefix: "gordon", - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + VolumeAutoCreate: true, + VolumePrefix: "gordon", + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -607,6 +730,9 @@ func TestService_Deploy_WithVolumeAutoCreate(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, 
false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil) @@ -1312,6 +1438,7 @@ func TestService_Deploy_InternalDeployForcesPull(t *testing.T) { InternalRegistryPassword: "secret", ReadinessDelay: time.Millisecond, DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -1324,6 +1451,9 @@ func TestService_Deploy_InternalDeployForcesPull(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Setup mocks - no orphaned containers runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) @@ -1379,8 +1509,9 @@ func TestService_AutoStart_StartsNewContainers(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1390,6 +1521,9 @@ func TestService_AutoStart_StartsNewContainers(t *testing.T) { {Domain: "app2.example.com", Image: "myapp2:latest"}, } + // No container in memory — runtime resolution returns nothing (one per route) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil).Times(2) + // Setup mocks for route deployments runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil).Times(2) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp1:latest", "myapp2:latest"}, nil).Times(2) @@ -1422,8 +1556,9 @@ func 
TestService_AutoStart_SkipsExistingContainers(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1438,6 +1573,9 @@ func TestService_AutoStart_SkipsExistingContainers(t *testing.T) { {Domain: "app2.example.com", Image: "myapp2:latest"}, // New route } + // No container in memory for app2 — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil).Once() + // Only deploy for app2 (app1 is skipped). Readiness is skipped — no IsContainerRunning. runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil).Once() runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp2:latest"}, nil).Once() @@ -1472,6 +1610,9 @@ func TestService_AutoStart_HandlesDeployErrors(t *testing.T) { {Domain: "app1.example.com", Image: "myapp1:latest"}, } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Setup mocks for failure runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) @@ -1490,10 +1631,11 @@ func TestService_AutoStart_UsesInternalDeployContext(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - RegistryDomain: "reg.example.com", - RegistryPort: 5000, - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + RegistryDomain: "reg.example.com", + RegistryPort: 5000, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -1504,6 +1646,9 @@ func 
TestService_AutoStart_UsesInternalDeployContext(t *testing.T) { {Domain: "app1.example.com", Image: "reg.example.com/myapp:latest"}, } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Key assertion: PullImage should be called with localhost:5000 rewrite, // NOT the original reg.example.com/myapp:latest. // Readiness is skipped — no IsContainerRunning calls. @@ -1538,8 +1683,10 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -1585,8 +1732,8 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - // Wait for ready - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) // Inspect after ready runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ @@ -1619,16 +1766,18 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { assert.Equal(t, "new-container", tracked.ID) } -// TestService_Deploy_OrphanCleanupRemovesTrueOrphans verifies that containers -// with the same name but NOT tracked are properly removed as orphans. 
+// TestService_Deploy_OrphanCleanupRemovesTrueOrphans verifies that stopped canonical +// containers with the same name but NOT tracked are properly removed as orphans. +// Running canonical containers are preserved (see NeverKillsRunningCanonical test). func TestService_Deploy_OrphanCleanupRemovesTrueOrphans(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1640,17 +1789,21 @@ func TestService_Deploy_OrphanCleanupRemovesTrueOrphans(t *testing.T) { Image: "myapp:v1", } - // ListContainers returns an orphaned container (same name, but not tracked) - // This could happen if Gordon crashed and restarted, or container was created manually + // No container in memory — runtime resolution finds no managed container + // (the orphan is stopped so it won't appear in running-only query) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + + // ListContainers(true) returns a STOPPED orphaned container (same name, but not tracked) + // Stopped canonical containers are safe to remove during orphan cleanup runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ { ID: "orphan-container", Name: "gordon-test.example.com", - Status: "running", + Status: "exited", }, }, nil) - // Orphan should be stopped and removed BEFORE we proceed + // Stopped orphan should be stopped and removed BEFORE we proceed runtime.EXPECT().StopContainer(mock.Anything, "orphan-container").Return(nil) runtime.EXPECT().RemoveContainer(mock.Anything, "orphan-container", true).Return(nil) @@ -1700,10 +1853,7 @@ func 
TestService_Deploy_OrphanCleanupRemovesStaleNewContainer(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -1746,7 +1896,8 @@ func TestService_Deploy_OrphanCleanupRemovesStaleNewContainer(t *testing.T) { return cfg.Name == "gordon-test.example.com-new" })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", @@ -1772,10 +1923,7 @@ func TestService_Deploy_TrackedTempContainerUsesAlternateTempName(t *testing.T) eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -1811,7 +1959,8 @@ func TestService_Deploy_TrackedTempContainerUsesAlternateTempName(t *testing.T) return cfg.Name == "gordon-test.example.com-next" })).Return(created, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, 
"new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", @@ -1961,10 +2110,7 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -1982,6 +2128,9 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { // First deploy: no existing container → creates gordon-test.example.com // Second deploy: sees first container → creates gordon-test.example.com-new + // First deploy resolves from runtime (no container in memory yet) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil).Once() + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil).Times(2) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil).Times(2) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil).Times(2) @@ -2005,8 +2154,8 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { }) runtime.EXPECT().StartContainer(mock.Anything, mock.AnythingOfType("string")).Return(nil).Times(2) - // IsContainerRunning is called 2x per deploy in waitForReady (initial check + after delay) - runtime.EXPECT().IsContainerRunning(mock.Anything, mock.AnythingOfType("string")).Return(true, nil).Times(4) + // IsContainerRunning: 2x per deploy in waitForReady + 1x stabilization for the second deploy (which has existing) + runtime.EXPECT().IsContainerRunning(mock.Anything, mock.AnythingOfType("string")).Return(true, nil).Times(5) runtime.EXPECT().InspectContainer(mock.Anything, 
mock.AnythingOfType("string")). RunAndReturn(func(_ context.Context, id string) (*domain.Container, error) { @@ -2069,8 +2218,9 @@ func TestService_Deploy_ContextCancellation(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -2097,10 +2247,7 @@ func TestService_Deploy_CacheInvalidationBeforeOldContainerStop(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -2132,7 +2279,8 @@ func TestService_Deploy_CacheInvalidationBeforeOldContainerStop(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2176,10 +2324,7 @@ func TestService_Deploy_NilCacheInvalidator(t *testing.T) { envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - 
} + config := testMinDelayConfig() // Intentionally NOT setting cache invalidator svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2205,7 +2350,8 @@ func TestService_Deploy_NilCacheInvalidator(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2226,8 +2372,9 @@ func TestService_Deploy_SkipsRedundantDeploy(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2273,16 +2420,68 @@ func TestService_Deploy_SkipsRedundantDeploy(t *testing.T) { assert.Equal(t, "existing-container", tracked.ID) } -func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { +func TestService_Deploy_SkipsRedundantDeploy_WhenTrackedImageIDMissing(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + 
DrainDelayConfigured: true, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // Pre-populate with existing container missing ImageID (common after stale in-memory state). + existingContainer := &domain.Container{ + ID: "existing-container", + Name: "gordon-test.example.com", + Image: "myapp:latest", + Status: "running", } + svc.containers["test.example.com"] = existingContainer + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:latest", + } + + // prepareDeployResources + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil) + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:latest").Return([]string{}, nil) + + // Existing container image ID is recovered from runtime inspection, + // then redundancy check can skip the deploy. 
+ runtime.EXPECT().InspectContainer(mock.Anything, "existing-container").Return(&domain.Container{ + ID: "existing-container", + ImageID: "sha256:abc123", + Status: "running", + }, nil) + runtime.EXPECT().GetImageID(mock.Anything, "myapp:latest").Return("sha256:abc123", nil) + runtime.EXPECT().IsContainerRunning(mock.Anything, "existing-container").Return(true, nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "existing-container", result.ID) + + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "existing-container", tracked.ID) +} + +func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -2316,7 +2515,8 @@ func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2380,10 +2580,7 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { envLoader := 
mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2417,7 +2614,8 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2433,3 +2631,527 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "new-container", result.ID) } + +// TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical verifies that orphan +// cleanup never stops a running container with the canonical name. Only stopped +// canonical containers and temp containers (-new/-next) should be cleaned up. +// The active canonical container is resolved via resolveExistingContainer (it has the +// managed label), so the new container is created with the "-new" suffix — Docker +// disallows creating a container with the same name as an existing running container. 
+func TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := testMinDelayConfig() + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // The canonical container is running and has the managed label — it will be + // discovered by resolveExistingContainer's slow path (no in-memory state). + existingContainer := &domain.Container{ + ID: "active-canonical", + Name: "gordon-test.example.com", + Status: "running", + Labels: map[string]string{domain.LabelManaged: "true"}, + } + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // resolveExistingContainer: slow path finds the running canonical container via label. + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{existingContainer}, nil) + + // cleanupOrphanedContainers: finds the running canonical AND a stale -new leftover. + // The canonical container must NOT be killed because it's running. + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ + existingContainer, + { + ID: "stale-leftover", + Name: "gordon-test.example.com-new", + Status: "exited", + }, + }, nil) + + // Only the stale -new container should be stopped and removed during orphan cleanup. 
+ runtime.EXPECT().StopContainer(mock.Anything, "stale-leftover").Return(nil).Once() + runtime.EXPECT().RemoveContainer(mock.Anything, "stale-leftover", true).Return(nil).Once() + + // Image operations: pull new image (existing.Image is empty so redundancy check is skipped) + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create container — existing is the canonical container, so the new one gets "-new" suffix. + // Docker enforces unique container names; using "-new" avoids a name collision. + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Wait for ready (delay mode): pollContainerRunning (1×) + waitForReadyByDelay (1×) + // Post-switch stabilization: 1× — total 3 calls. + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Status: "running", + Ports: []int{8080}, + }, nil) + + // Rename new container to canonical name after stabilization. + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil).Once() + + // Stop and remove old canonical container. 
+ runtime.EXPECT().StopContainer(mock.Anything, "active-canonical").Return(nil).Once() + runtime.EXPECT().RemoveContainer(mock.Anything, "active-canonical", true).Return(nil).Once() + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.Equal(t, "new-container", result.ID) + + // Verify new container is tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "new-container", tracked.ID) +} + +// TestService_Deploy_RollbackOnPostSwitchCrash verifies that if the new container crashes +// during the post-switch stabilization window, traffic is automatically rolled back to the +// old container: the old container is restored as tracked, the proxy cache is re-invalidated, +// and the failed new container is cleaned up. +func TestService_Deploy_RollbackOnPostSwitchCrash(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // Pre-populate with existing container + existing := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existing + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + 
runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness: new container is running during readiness checks (2 calls) + // Stabilization: new container has crashed (returns false) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(false, nil).Once() + + // Rollback: verify old container is still running before restoring + runtime.EXPECT().IsContainerRunning(mock.Anything, "old-container").Return(true, nil).Once() + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Name: "gordon-test.example.com-new", + Status: "running", + }, nil) + + // Publish event (happens during activateDeployedContainer) + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // InvalidateTarget called TWICE: once during activate, once during rollback + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return().Times(2) + + // Cleanup failed new container during rollback + runtime.EXPECT().StopContainer(mock.Anything, "new-container").Return(nil) + 
runtime.EXPECT().RemoveContainer(mock.Anything, "new-container", true).Return(nil) + + // StopContainer on old-container should NOT be called (old stays running) + // (no expectation for StopContainer("old-container")) + + result, err := svc.Deploy(ctx, route) + + // Deploy returns the old (existing) container after rollback + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "old-container", result.ID, "should return old container after rollback") + + // Verify old container is restored as tracked + trackedRB, existsRB := svc.Get(ctx, "test.example.com") + assert.True(t, existsRB) + assert.Equal(t, "old-container", trackedRB.ID, "old container should be restored after rollback") +} + +// TestService_Deploy_StabilizationSuccess verifies that when the new container remains +// healthy during the stabilization window, deployment proceeds normally and the old +// container is finalized (stopped and removed). +func TestService_Deploy_StabilizationSuccess(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // Pre-populate with existing container + existing := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existing + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + 
runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness (2 calls) + stabilization check (1 call) — all return running + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Name: "gordon-test.example.com-new", + Status: "running", + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // Cache invalidation (once, during activate) + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() + + // Old container finalized normally + runtime.EXPECT().StopContainer(mock.Anything, "old-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "old-container", true).Return(nil) + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "new-container", result.ID, "should return new container on successful stabilization") + + // Verify new container is tracked + trackedSS, 
existsSS := svc.Get(ctx, "test.example.com") + assert.True(t, existsSS) + assert.Equal(t, "new-container", trackedSS.ID) +} + +// TestService_Deploy_ZeroDowntime_OldNeverStoppedBeforeNewReady verifies the strict +// ordering invariant: StopContainer("old-container") must NEVER happen before +// InspectContainer("new-container") completes (the last step of readiness). +// Uses testify's NotBefore() to enforce mock call ordering. +func TestService_Deploy_ZeroDowntime_OldNeverStoppedBeforeNewReady(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // Pre-populate with existing tracked container + existingContainer := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existingContainer + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container with -new suffix for zero-downtime + newContainer := &domain.Container{ID: 
"new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness: pollContainerRunning (1 call) + waitForReadyByDelay verification (1 call) + // Stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // KEY ORDERING CONSTRAINT: InspectContainer("new-container") is the last step + // of readiness inside createStartedContainer. StopContainer("old-container") + // must happen AFTER this call completes. + inspectCall := runtime.EXPECT().InspectContainer(mock.Anything, "new-container"). + Return(&domain.Container{ + ID: "new-container", + Name: "gordon-test.example.com-new", + Status: "running", + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // Synchronous cache invalidation + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() + + // STRICT ORDERING: StopContainer on old container must NOT happen before + // InspectContainer on new container (readiness completion) + runtime.EXPECT().StopContainer(mock.Anything, "old-container"). + Return(nil). 
+ NotBefore(inspectCall.Call) + + runtime.EXPECT().RemoveContainer(mock.Anything, "old-container", true).Return(nil) + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "new-container", result.ID) + + // Verify new container is tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "new-container", tracked.ID) +} + +// TestService_Deploy_ZeroDowntime_ReadinessFailure_OldUntouched verifies that when a +// new container fails readiness (IsContainerRunning returns false), the old container +// is completely untouched: never stopped, never removed. Deploy returns an error and +// the old container remains as the tracked container. +func TestService_Deploy_ZeroDowntime_ReadinessFailure_OldUntouched(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // Pre-populate with existing tracked container + existingContainer := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existingContainer + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, 
nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness FAILS: pollContainerRunning succeeds (container starts) but + // waitForReadyByDelay verification finds it not running. + // pollContainerRunning: returns true (container initially starts) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Once() + // waitForReadyByDelay post-delay verification: returns false (container crashed) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(false, nil).Once() + // Recovery window poll: return error to fail fast instead of waiting 30s + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container"). + Return(false, errors.New("container exited")).Once() + + // cleanupFailedContainer: stop and remove the failed new container + runtime.EXPECT().StopContainer(mock.Anything, "new-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "new-container", true).Return(nil) + + // NOTE: StopContainer("old-container") and RemoveContainer("old-container", true) + // are intentionally NOT mocked. If either is called, testify will panic with + // "unexpected method call" — verifying the old container is never touched. 
+ + result, err := svc.Deploy(ctx, route) + + // Deploy should return an error (readiness failure) + assert.Error(t, err) + assert.Nil(t, result) + assert.Contains(t, err.Error(), "readiness check") + + // Old container must still be tracked — completely untouched + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists, "old container should still be tracked after readiness failure") + assert.Equal(t, "old-container", tracked.ID, "old container ID should be unchanged") +} + +// TestService_Deploy_ZeroDowntime_NoExisting_FirstDeploy is a sanity check: +// first deploy with no existing container should work normally without any +// finalize/drain logic being triggered. +func TestService_Deploy_ZeroDowntime_NoExisting_FirstDeploy(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // NO existing container — this is a first deploy + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // resolveExistingContainer: no container in memory, runtime returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + + // Cleanup orphans — nothing found + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + 
runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create container — canonical name (no -new suffix since no existing container) + newContainer := &domain.Container{ID: "first-container", Name: "gordon-test.example.com", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "first-container").Return(nil) + + // Readiness: pollContainerRunning (1 call) + waitForReadyByDelay verification (1 call) + // No stabilization check for first deploy (hasExisting is false) + runtime.EXPECT().IsContainerRunning(mock.Anything, "first-container").Return(true, nil).Times(2) + + // Inspect after readiness + runtime.EXPECT().InspectContainer(mock.Anything, "first-container").Return(&domain.Container{ + ID: "first-container", + Name: "gordon-test.example.com", + Status: "running", + Ports: []int{8080}, + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // NOTE: No StopContainer, RemoveContainer, or RenameContainer expectations — + // first deploy has no previous container to finalize, and no stabilization check. 
+ + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "first-container", result.ID) + assert.Equal(t, "running", result.Status) + + // Verify container is tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "first-container", tracked.ID) +} diff --git a/internal/usecase/proxy/service_test.go b/internal/usecase/proxy/service_test.go index 841bb907..ddb1b606 100644 --- a/internal/usecase/proxy/service_test.go +++ b/internal/usecase/proxy/service_test.go @@ -539,14 +539,9 @@ func TestDrainRegistryInFlight(t *testing.T) { svc.registryInFlight.Add(2) - done := make(chan struct{}) + result := make(chan bool, 1) go func() { - drained := svc.DrainRegistryInFlight(50 * time.Millisecond) - if !drained { - // Signal failure via done channel by leaving it open — test will time out - return - } - close(done) + result <- svc.DrainRegistryInFlight(50 * time.Millisecond) }() time.Sleep(5 * time.Millisecond) @@ -554,10 +549,12 @@ func TestDrainRegistryInFlight(t *testing.T) { svc.registryInFlight.Add(-1) select { - case <-done: - // good — drained cleanly + case drained := <-result: + if !drained { + t.Fatalf("DrainRegistryInFlight returned false; expected true after all requests completed") + } case <-time.After(500 * time.Millisecond): - t.Fatal("DrainRegistryInFlight did not return true after requests completed") + t.Fatal("DrainRegistryInFlight did not return within timeout") } } diff --git a/internal/usecase/registry/service.go b/internal/usecase/registry/service.go index e1efff25..66602856 100644 --- a/internal/usecase/registry/service.go +++ b/internal/usecase/registry/service.go @@ -7,6 +7,8 @@ import ( "fmt" "io" "strings" + "sync" + "time" "github.com/bnema/zerowrap" "go.opentelemetry.io/otel" @@ -24,10 +26,11 @@ var registryTracer = otel.Tracer("gordon.registry") // Service implements the RegistryService interface. 
type Service struct { - blobStorage out.BlobStorage - manifestStorage out.ManifestStorage - eventBus out.EventPublisher - metrics *telemetry.Metrics + blobStorage out.BlobStorage + manifestStorage out.ManifestStorage + eventBus out.EventPublisher + metrics *telemetry.Metrics + suppressedImages sync.Map // imageName -> *time.Timer } // SetMetrics sets the telemetry metrics for the registry service. @@ -48,6 +51,85 @@ func NewService( } } +// SuppressDeployEvent marks an image name to skip image.pushed events. +// The suppression auto-expires after 2 minutes to prevent leaks. +func (s *Service) SuppressDeployEvent(imageName string) { + imageName = ExtractImageName(strings.TrimSpace(imageName)) + if imageName == "" { + return + } + + var timer *time.Timer + timer = time.AfterFunc(2*time.Minute, func() { + // Only delete if this timer is still the current one, preventing an + // old timer's callback from removing a newer suppression entry. + if v, ok := s.suppressedImages.Load(imageName); ok && v == timer { + s.suppressedImages.Delete(imageName) + } + }) + if existing, loaded := s.suppressedImages.LoadOrStore(imageName, timer); loaded { + existing.(*time.Timer).Stop() + s.suppressedImages.Store(imageName, timer) + } +} + +// ClearDeployEventSuppression removes event suppression for an image. +func (s *Service) ClearDeployEventSuppression(imageName string) { + imageName = ExtractImageName(strings.TrimSpace(imageName)) + if imageName == "" { + return + } + + if v, loaded := s.suppressedImages.LoadAndDelete(imageName); loaded { + v.(*time.Timer).Stop() + } +} + +// ExtractImageName returns just the repository path of a container image +// reference, stripping any registry host prefix, tag, and digest. 
+// Examples: +// +// "reg.example.com/team/my-app:latest" -> "team/my-app" +// "reg.example.com/my-app@sha256:abc" -> "my-app" +// "my-app:v1.2" -> "my-app" +func ExtractImageName(imageRef string) string { + name := imageRef + // Strip digest + if idx := strings.Index(name, "@"); idx != -1 { + name = name[:idx] + } + // Strip tag + // Find the last colon, but only strip it if it comes after any slash + // (to avoid treating a port number in the host as a tag). + if idx := strings.LastIndex(name, ":"); idx != -1 { + slashIdx := strings.LastIndex(name, "/") + if idx > slashIdx { + name = name[:idx] + } + } + // Strip registry host: if the first segment contains a dot or colon it is + // a registry hostname; remove it. + parts := strings.SplitN(name, "/", 2) + if len(parts) == 2 { + host := parts[0] + if strings.ContainsAny(host, ".:") || host == "localhost" { + name = parts[1] + } + } + return name +} + +// IsDeployEventSuppressed checks if deploy events are suppressed for an image. +func (s *Service) IsDeployEventSuppressed(imageName string) bool { + imageName = ExtractImageName(strings.TrimSpace(imageName)) + if imageName == "" { + return false + } + + _, exists := s.suppressedImages.Load(imageName) + return exists +} + // GetManifest retrieves a manifest by name and reference. func (s *Service) GetManifest(ctx context.Context, name, reference string) (*domain.Manifest, error) { ctx = zerowrap.CtxWithFields(ctx, map[string]any{ @@ -110,13 +192,17 @@ func (s *Service) PutManifest(ctx context.Context, manifest *domain.Manifest) (s // A docker push sends manifests by both digest and tag; firing only on // tag prevents duplicate deploy triggers for the same push. 
if s.eventBus != nil && !strings.HasPrefix(manifest.Reference, "sha256:") { - if err := s.eventBus.Publish(domain.EventImagePushed, domain.ImagePushedPayload{ - Name: manifest.Name, - Reference: manifest.Reference, - Manifest: manifest.Data, - Annotations: manifest.Annotations, - }); err != nil { - log.Warn().Err(err).Msg("failed to publish image pushed event") + if s.IsDeployEventSuppressed(manifest.Name) { + log.Info().Str("image", manifest.Name).Msg("skipping image.pushed event: CLI deploy intent active") + } else { + if err := s.eventBus.Publish(domain.EventImagePushed, domain.ImagePushedPayload{ + Name: manifest.Name, + Reference: manifest.Reference, + Manifest: manifest.Data, + Annotations: manifest.Annotations, + }); err != nil { + log.Warn().Err(err).Msg("failed to publish image pushed event") + } } } diff --git a/internal/usecase/registry/service_test.go b/internal/usecase/registry/service_test.go index 4b6efb16..b29f8401 100644 --- a/internal/usecase/registry/service_test.go +++ b/internal/usecase/registry/service_test.go @@ -11,6 +11,7 @@ import ( "github.com/bnema/zerowrap" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" "github.com/bnema/gordon/internal/boundaries/out/mocks" "github.com/bnema/gordon/internal/domain" @@ -437,3 +438,41 @@ func TestService_ListRepositories_Error(t *testing.T) { assert.Nil(t, repos) assert.Contains(t, err.Error(), "failed to list repositories") } + +func TestPutManifest_SkipsEventWhenDeployIntentSuppressed(t *testing.T) { + blobStorage := mocks.NewMockBlobStorage(t) + manifestStorage := mocks.NewMockManifestStorage(t) + eventBus := mocks.NewMockEventPublisher(t) + + svc := NewService(blobStorage, manifestStorage, eventBus) + + svc.SuppressDeployEvent("my-app") + + manifestStorage.EXPECT().PutManifest("my-app", "latest", "application/vnd.oci.image.manifest.v1+json", mock.Anything).Return(nil) + + manifest := &domain.Manifest{ + Name: "my-app", + Reference: 
"latest", + ContentType: "application/vnd.oci.image.manifest.v1+json", + Data: []byte(`{"schemaVersion":2}`), + } + + _, err := svc.PutManifest(context.Background(), manifest) + require.NoError(t, err) + + eventBus.AssertNotCalled(t, "Publish", mock.Anything, mock.Anything) +} + +func TestSuppressDeployEvent_ClearsCorrectly(t *testing.T) { + blobStorage := mocks.NewMockBlobStorage(t) + manifestStorage := mocks.NewMockManifestStorage(t) + eventBus := mocks.NewMockEventPublisher(t) + + svc := NewService(blobStorage, manifestStorage, eventBus) + + svc.SuppressDeployEvent("my-app") + assert.True(t, svc.IsDeployEventSuppressed("my-app")) + + svc.ClearDeployEventSuppression("my-app") + assert.False(t, svc.IsDeployEventSuppressed("my-app")) +} diff --git a/wiki/examples/production.md b/wiki/examples/production.md index 4957894e..bb802d83 100644 --- a/wiki/examples/production.md +++ b/wiki/examples/production.md @@ -58,7 +58,6 @@ preserve = true [network_isolation] enabled = true network_prefix = "prod" -dns_suffix = ".internal" # Application routes with pinned versions [routes]