diff --git a/docs/config/index.md b/docs/config/index.md index a1d7aa87..f8f0e5c1 100644 --- a/docs/config/index.md +++ b/docs/config/index.md @@ -106,7 +106,6 @@ preserve = true # Keep volumes on container removal [network_isolation] enabled = true # Per-app isolated networks network_prefix = "gordon" # Network name prefix -dns_suffix = ".internal" # DNS suffix for services # Auto-route [auto_route] diff --git a/docs/config/network-isolation.md b/docs/config/network-isolation.md index 798fb374..a41bdb8a 100644 --- a/docs/config/network-isolation.md +++ b/docs/config/network-isolation.md @@ -8,7 +8,6 @@ Isolate applications in separate Docker networks for enhanced security. [network_isolation] enabled = true network_prefix = "gordon" -dns_suffix = ".internal" ``` ## Options @@ -17,7 +16,6 @@ dns_suffix = ".internal" |--------|------|---------|-------------| | `enabled` | bool | `false` | Enable per-app network isolation | | `network_prefix` | string | `"gordon"` | Prefix for created networks | -| `dns_suffix` | string | `".internal"` | DNS suffix for service discovery | ## How It Works @@ -99,19 +97,6 @@ db = connect("postgresql://postgres:5432/mydb") cache = connect("redis://redis:6379") ``` -## DNS Resolution - -The `dns_suffix` option adds a suffix for internal DNS resolution: - -```toml -[network_isolation] -dns_suffix = ".internal" -``` - -Services can be accessed as: -- `postgres` (short form) -- `postgres.internal` (with suffix) - ## Examples ### Basic Isolation @@ -137,7 +122,6 @@ Each app gets its own network with its own database. 
[network_isolation] enabled = true network_prefix = "prod" -dns_suffix = ".internal" [routes] "app.company.com" = "company-app:v2.1.0" diff --git a/docs/config/reference.md b/docs/config/reference.md index 2be23fcb..669434e9 100644 --- a/docs/config/reference.md +++ b/docs/config/reference.md @@ -105,7 +105,6 @@ enabled = false # Create routes from image labels a [network_isolation] enabled = false # Enable per-app Docker networks network_prefix = "gordon" # Prefix for created networks -dns_suffix = ".internal" # DNS suffix for internal resolution # ============================================================================= # VOLUMES @@ -215,7 +214,6 @@ keep_last = 3 # Keep N newest tags per repository | `auto_route.enabled` | `false` | Auto-route disabled | | `network_isolation.enabled` | `false` | Isolation disabled | | `network_isolation.network_prefix` | `"gordon"` | Network prefix | -| `network_isolation.dns_suffix` | `".internal"` | DNS suffix | | `volumes.auto_create` | `true` | Auto-create volumes | | `volumes.prefix` | `"gordon"` | Volume prefix | | `volumes.preserve` | `true` | Keep volumes | diff --git a/docs/reference/docker-labels.md b/docs/reference/docker-labels.md index f86ad315..a1bec497 100644 --- a/docs/reference/docker-labels.md +++ b/docs/reference/docker-labels.md @@ -41,7 +41,27 @@ Labels you can set in your Dockerfile: | Label | Example | Description | |-------|---------|-------------| -| `gordon.proxy.port` | `"3000"` | Port to proxy HTTP traffic to | +| `gordon.domains` | `"app.example.com,www.app.example.com"` | Comma-separated domains for auto-route | +| `gordon.port` | `"3000"` | Port to proxy HTTP traffic to | +| `gordon.proxy.port` | `"3000"` | Port to proxy HTTP traffic to (legacy alias for `gordon.port`) | +| `gordon.health` | `"/healthz"` | HTTP health check endpoint path for readiness probing | +| `gordon.env-file` | `"/app/.env.example"` | Path to env template file inside the image | + +### Health Check Label + +When 
`gordon.health` is set, Gordon performs HTTP GET requests to the specified +path during deployment and waits for a 2xx or 3xx response before routing traffic +to the new container: + +```dockerfile +FROM node:20-alpine +LABEL gordon.health="/api/health" +EXPOSE 3000 +CMD ["node", "server.js"] +``` + +Gordon probes `http://:3000/api/health` until it gets a successful +response or the `deploy.http_probe_timeout` is reached. ### Proxy Port Label diff --git a/internal/adapters/in/cli/controlplane.go b/internal/adapters/in/cli/controlplane.go index 3926c65e..296aa84e 100644 --- a/internal/adapters/in/cli/controlplane.go +++ b/internal/adapters/in/cli/controlplane.go @@ -34,6 +34,7 @@ type ControlPlane interface { GetStatus(ctx context.Context) (*remote.Status, error) Reload(ctx context.Context) error + DeployIntent(ctx context.Context, imageName string) error Deploy(ctx context.Context, deployDomain string) (*remote.DeployResult, error) Restart(ctx context.Context, restartDomain string, withAttachments bool) (*remote.RestartResult, error) ListTags(ctx context.Context, repository string) ([]string, error) diff --git a/internal/adapters/in/cli/controlplane_local.go b/internal/adapters/in/cli/controlplane_local.go index 60c87577..2c051381 100644 --- a/internal/adapters/in/cli/controlplane_local.go +++ b/internal/adapters/in/cli/controlplane_local.go @@ -18,6 +18,7 @@ type localControlPlane struct { containerSvc in.ContainerService backupSvc in.BackupService registrySvc in.RegistryService + deployCoord in.DeployCoordinator healthSvc in.HealthService logSvc in.LogService } @@ -27,12 +28,21 @@ func NewLocalControlPlane(kernel *app.Kernel) ControlPlane { return &localControlPlane{} } + registrySvc := kernel.Registry() + var deployCoord in.DeployCoordinator + if registrySvc != nil { + if coordinator, ok := any(registrySvc).(in.DeployCoordinator); ok { + deployCoord = coordinator + } + } + return &localControlPlane{ configSvc: kernel.Config(), secretSvc: kernel.Secrets(), 
containerSvc: kernel.Container(), backupSvc: kernel.Backup(), - registrySvc: kernel.Registry(), + registrySvc: registrySvc, + deployCoord: deployCoord, healthSvc: kernel.Health(), logSvc: kernel.Logs(), } @@ -223,6 +233,13 @@ func (l *localControlPlane) Reload(_ context.Context) error { return app.SendReloadSignal() } +func (l *localControlPlane) DeployIntent(_ context.Context, imageName string) error { + if l.deployCoord != nil { + l.deployCoord.SuppressDeployEvent(imageName) + } + return nil +} + func (l *localControlPlane) Deploy(ctx context.Context, deployDomain string) (*remote.DeployResult, error) { if l.containerSvc != nil && l.configSvc != nil { route, err := l.configSvc.GetRoute(ctx, deployDomain) diff --git a/internal/adapters/in/cli/controlplane_remote.go b/internal/adapters/in/cli/controlplane_remote.go index 9b35f90a..a37b123f 100644 --- a/internal/adapters/in/cli/controlplane_remote.go +++ b/internal/adapters/in/cli/controlplane_remote.go @@ -88,6 +88,10 @@ func (r *remoteControlPlane) Reload(ctx context.Context) error { return r.client.Reload(ctx) } +func (r *remoteControlPlane) DeployIntent(ctx context.Context, imageName string) error { + return r.client.DeployIntent(ctx, imageName) +} + func (r *remoteControlPlane) Deploy(ctx context.Context, deployDomain string) (*remote.DeployResult, error) { return r.client.Deploy(ctx, deployDomain) } diff --git a/internal/adapters/in/cli/push.go b/internal/adapters/in/cli/push.go index d0e2f76e..3a20e6c5 100644 --- a/internal/adapters/in/cli/push.go +++ b/internal/adapters/in/cli/push.go @@ -154,6 +154,15 @@ func runPush(ctx context.Context, imageArg, domainFlag, tag string, build bool, } fmt.Printf("Domain: %s\n", styles.Theme.Bold.Render(pushDomain)) + // Signal the server to suppress event-based deploys for this image. + // The CLI will trigger an explicit deploy after push completes. 
+ if !noDeploy { + if err := handle.plane.DeployIntent(ctx, imageName); err != nil { + // Non-fatal: worst case we get a redundant deploy via event + fmt.Fprintf(os.Stderr, "warning: failed to register deploy intent: %v\n", err) + } + } + if build { if err := buildAndPush(ctx, version, platform, dockerfile, buildArgs, versionRef, latestRef); err != nil { return err diff --git a/internal/adapters/in/cli/remote/client.go b/internal/adapters/in/cli/remote/client.go index 9ea2ca17..25c6a0c7 100644 --- a/internal/adapters/in/cli/remote/client.go +++ b/internal/adapters/in/cli/remote/client.go @@ -724,6 +724,20 @@ func (c *Client) Deploy(ctx context.Context, deployDomain string) (*DeployResult return &result, nil } +// DeployIntent tells the server that a CLI-managed push is about to happen, +// suppressing event-based deploys for this image. +func (c *Client) DeployIntent(ctx context.Context, imageName string) error { + imageName = strings.TrimSpace(imageName) + if imageName == "" { + return fmt.Errorf("image name cannot be empty") + } + resp, err := c.requestWithRetry(ctx, http.MethodPost, "/deploy-intent/"+url.PathEscape(imageName), nil) + if err != nil { + return err + } + return parseResponse(resp, nil) +} + // Restart API // RestartResult contains the result of a restart. diff --git a/internal/adapters/in/http/admin/handler.go b/internal/adapters/in/http/admin/handler.go index 012d7548..443f1468 100644 --- a/internal/adapters/in/http/admin/handler.go +++ b/internal/adapters/in/http/admin/handler.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "net/http" + "net/url" "strconv" "strings" "time" @@ -27,6 +28,11 @@ const maxAdminRequestSize = 1 << 20 // 1MB // maxLogLines is the maximum allowed number of log lines that can be requested. const maxLogLines = 10000 +type registryDeployService interface { + in.RegistryService + in.DeployCoordinator +} + // Handler implements the HTTP handler for the admin API. 
type Handler struct { configSvc in.ConfigService @@ -37,7 +43,7 @@ type Handler struct { healthSvc in.HealthService secretSvc in.SecretService logSvc in.LogService - registrySvc in.RegistryService + registrySvc registryDeployService eventBus out.EventPublisher log zerowrap.Logger } @@ -129,7 +135,7 @@ func NewHandler( healthSvc in.HealthService, secretSvc in.SecretService, logSvc in.LogService, - registrySvc in.RegistryService, + registrySvc registryDeployService, eventBus out.EventPublisher, log zerowrap.Logger, backupSvc in.BackupService, @@ -213,6 +219,7 @@ func (h *Handler) matchRoute(path string) (routeHandler, bool) { {"/routes/by-image", h.handleRoutesByImage}, {"/routes", h.handleRoutes}, {"/secrets", h.handleSecrets}, + {"/deploy-intent", h.handleDeployIntent}, {"/deploy", h.handleDeploy}, {"/restart", h.handleRestart}, {"/tags", h.handleTags}, @@ -228,6 +235,48 @@ func (h *Handler) matchRoute(path string) (routeHandler, bool) { return nil, false } +// handleDeployIntent handles /admin/deploy-intent/:image endpoint. +// It registers a deploy intent, suppressing event-based deploys for the image. 
+func (h *Handler) handleDeployIntent(w http.ResponseWriter, r *http.Request, path string) { + if r.Method != http.MethodPost { + h.sendError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + ctx := r.Context() + if !HasAccess(ctx, domain.AdminResourceConfig, domain.AdminActionWrite) { + h.sendError(w, http.StatusForbidden, "insufficient permissions for config:write") + return + } + + if h.registrySvc == nil { + h.sendError(w, http.StatusServiceUnavailable, "registry service unavailable") + return + } + + rawName := strings.TrimPrefix(path, "/deploy-intent/") + if rawName == "" || rawName == "/deploy-intent" { + h.sendError(w, http.StatusBadRequest, "image name required") + return + } + + imageName, err := url.PathUnescape(rawName) + if err != nil { + h.sendError(w, http.StatusBadRequest, "invalid image name encoding") + return + } + + log := zerowrap.FromCtx(ctx) + log.Info().Str("image", imageName).Msg("deploy intent registered, suppressing image.pushed events") + + h.registrySvc.SuppressDeployEvent(imageName) + + h.sendJSON(w, http.StatusOK, map[string]string{ + "status": "ok", + "image": imageName, + }) +} + // sendJSON sends a JSON response. func (h *Handler) sendJSON(w http.ResponseWriter, status int, data any) { w.Header().Set("Content-Type", "application/json") @@ -1078,6 +1127,15 @@ func (h *Handler) handleDeploy(w http.ResponseWriter, r *http.Request, path stri return } + // Clear deploy event suppression now that the explicit deploy has completed. + // This re-enables event-based deploys for future direct docker pushes. + if route.Image != "" && h.registrySvc != nil { + // Use the registry package's image name normaliser so digest-form refs + // and multi-segment paths are handled correctly. 
+ imageName := registry.ExtractImageName(route.Image) + h.registrySvc.ClearDeployEventSuppression(imageName) + } + log.Info().Str("domain", deployDomain).Str("container_id", container.ID).Msg("container deployed via admin API") h.sendJSON(w, http.StatusOK, dto.DeployResponse{ Status: "deployed", diff --git a/internal/app/run.go b/internal/app/run.go index 83b5174a..ce3712dc 100644 --- a/internal/app/run.go +++ b/internal/app/run.go @@ -1187,12 +1187,14 @@ func createContainerService(ctx context.Context, v *viper.Viper, cfg Config, svc VolumePreserve: v.GetBool("volumes.preserve"), NetworkIsolation: v.GetBool("network_isolation.enabled"), NetworkPrefix: v.GetString("network_isolation.network_prefix"), - DNSSuffix: v.GetString("network_isolation.dns_suffix"), NetworkGroups: svc.configSvc.GetNetworkGroups(), Attachments: svc.configSvc.GetAttachments(), ReadinessDelay: v.GetDuration("deploy.readiness_delay"), ReadinessMode: v.GetString("deploy.readiness_mode"), HealthTimeout: v.GetDuration("deploy.health_timeout"), + StabilizationDelay: v.GetDuration("deploy.stabilization_delay"), + TCPProbeTimeout: v.GetDuration("deploy.tcp_probe_timeout"), + HTTPProbeTimeout: v.GetDuration("deploy.http_probe_timeout"), DrainDelay: v.GetDuration("deploy.drain_delay"), DrainMode: v.GetString("deploy.drain_mode"), DrainTimeout: v.GetDuration("deploy.drain_timeout"), @@ -2346,7 +2348,6 @@ func loadConfig(v *viper.Viper, configPath string) error { v.SetDefault("auto_route.enabled", false) v.SetDefault("network_isolation.enabled", false) v.SetDefault("network_isolation.network_prefix", "gordon") - v.SetDefault("network_isolation.dns_suffix", ".internal") v.SetDefault("volumes.auto_create", true) v.SetDefault("volumes.prefix", "gordon") v.SetDefault("volumes.preserve", true) @@ -2374,6 +2375,9 @@ func loadConfig(v *viper.Viper, configPath string) error { v.SetDefault("deploy.readiness_delay", "5s") v.SetDefault("deploy.readiness_mode", "auto") v.SetDefault("deploy.health_timeout", "90s") + 
v.SetDefault("deploy.stabilization_delay", "2s") + v.SetDefault("deploy.tcp_probe_timeout", "30s") + v.SetDefault("deploy.http_probe_timeout", "60s") v.SetDefault("deploy.drain_mode", "auto") v.SetDefault("deploy.drain_timeout", "30s") diff --git a/internal/boundaries/in/deploy.go b/internal/boundaries/in/deploy.go new file mode 100644 index 00000000..d75c4966 --- /dev/null +++ b/internal/boundaries/in/deploy.go @@ -0,0 +1,10 @@ +package in + +// DeployCoordinator defines deploy coordination operations. +// +// These methods let the CLI/admin API suppress image-pushed deploy events +// while an explicit deploy flow is in progress. +type DeployCoordinator interface { + SuppressDeployEvent(imageName string) + ClearDeployEventSuppression(imageName string) +} diff --git a/internal/boundaries/in/mocks/mock_deploy_coordinator.go b/internal/boundaries/in/mocks/mock_deploy_coordinator.go new file mode 100644 index 00000000..77f7350d --- /dev/null +++ b/internal/boundaries/in/mocks/mock_deploy_coordinator.go @@ -0,0 +1,110 @@ +// Code generated by mockery; DO NOT EDIT. +// github.com/vektra/mockery +// template: testify + +package mocks + +import ( + mock "github.com/stretchr/testify/mock" +) + +// NewMockDeployCoordinator creates a new instance of MockDeployCoordinator. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewMockDeployCoordinator(t interface { + mock.TestingT + Cleanup(func()) +}) *MockDeployCoordinator { + mock := &MockDeployCoordinator{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} + +// MockDeployCoordinator is an autogenerated mock type for the DeployCoordinator type. 
+type MockDeployCoordinator struct { + mock.Mock +} + +type MockDeployCoordinator_Expecter struct { + mock *mock.Mock +} + +func (_m *MockDeployCoordinator) EXPECT() *MockDeployCoordinator_Expecter { + return &MockDeployCoordinator_Expecter{mock: &_m.Mock} +} + +// ClearDeployEventSuppression provides a mock function for the type MockDeployCoordinator. +func (_mock *MockDeployCoordinator) ClearDeployEventSuppression(imageName string) { + _mock.Called(imageName) +} + +// MockDeployCoordinator_ClearDeployEventSuppression_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ClearDeployEventSuppression'. +type MockDeployCoordinator_ClearDeployEventSuppression_Call struct { + *mock.Call +} + +// ClearDeployEventSuppression is a helper method to define mock.On call. +// - imageName string +func (_e *MockDeployCoordinator_Expecter) ClearDeployEventSuppression(imageName interface{}) *MockDeployCoordinator_ClearDeployEventSuppression_Call { + return &MockDeployCoordinator_ClearDeployEventSuppression_Call{Call: _e.mock.On("ClearDeployEventSuppression", imageName)} +} + +func (_c *MockDeployCoordinator_ClearDeployEventSuppression_Call) Run(run func(imageName string)) *MockDeployCoordinator_ClearDeployEventSuppression_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run(arg0) + }) + return _c +} + +func (_c *MockDeployCoordinator_ClearDeployEventSuppression_Call) Return() *MockDeployCoordinator_ClearDeployEventSuppression_Call { + _c.Call.Return() + return _c +} + +func (_c *MockDeployCoordinator_ClearDeployEventSuppression_Call) RunAndReturn(run func(imageName string)) *MockDeployCoordinator_ClearDeployEventSuppression_Call { + _c.Run(run) + return _c +} + +// SuppressDeployEvent provides a mock function for the type MockDeployCoordinator. 
+func (_mock *MockDeployCoordinator) SuppressDeployEvent(imageName string) { + _mock.Called(imageName) +} + +// MockDeployCoordinator_SuppressDeployEvent_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'SuppressDeployEvent'. +type MockDeployCoordinator_SuppressDeployEvent_Call struct { + *mock.Call +} + +// SuppressDeployEvent is a helper method to define mock.On call. +// - imageName string +func (_e *MockDeployCoordinator_Expecter) SuppressDeployEvent(imageName interface{}) *MockDeployCoordinator_SuppressDeployEvent_Call { + return &MockDeployCoordinator_SuppressDeployEvent_Call{Call: _e.mock.On("SuppressDeployEvent", imageName)} +} + +func (_c *MockDeployCoordinator_SuppressDeployEvent_Call) Run(run func(imageName string)) *MockDeployCoordinator_SuppressDeployEvent_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run(arg0) + }) + return _c +} + +func (_c *MockDeployCoordinator_SuppressDeployEvent_Call) Return() *MockDeployCoordinator_SuppressDeployEvent_Call { + _c.Call.Return() + return _c +} + +func (_c *MockDeployCoordinator_SuppressDeployEvent_Call) RunAndReturn(run func(imageName string)) *MockDeployCoordinator_SuppressDeployEvent_Call { + _c.Run(run) + return _c +} diff --git a/internal/boundaries/in/mocks/mock_registry_service.go b/internal/boundaries/in/mocks/mock_registry_service.go index 7ee610f4..52821cb6 100644 --- a/internal/boundaries/in/mocks/mock_registry_service.go +++ b/internal/boundaries/in/mocks/mock_registry_service.go @@ -231,6 +231,46 @@ func (_c *MockRegistryService_CancelUpload_Call) RunAndReturn(run func(ctx conte return _c } +// ClearDeployEventSuppression provides a mock function for the type MockRegistryService +func (_mock *MockRegistryService) ClearDeployEventSuppression(imageName string) { + _mock.Called(imageName) + return +} + +// MockRegistryService_ClearDeployEventSuppression_Call is a *mock.Call that shadows 
Run/Return methods with type explicit version for method 'ClearDeployEventSuppression' +type MockRegistryService_ClearDeployEventSuppression_Call struct { + *mock.Call +} + +// ClearDeployEventSuppression is a helper method to define mock.On call +// - imageName string +func (_e *MockRegistryService_Expecter) ClearDeployEventSuppression(imageName interface{}) *MockRegistryService_ClearDeployEventSuppression_Call { + return &MockRegistryService_ClearDeployEventSuppression_Call{Call: _e.mock.On("ClearDeployEventSuppression", imageName)} +} + +func (_c *MockRegistryService_ClearDeployEventSuppression_Call) Run(run func(imageName string)) *MockRegistryService_ClearDeployEventSuppression_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run( + arg0, + ) + }) + return _c +} + +func (_c *MockRegistryService_ClearDeployEventSuppression_Call) Return() *MockRegistryService_ClearDeployEventSuppression_Call { + _c.Call.Return() + return _c +} + +func (_c *MockRegistryService_ClearDeployEventSuppression_Call) RunAndReturn(run func(imageName string)) *MockRegistryService_ClearDeployEventSuppression_Call { + _c.Run(run) + return _c +} + // DeleteManifest provides a mock function for the type MockRegistryService func (_mock *MockRegistryService) DeleteManifest(ctx context.Context, name string, reference string) error { ret := _mock.Called(ctx, name, reference) @@ -895,3 +935,43 @@ func (_c *MockRegistryService_StartUpload_Call) RunAndReturn(run func(ctx contex _c.Call.Return(run) return _c } + +// SuppressDeployEvent provides a mock function for the type MockRegistryService +func (_mock *MockRegistryService) SuppressDeployEvent(imageName string) { + _mock.Called(imageName) + return +} + +// MockRegistryService_SuppressDeployEvent_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'SuppressDeployEvent' +type MockRegistryService_SuppressDeployEvent_Call struct { + 
*mock.Call +} + +// SuppressDeployEvent is a helper method to define mock.On call +// - imageName string +func (_e *MockRegistryService_Expecter) SuppressDeployEvent(imageName interface{}) *MockRegistryService_SuppressDeployEvent_Call { + return &MockRegistryService_SuppressDeployEvent_Call{Call: _e.mock.On("SuppressDeployEvent", imageName)} +} + +func (_c *MockRegistryService_SuppressDeployEvent_Call) Run(run func(imageName string)) *MockRegistryService_SuppressDeployEvent_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run( + arg0, + ) + }) + return _c +} + +func (_c *MockRegistryService_SuppressDeployEvent_Call) Return() *MockRegistryService_SuppressDeployEvent_Call { + _c.Call.Return() + return _c +} + +func (_c *MockRegistryService_SuppressDeployEvent_Call) RunAndReturn(run func(imageName string)) *MockRegistryService_SuppressDeployEvent_Call { + _c.Run(run) + return _c +} diff --git a/internal/usecase/container/readiness.go b/internal/usecase/container/readiness.go new file mode 100644 index 00000000..44dafe63 --- /dev/null +++ b/internal/usecase/container/readiness.go @@ -0,0 +1,137 @@ +package container + +import ( + "context" + "errors" + "fmt" + "net" + "net/http" + "time" + + "github.com/bnema/zerowrap" +) + +var errProbeTimeout = errors.New("probe timeout") + +func probeLoop( + ctx context.Context, + deadline time.Time, + baseSleep time.Duration, + defaultAttemptTimeout time.Duration, + work func(attemptCtx context.Context) (success bool, err error), +) error { + for { + if err := ctx.Err(); err != nil { + return err + } + if time.Now().After(deadline) { + return errProbeTimeout + } + + remaining := time.Until(deadline) + attemptTimeout := defaultAttemptTimeout + if remaining < attemptTimeout { + attemptTimeout = remaining + } + + attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout) + success, err := work(attemptCtx) + cancel() + if err != nil { + return err + } + if 
success { + return nil + } + + sleepInterval := baseSleep + if remProbe := time.Until(deadline); remProbe < sleepInterval { + sleepInterval = remProbe + } + if ctxDeadline, ok := ctx.Deadline(); ok { + if remCtx := time.Until(ctxDeadline); remCtx < sleepInterval { + sleepInterval = remCtx + } + } + if sleepInterval <= 0 { + if ctx.Err() != nil { + return ctx.Err() + } + return errProbeTimeout + } + + t := time.NewTimer(sleepInterval) + select { + case <-t.C: + case <-ctx.Done(): + t.Stop() + return ctx.Err() + } + } +} + +// tcpProbe attempts a TCP connection to addr, retrying every 500ms until +// success or timeout. This is the universal fallback readiness check — +// it verifies the process is at least accepting connections. +func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { + log := zerowrap.FromCtx(ctx) + deadline := time.Now().Add(timeout) + dialer := &net.Dialer{} + attempts := 0 + var lastErr error + err := probeLoop(ctx, deadline, 500*time.Millisecond, time.Second, func(attemptCtx context.Context) (bool, error) { + conn, err := dialer.DialContext(attemptCtx, "tcp", addr) + attempts++ + if err == nil { + conn.Close() + log.Debug().Str("addr", addr).Int("attempts", attempts).Msg("TCP probe connected") + return true, nil + } + + lastErr = err + if attempts <= 3 || attempts%10 == 0 { + log.Debug().Err(err).Str("addr", addr).Int("attempt", attempts).Msg("TCP probe attempt failed") + } + return false, nil + }) + if errors.Is(err, errProbeTimeout) { + return fmt.Errorf("TCP probe timeout after %s: %s not reachable (attempts=%d, last_error=%v)", timeout, addr, attempts, lastErr) + } + return err +} + +// httpProbe performs HTTP GET requests to url, retrying every 1s until a +// 2xx/3xx response or timeout. Used when gordon.health label is set. 
+func httpProbe(ctx context.Context, url string, timeout time.Duration) error { + client := &http.Client{ + CheckRedirect: func(req *http.Request, via []*http.Request) error { + // Don't follow redirects — treat 3xx as a successful response. + return http.ErrUseLastResponse + }, + } + deadline := time.Now().Add(timeout) + var lastStatus int + err := probeLoop(ctx, deadline, time.Second, 2*time.Second, func(attemptCtx context.Context) (bool, error) { + req, reqErr := http.NewRequestWithContext(attemptCtx, http.MethodGet, url, nil) + if reqErr != nil { + return false, reqErr + } + + resp, err := client.Do(req) + if err != nil { + return false, nil + } + + lastStatus = resp.StatusCode + resp.Body.Close() + if lastStatus >= 200 && lastStatus < 400 { + return true, nil + } + + return false, nil + }) + if errors.Is(err, errProbeTimeout) { + return fmt.Errorf("HTTP probe timeout after %s: last status %d from %s", timeout, lastStatus, url) + } + return err +} diff --git a/internal/usecase/container/readiness_test.go b/internal/usecase/container/readiness_test.go new file mode 100644 index 00000000..811ba9ff --- /dev/null +++ b/internal/usecase/container/readiness_test.go @@ -0,0 +1,167 @@ +package container + +import ( + "context" + "net" + "net/http" + "net/http/httptest" + "sync/atomic" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --- TCP Probe Tests --- + +func TestTCPProbe_Success(t *testing.T) { + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + defer ln.Close() + + // Accept connections in background so the probe succeeds + go func() { + for { + conn, err := ln.Accept() + if err != nil { + return + } + conn.Close() + } + }() + + ctx := testContext() + err = tcpProbe(ctx, ln.Addr().String(), 5*time.Second) + assert.NoError(t, err) +} + +func TestTCPProbe_Timeout(t *testing.T) { + // Reserve an ephemeral port and close it immediately to get a deterministic + // 
unreachable target, avoiding assumptions about privileged ports. + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + addr := ln.Addr().String() + ln.Close() + + ctx := testContext() + err = tcpProbe(ctx, addr, 200*time.Millisecond) + assert.Error(t, err) + assert.Contains(t, err.Error(), "TCP probe timeout") +} + +func TestTCPProbe_ContextCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(testContext()) + cancel() + err := tcpProbe(ctx, "127.0.0.1:1", 5*time.Second) + assert.Error(t, err) +} + +func TestTCPProbe_DelayedListener(t *testing.T) { + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + // Close initially so probe fails at first + addr := ln.Addr().String() + ln.Close() + + // Re-open after 1 second + go func() { + time.Sleep(time.Second) + lc := net.ListenConfig{ + Control: func(network, address string, c syscall.RawConn) error { + return c.Control(func(fd uintptr) { + _ = syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_REUSEADDR, 1) + }) + }, + } + newLn, err := lc.Listen(context.Background(), "tcp", addr) + if err != nil { + t.Logf("delayed listener failed to bind: %v", err) + return + } + defer newLn.Close() + for { + conn, err := newLn.Accept() + if err != nil { + return + } + conn.Close() + } + }() + + ctx := testContext() + err = tcpProbe(ctx, addr, 5*time.Second) + assert.NoError(t, err) +} + +// --- HTTP Probe Tests --- + +func TestHTTPProbe_Success(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/healthz" { + w.WriteHeader(200) + return + } + w.WriteHeader(404) + })) + defer srv.Close() + + ctx := testContext() + err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) + assert.NoError(t, err) +} + +func TestHTTPProbe_ServerError_ThenSuccess(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt32(&calls, 1) + 
if n < 3 { + w.WriteHeader(503) + return + } + w.WriteHeader(200) + })) + defer srv.Close() + + ctx := testContext() + err := httpProbe(ctx, srv.URL+"/healthz", 10*time.Second) + assert.NoError(t, err) + assert.GreaterOrEqual(t, atomic.LoadInt32(&calls), int32(3)) +} + +func TestHTTPProbe_Timeout(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(503) + })) + defer srv.Close() + + ctx := testContext() + err := httpProbe(ctx, srv.URL+"/healthz", 500*time.Millisecond) + assert.Error(t, err) + assert.Contains(t, err.Error(), "HTTP probe timeout") +} + +func TestHTTPProbe_ContextCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(503) + })) + defer srv.Close() + + ctx, cancel := context.WithCancel(testContext()) + cancel() + err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) + assert.Error(t, err) +} + +func TestHTTPProbe_RedirectIsSuccess(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, "/other", http.StatusFound) + })) + defer srv.Close() + + ctx := testContext() + // The probe should treat 3xx as success without following the redirect + err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) + assert.NoError(t, err) +} diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 0e5cf71f..7aebc1fd 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -38,7 +38,6 @@ type Config struct { VolumePreserve bool NetworkIsolation bool NetworkPrefix string - DNSSuffix string NetworkGroups map[string][]string Attachments map[string][]string ReadinessDelay time.Duration // Delay after container starts before considering it ready @@ -48,6 +47,9 @@ type Config struct { DrainDelayConfigured bool // True when deploy.drain_delay was explicitly configured DrainMode 
string // Drain strategy: auto, inflight, delay DrainTimeout time.Duration // Max wait for in-flight requests to drain + StabilizationDelay time.Duration // Post-switch monitoring window (default 2s) + TCPProbeTimeout time.Duration // TCP probe timeout (default 30s) + HTTPProbeTimeout time.Duration // HTTP probe timeout (default 60s) } var tracer = otel.Tracer("gordon.container") @@ -257,26 +259,9 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta log := zerowrap.FromCtx(ctx) // Record deploy metrics and trace status - defer func() { - if err != nil { - span.RecordError(err) - span.SetStatus(codes.Error, err.Error()) - } - if s.metrics == nil { - return - } - attrs := metric.WithAttributes( - attribute.String("domain", route.Domain), - attribute.String("image", route.Image), - ) - s.metrics.DeployTotal.Add(ctx, 1, attrs) - s.metrics.DeployDuration.Record(ctx, time.Since(deployStart).Seconds(), attrs) - if err != nil { - s.metrics.DeployErrors.Add(ctx, 1, attrs) - } - }() + defer s.recordDeployMetrics(ctx, span, route, deployStart, &err) - existing, hasExisting := s.getTrackedContainer(route.Domain) + existing, hasExisting := s.resolveExistingContainer(ctx, route.Domain) resources, err := s.prepareDeployResources(ctx, route, existing) if err != nil { @@ -287,9 +272,15 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta // the exact same image (by Docker image ID), return it immediately. // This prevents the double-deploy caused by the event-based deploy // (triggered by image.pushed) racing with the explicit CLI deploy call. 
- if hasExisting && existing.ImageID != "" { - if skip, container := s.skipRedundantDeploy(ctx, existing, resources.actualImageRef); skip { - return container, nil + if hasExisting { + existingForSkip := existing + if existingForSkip.ImageID == "" && existingForSkip.Image != "" && normalizeImageRef(existingForSkip.Image) == normalizeImageRef(route.Image) { + existingForSkip = s.containerForRedundantCheck(ctx, existingForSkip) + } + if existingForSkip.ImageID != "" { + if skip, container := s.skipRedundantDeploy(ctx, existingForSkip, resources.actualImageRef); skip { + return container, nil + } } } @@ -299,6 +290,22 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta } invalidated := s.activateDeployedContainer(ctx, route.Domain, newContainer) + + // Post-switch stabilization: verify new container stays running + if hasExisting { + stable, stabilizeErr := s.stabilizeNewContainer(ctx, route.Domain, newContainer, existing) + if stabilizeErr != nil { + // Both old and new containers are dead; assign to named return so + // the deferred recordDeployMetrics and span see the failure. + err = stabilizeErr + return nil, err + } + if !stable { + // Rollback performed — old container is restored + return existing, nil + } + } + s.finalizePreviousContainer(ctx, route.Domain, existing, hasExisting, invalidated, newContainer.ID) // Start container log collection (non-blocking, errors don't fail deployment) @@ -315,6 +322,29 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta return newContainer, nil } +// recordDeployMetrics records span error status and deploy metrics at the end of a Deploy call. +// It is called via defer and receives a pointer to the named return error so it can observe +// the final error value after all deferred functions have run. 
+func (s *Service) recordDeployMetrics(ctx context.Context, span trace.Span, route domain.Route, start time.Time, errPtr *error) { + err := *errPtr + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + if s.metrics == nil { + return + } + attrs := metric.WithAttributes( + attribute.String("domain", route.Domain), + attribute.String("image", route.Image), + ) + s.metrics.DeployTotal.Add(ctx, 1, attrs) + s.metrics.DeployDuration.Record(ctx, time.Since(start).Seconds(), attrs) + if err != nil { + s.metrics.DeployErrors.Add(ctx, 1, attrs) + } +} + // skipRedundantDeploy checks whether the existing container is already running // the same image (by Docker image ID) as the one we are about to deploy. // When a push triggers both an event-based deploy and an explicit CLI deploy, @@ -347,6 +377,26 @@ func (s *Service) skipRedundantDeploy(ctx context.Context, existing *domain.Cont return true, existing } +func (s *Service) containerForRedundantCheck(ctx context.Context, existing *domain.Container) *domain.Container { + if existing == nil || existing.ImageID != "" || existing.ID == "" { + return existing + } + + log := zerowrap.FromCtx(ctx) + inspected, err := s.runtime.InspectContainer(ctx, existing.ID) + if err != nil { + log.Debug().Err(err).Str("container_id", existing.ID).Msg("cannot inspect existing container for redundancy check") + return existing + } + if inspected == nil || inspected.ImageID == "" { + return existing + } + + existingCopy := *existing + existingCopy.ImageID = inspected.ImageID + return &existingCopy +} + type deployResources struct { networkName string actualImageRef string @@ -355,11 +405,68 @@ type deployResources struct { volumes map[string]string } -func (s *Service) getTrackedContainer(domainName string) (*domain.Container, bool) { +// resolveExistingContainer returns the currently running container for a domain. +// It first checks the in-memory map (fast path). 
If not found, it queries the +// runtime for a running container with matching name and managed label. This +// handles cases where Gordon restarted and in-memory state is stale. +// +// The slow path checks canonical, -new, and -next names because a previous +// deploy may have been interrupted (e.g. eventbus timeout, Gordon restart) +// leaving the active container under a temp name. Canonical is preferred when +// multiple matches exist. +func (s *Service) resolveExistingContainer(ctx context.Context, domainName string) (*domain.Container, bool) { + // Fast path: check in-memory state s.mu.RLock() container, ok := s.containers[domainName] s.mu.RUnlock() - return container, ok + if ok { + return container, true + } + + // Slow path: query runtime for running containers + log := zerowrap.FromCtx(ctx) + canonicalName := fmt.Sprintf("gordon-%s", domainName) + candidateNames := map[string]bool{ + canonicalName: true, + canonicalName + "-new": true, + canonicalName + "-next": true, + } + + running, err := s.runtime.ListContainers(ctx, false) + if err != nil { + log.Warn().Err(err).Msg("failed to list running containers for existing container resolution") + return nil, false + } + + // Prefer canonical name; fall back to any running managed temp container. + var best *domain.Container + for _, c := range running { + if !candidateNames[c.Name] || c.Labels[domain.LabelManaged] != "true" { + continue + } + if best == nil || c.Name == canonicalName { + best = c + } + } + + if best != nil { + log.Info(). + Str("container_id", best.ID). + Str("container_name", best.Name). 
+ Msg("resolved existing container from runtime (in-memory state was stale)") + + // Update in-memory state so subsequent lookups are fast + s.mu.Lock() + if _, alreadyTracked := s.containers[domainName]; !alreadyTracked { + s.managedCount++ + } + s.containers[domainName] = best + s.mu.Unlock() + + return best, true + } + + return nil, false } func (s *Service) prepareDeployResources(ctx context.Context, route domain.Route, existing *domain.Container) (*deployResources, error) { @@ -438,7 +545,7 @@ func (s *Service) createStartedContainer(ctx context.Context, route domain.Route return nil, log.WrapErr(err, "failed to start container") } if !domain.IsSkipReadiness(ctx) { - if err := s.waitForReady(ctx, newContainer.ID); err != nil { + if err := s.waitForReady(ctx, newContainer.ID, containerConfig); err != nil { s.cleanupFailedContainer(ctx, newContainer.ID) return nil, log.WrapErr(err, "container failed readiness check") } @@ -480,6 +587,85 @@ func (s *Service) activateDeployedContainer(ctx context.Context, domainName stri return false } +// stabilizeNewContainer monitors the new container briefly after traffic switch. +// If it crashes during this window, rolls back to old container. +// Returns true if stabilization succeeded, false if rollback was performed. +// Returns an error only when both new and old containers are dead. +func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, newContainer, oldContainer *domain.Container) (bool, error) { + log := zerowrap.FromCtx(ctx) + + if oldContainer == nil { + return true, nil + } + + s.mu.RLock() + delay := s.config.StabilizationDelay + s.mu.RUnlock() + if delay == 0 { + delay = 2 * time.Second + } + + // Brief stabilization: verify new container is still running after delay + select { + case <-time.After(delay): + case <-ctx.Done(): + return true, nil + } + + running, err := s.runtime.IsContainerRunning(ctx, newContainer.ID) + if err != nil || !running { + log.Error(). 
+ Str("new_container", newContainer.ID). + Str("old_container", oldContainer.ID). + Msg("new container crashed during stabilization, rolling back to old") + + // Verify old container is still running before restoring it. + // If both old and new are dead (e.g., OOM), returning a dead + // container as "existing" would leave the domain in a broken state. + oldRunning, oldErr := s.runtime.IsContainerRunning(ctx, oldContainer.ID) + if oldErr != nil || !oldRunning { + log.Error(). + Str("old_container", oldContainer.ID). + Msg("old container is also not running, cannot rollback") + + // Cleanup failed new container + if stopErr := s.runtime.StopContainer(ctx, newContainer.ID); stopErr != nil { + log.WrapErrWithFields(stopErr, "failed to stop failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + if removeErr := s.runtime.RemoveContainer(ctx, newContainer.ID, true); removeErr != nil { + log.WrapErrWithFields(removeErr, "failed to remove failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + + return false, fmt.Errorf("stabilization failed: new container crashed and old container %s is also not running", oldContainer.ID) + } + + // Rollback: restore old container as tracked + s.mu.Lock() + s.containers[domainName] = oldContainer + s.mu.Unlock() + + // Re-invalidate proxy cache to point back to old + s.mu.RLock() + inv := s.cacheInvalidator + s.mu.RUnlock() + if inv != nil { + inv.InvalidateTarget(ctx, domainName) + } + + // Cleanup failed new container + if stopErr := s.runtime.StopContainer(ctx, newContainer.ID); stopErr != nil { + log.WrapErrWithFields(stopErr, "failed to stop failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + if removeErr := s.runtime.RemoveContainer(ctx, newContainer.ID, true); removeErr != nil { + log.WrapErrWithFields(removeErr, "failed to remove failed new container during rollback", 
map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + + return false, nil + } + + return true, nil +} + func (s *Service) finalizePreviousContainer(ctx context.Context, domainName string, existing *domain.Container, hasExisting, invalidated bool, newContainerID string) { if !hasExisting { return @@ -1587,7 +1773,19 @@ func (s *Service) cleanupOrphanedContainers(ctx context.Context, domainName stri for _, c := range allContainers { if (c.Name == expectedName || c.Name == expectedNewName || c.Name == expectedNextName) && c.ID != skipContainerID { - log.Info().Str(zerowrap.FieldEntityID, c.ID).Str(zerowrap.FieldStatus, c.Status).Msg("found orphaned container, removing") + // Skip any running container regardless of name — a running + // temp container may be actively serving traffic while the + // new container stabilizes, or may be from a concurrent deploy. + if c.Status == "running" { + log.Debug(). + Str(zerowrap.FieldEntityID, c.ID). + Str("container_name", c.Name). + Str(zerowrap.FieldStatus, c.Status). + Msg("skipping running container during orphan cleanup") + continue + } + + log.Info().Str(zerowrap.FieldEntityID, c.ID).Str("container_name", c.Name).Str(zerowrap.FieldStatus, c.Status).Msg("found orphaned container, removing") if err := s.runtime.StopContainer(ctx, c.ID); err != nil { log.WrapErrWithFields(err, "failed to stop orphaned container", map[string]any{zerowrap.FieldEntityID: c.ID}) @@ -2065,7 +2263,7 @@ func (s *Service) findContainerByName(ctx context.Context, name string) *domain. 
return nil } -func (s *Service) waitForReady(ctx context.Context, containerID string) error { +func (s *Service) waitForReady(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig) error { if err := s.pollContainerRunning(ctx, containerID); err != nil { return err } @@ -2092,15 +2290,123 @@ func (s *Service) waitForReady(ctx context.Context, containerID string) error { } return s.waitForHealthy(ctx, containerID, cfg.HealthTimeout) default: // auto - _, hasHealthcheck, err := s.runtime.GetContainerHealthStatus(ctx, containerID) - if err != nil { - return err - } - if hasHealthcheck { - return s.waitForHealthy(ctx, containerID, cfg.HealthTimeout) - } - return s.waitForReadyByDelay(ctx, containerID) + return s.readinessCascade(ctx, containerID, containerConfig, cfg) + } +} + +// readinessCascade auto-detects the strongest available readiness signal: +// 1. Docker healthcheck (if present) → wait for healthy status +// 2. HTTP probe (if gordon.health label set) → GET until 2xx/3xx +// 3. TCP probe (if port info available) → connect until accepted +// 4. Delay fallback (last resort) → waitForReadyByDelay +func (s *Service) readinessCascade(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig, cfg Config) error { + log := zerowrap.FromCtx(ctx) + + // 1. Docker healthcheck + _, hasHealthcheck, err := s.runtime.GetContainerHealthStatus(ctx, containerID) + if err != nil { + return err + } + if hasHealthcheck { + log.Info().Msg("readiness cascade: using Docker healthcheck") + return s.waitForHealthy(ctx, containerID, cfg.HealthTimeout) + } + + // 2. HTTP probe via gordon.health label + if probed, probeErr := s.tryHTTPProbe(ctx, containerID, containerConfig, cfg); probed { + return probeErr + } + + // 3. TCP probe (if port info available) + if probed, probeErr := s.tryTCPProbe(ctx, containerID, containerConfig, cfg); probed { + return probeErr + } + + // 4. 
Delay fallback + log.Info().Msg("readiness cascade: using delay fallback") + return s.waitForReadyByDelay(ctx, containerID) +} + +// tryHTTPProbe attempts an HTTP probe if the gordon.health label is set. +// Returns (true, err) if the probe was attempted, (false, nil) if skipped. +func (s *Service) tryHTTPProbe(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig, cfg Config) (bool, error) { + log := zerowrap.FromCtx(ctx) + if containerConfig == nil { + return false, nil + } + healthPath, ok := containerConfig.Labels[domain.LabelHealth] + if !ok || healthPath == "" { + return false, nil + } + ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID) + if probeErr != nil || ip == "" || port <= 0 { + log.Debug().Err(probeErr).Msg("readiness cascade: HTTP probe skipped, could not resolve container endpoint") + return false, nil + } + url := fmt.Sprintf("http://%s:%d%s", ip, port, healthPath) + timeout := cfg.HTTPProbeTimeout + if timeout == 0 { + timeout = 60 * time.Second + } + log.Info().Str("url", url).Dur("timeout", timeout).Msg("readiness cascade: using HTTP probe") + return true, httpProbe(ctx, url, timeout) +} + +// tryTCPProbe attempts a TCP probe if port info is available. +// Returns (true, err) if the probe was attempted, (false, nil) if skipped. 
+func (s *Service) tryTCPProbe(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig, cfg Config) (bool, error) {
+	log := zerowrap.FromCtx(ctx)
+	if containerConfig == nil || len(containerConfig.Ports) == 0 {
+		return false, nil
+	}
+	ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID)
+	if probeErr != nil || ip == "" || port <= 0 {
+		log.Debug().Err(probeErr).Msg("readiness cascade: TCP probe skipped, could not resolve container endpoint")
+		return false, nil
+	}
+	addr := fmt.Sprintf("%s:%d", ip, port)
+	timeout := cfg.TCPProbeTimeout
+	if timeout == 0 {
+		timeout = 30 * time.Second
+	}
+	log.Info().Str("addr", addr).Dur("timeout", timeout).Msg("readiness cascade: using TCP probe")
+	return true, tcpProbe(ctx, addr, timeout)
+}
+
+// resolveContainerEndpoint returns a host-reachable address for probing.
+// In rootless podman/Docker setups, container internal IPs are not routable
+// from the host. We use the host port binding (127.0.0.1:<host_port>)
+// which is always reachable. Falls back to the container's internal IP
+// only if no host port mapping exists (e.g. host-network mode).
+func (s *Service) resolveContainerEndpoint(ctx context.Context, containerID string) (string, int, error) {
+	log := zerowrap.FromCtx(ctx)
+
+	// First, get the container's internal port from network info
+	_, internalPort, err := s.runtime.GetContainerNetworkInfo(ctx, containerID)
+	if err != nil {
+		return "", 0, err
+	}
+
+	// Try to resolve via host port binding (works in rootless podman/Docker)
+	hostPort, hostErr := s.runtime.GetContainerPort(ctx, containerID, internalPort)
+	if hostErr == nil && hostPort > 0 {
+		log.Debug().
+			Int("internal_port", internalPort).
+			Int("host_port", hostPort).
+ Msg("resolved container endpoint via host port binding") + return "127.0.0.1", hostPort, nil + } + + // Fallback: use internal IP (works in Docker rootful / host network) + ip, _, fallbackErr := s.runtime.GetContainerNetworkInfo(ctx, containerID) + if fallbackErr != nil { + return "", 0, fallbackErr } + log.Debug(). + Str("ip", ip). + Int("port", internalPort). + Msg("resolved container endpoint via internal IP (no host port binding)") + return ip, internalPort, nil } // waitForReadyByDelay waits using the legacy running+delay strategy. diff --git a/internal/usecase/container/service_readiness_test.go b/internal/usecase/container/service_readiness_test.go index 1e613376..a7c3e2b6 100644 --- a/internal/usecase/container/service_readiness_test.go +++ b/internal/usecase/container/service_readiness_test.go @@ -1,16 +1,31 @@ package container import ( + "net" "strings" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" "github.com/bnema/gordon/internal/boundaries/out/mocks" + "github.com/bnema/gordon/internal/domain" ) +// reserveEphemeralAddr binds on 127.0.0.1:0, captures the assigned port, closes +// the listener, and returns the address string. The port is guaranteed to be +// unreachable (no process listening) when the probe runs. 
+func reserveEphemeralAddr(t *testing.T) (ip string, port int) { + t.Helper() + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + addr := ln.Addr().(*net.TCPAddr) + ln.Close() + return addr.IP.String(), addr.Port +} + func TestService_WaitForReady_AutoFallsBackToDelayWhenNoHealthcheck(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) svc := NewService(runtime, nil, nil, nil, Config{ @@ -28,7 +43,166 @@ func TestService_WaitForReady_AutoFallsBackToDelayWhenNoHealthcheck(t *testing.T // Delay-mode fallback verification runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() - err := svc.waitForReady(ctx, containerID) + err := svc.waitForReady(ctx, containerID, nil) + assert.NoError(t, err) +} + +func TestService_WaitForReady_AutoCascadeUsesTCPWhenNoHealthcheckAndNoHealthLabel(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + }) + + ctx := testContext() + containerID := "container-1" + + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{}, + } + + // Reserve an ephemeral loopback port that is guaranteed to be unreachable. + probeIP, probePort := reserveEphemeralAddr(t) + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No Docker healthcheck + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() + // Cascade resolves container endpoint for TCP probe + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return(probeIP, probePort, nil).Once() + // Host port binding resolution — return the same loopback addr so probe hits it + runtime.EXPECT().GetContainerPort(mock.Anything, containerID, probePort).Return(probePort, nil).Once() + + // TCP probe will try to connect — which will fail since the port is closed. 
+ // Use a short health timeout so the test doesn't hang. + svc.mu.Lock() + svc.config.TCPProbeTimeout = 50 * time.Millisecond + svc.mu.Unlock() + + err := svc.waitForReady(ctx, containerID, containerConfig) + // Expect TCP probe timeout (no server listening) + assert.Error(t, err) + assert.Contains(t, err.Error(), "TCP probe timeout") +} + +func TestService_WaitForReady_AutoCascadeUsesHTTPProbeWhenHealthLabelSet(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + HTTPProbeTimeout: 50 * time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{ + domain.LabelHealth: "/healthz", + }, + } + + // Reserve an ephemeral loopback port that is guaranteed to be unreachable. + probeIP, probePort := reserveEphemeralAddr(t) + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No Docker healthcheck + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() + // Cascade resolves container endpoint for HTTP probe + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return(probeIP, probePort, nil).Once() + // Host port binding resolution — return the same loopback addr so probe hits it + runtime.EXPECT().GetContainerPort(mock.Anything, containerID, probePort).Return(probePort, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) + // Expect HTTP probe timeout (no server listening) + assert.Error(t, err) + assert.Contains(t, err.Error(), "HTTP probe timeout") +} + +func TestService_WaitForReady_AutoCascadeUsesHealthcheckWhenPresent(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + HealthTimeout: 50 * time.Millisecond, + }) + + ctx := 
testContext() + containerID := "container-1" + + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{ + domain.LabelHealth: "/healthz", + }, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // Docker healthcheck IS present — cascade detects it, then waitForHealthy polls it + // First call: cascade detection (hasHealthcheck=true) + // Second call: waitForHealthy loop (status=healthy) + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("starting", true, nil).Once() + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("healthy", true, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) + assert.NoError(t, err) +} + +func TestService_WaitForReady_AutoCascadeFallsToDelayWhenNoEndpoint(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + ReadinessDelay: time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + // Container has ports but network info is unavailable + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{}, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No Docker healthcheck + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() + // Network info fails — can't do TCP probe, fall through to delay + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return("", 0, assert.AnError).Once() + // Delay-mode fallback verification + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) + assert.NoError(t, err) +} + +func TestService_WaitForReady_ExplicitDelayModeIgnoresCascade(t 
*testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "delay", + ReadinessDelay: time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + // Even with health label and ports, "delay" mode skips cascade entirely + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{ + domain.LabelHealth: "/healthz", + }, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No GetContainerHealthStatus call — delay mode skips it + // Delay-mode verification + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) assert.NoError(t, err) } diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index fae2eeb8..c1a9feb9 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -25,6 +25,17 @@ func testContext() context.Context { return zerowrap.WithCtx(context.Background(), zerowrap.Default()) } +// testMinDelayConfig returns a Config with all timing delays set to 1ms, +// suitable for unit tests that don't want to wait for real timeouts. 
+func testMinDelayConfig() Config { + return Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } +} + func setupMetricsTest(t *testing.T) (*telemetry.Metrics, *sdkmetric.ManualReader) { t.Helper() @@ -184,10 +195,11 @@ func TestService_Deploy_Success(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - NetworkIsolation: false, - VolumeAutoCreate: false, - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + NetworkIsolation: false, + VolumeAutoCreate: false, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -198,6 +210,9 @@ func TestService_Deploy_Success(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Setup mocks - no orphaned containers runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) @@ -261,10 +276,11 @@ func TestService_Deploy_ReadinessRecoveryWindow_AllowsTransientFlap(t *testing.T eventBus := mocks.NewMockEventPublisher(t) config := Config{ - NetworkIsolation: false, - VolumeAutoCreate: false, - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + NetworkIsolation: false, + VolumeAutoCreate: false, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -275,6 +291,9 @@ func TestService_Deploy_ReadinessRecoveryWindow_AllowsTransientFlap(t *testing.T Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + 
runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) runtime.EXPECT().PullImage(mock.Anything, "myapp:latest").Return(nil) @@ -329,6 +348,9 @@ func TestService_Deploy_ImagePullFailure(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) runtime.EXPECT().PullImage(mock.Anything, "myapp:latest").Return(errors.New("image not found")) @@ -406,8 +428,10 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -445,8 +469,8 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - // Wait for ready - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) // Inspect after ready runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ @@ -460,7 +484,7 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { // Synchronous cache invalidation before old container cleanup 
cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() - // Now cleanup old container (after new one is ready + cache invalidated + drain delay) + // Now cleanup old container (after new one is ready + cache invalidated + stabilization + drain delay) runtime.EXPECT().StopContainer(mock.Anything, "old-container").Return(nil) runtime.EXPECT().RemoveContainer(mock.Anything, "old-container", true).Return(nil) @@ -471,10 +495,106 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "new-container", result.ID) +} + +// TestService_Deploy_ResolvesExistingFromRuntime_WhenMemoryStale verifies that +// when in-memory state has no tracked container for a domain, Deploy queries +// the runtime and discovers the running container. This prevents orphan cleanup +// from killing the real active container after a Gordon restart. +func TestService_Deploy_ResolvesExistingFromRuntime_WhenMemoryStale(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := testMinDelayConfig() + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // NO tracked container in memory — simulates Gordon restart + // (svc.containers is empty for "test.example.com") + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // resolveExistingContainer should query running containers from runtime + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{ + { + ID: "runtime-active-container", + Name: "gordon-test.example.com", + Status: "running", + Labels: map[string]string{ + domain.LabelDomain: "test.example.com", + domain.LabelManaged: "true", + }, + }, + }, nil) + + // Orphan cleanup lists ALL containers (including stopped) 
— the runtime-active + // container should be skipped because it is now recognized as the existing one + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ + { + ID: "runtime-active-container", + Name: "gordon-test.example.com", + Status: "running", + Labels: map[string]string{ + domain.LabelDomain: "test.example.com", + domain.LabelManaged: "true", + }, + }, + }, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container with -new suffix (zero-downtime: existing was resolved from runtime) + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Status: "running", + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // Synchronous cache invalidation + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() + + // AFTER new container is ready: stop and remove old 
runtime-discovered container + // This is the key assertion — StopContainer on the runtime-active container + // happens during finalizePreviousContainer, NOT during orphan cleanup + runtime.EXPECT().StopContainer(mock.Anything, "runtime-active-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "runtime-active-container", true).Return(nil) + + // Rename new container to canonical name + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.Equal(t, "new-container", result.ID) - // Ensure the service's tracked container entry now points to the new container. - tracked, ok := svc.Get(ctx, "test.example.com") - assert.True(t, ok) + // Verify new container is now tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) assert.Equal(t, "new-container", tracked.ID) } @@ -483,10 +603,7 @@ func TestService_Deploy_SkipRedundantDeploy_GetImageIDError(t *testing.T) { envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -519,7 +636,8 @@ func TestService_Deploy_SkipRedundantDeploy_GetImageIDError(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) 
runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -542,10 +660,11 @@ func TestService_Deploy_WithNetworkIsolation(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - NetworkIsolation: true, - NetworkPrefix: "gordon", - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + NetworkIsolation: true, + NetworkPrefix: "gordon", + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -555,6 +674,9 @@ func TestService_Deploy_WithNetworkIsolation(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil) @@ -594,10 +716,11 @@ func TestService_Deploy_WithVolumeAutoCreate(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - VolumeAutoCreate: true, - VolumePrefix: "gordon", - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + VolumeAutoCreate: true, + VolumePrefix: "gordon", + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -607,6 +730,9 @@ func TestService_Deploy_WithVolumeAutoCreate(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, 
false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil) @@ -1312,6 +1438,7 @@ func TestService_Deploy_InternalDeployForcesPull(t *testing.T) { InternalRegistryPassword: "secret", ReadinessDelay: time.Millisecond, DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -1324,6 +1451,9 @@ func TestService_Deploy_InternalDeployForcesPull(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Setup mocks - no orphaned containers runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) @@ -1379,8 +1509,9 @@ func TestService_AutoStart_StartsNewContainers(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1390,6 +1521,9 @@ func TestService_AutoStart_StartsNewContainers(t *testing.T) { {Domain: "app2.example.com", Image: "myapp2:latest"}, } + // No container in memory — runtime resolution returns nothing (one per route) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil).Times(2) + // Setup mocks for route deployments runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil).Times(2) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp1:latest", "myapp2:latest"}, nil).Times(2) @@ -1422,8 +1556,9 @@ func 
TestService_AutoStart_SkipsExistingContainers(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1438,6 +1573,9 @@ func TestService_AutoStart_SkipsExistingContainers(t *testing.T) { {Domain: "app2.example.com", Image: "myapp2:latest"}, // New route } + // No container in memory for app2 — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil).Once() + // Only deploy for app2 (app1 is skipped). Readiness is skipped — no IsContainerRunning. runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil).Once() runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp2:latest"}, nil).Once() @@ -1472,6 +1610,9 @@ func TestService_AutoStart_HandlesDeployErrors(t *testing.T) { {Domain: "app1.example.com", Image: "myapp1:latest"}, } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Setup mocks for failure runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) @@ -1490,10 +1631,11 @@ func TestService_AutoStart_UsesInternalDeployContext(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - RegistryDomain: "reg.example.com", - RegistryPort: 5000, - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + RegistryDomain: "reg.example.com", + RegistryPort: 5000, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -1504,6 +1646,9 @@ func 
TestService_AutoStart_UsesInternalDeployContext(t *testing.T) { {Domain: "app1.example.com", Image: "reg.example.com/myapp:latest"}, } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Key assertion: PullImage should be called with localhost:5000 rewrite, // NOT the original reg.example.com/myapp:latest. // Readiness is skipped — no IsContainerRunning calls. @@ -1538,8 +1683,10 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -1585,8 +1732,8 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - // Wait for ready - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) // Inspect after ready runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ @@ -1619,16 +1766,18 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { assert.Equal(t, "new-container", tracked.ID) } -// TestService_Deploy_OrphanCleanupRemovesTrueOrphans verifies that containers -// with the same name but NOT tracked are properly removed as orphans. 
+// TestService_Deploy_OrphanCleanupRemovesTrueOrphans verifies that stopped canonical +// containers with the same name but NOT tracked are properly removed as orphans. +// Running canonical containers are preserved (see NeverKillsRunningCanonical test). func TestService_Deploy_OrphanCleanupRemovesTrueOrphans(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1640,17 +1789,21 @@ func TestService_Deploy_OrphanCleanupRemovesTrueOrphans(t *testing.T) { Image: "myapp:v1", } - // ListContainers returns an orphaned container (same name, but not tracked) - // This could happen if Gordon crashed and restarted, or container was created manually + // No container in memory — runtime resolution finds no managed container + // (the orphan is stopped so it won't appear in running-only query) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + + // ListContainers(true) returns a STOPPED orphaned container (same name, but not tracked) + // Stopped canonical containers are safe to remove during orphan cleanup runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ { ID: "orphan-container", Name: "gordon-test.example.com", - Status: "running", + Status: "exited", }, }, nil) - // Orphan should be stopped and removed BEFORE we proceed + // Stopped orphan should be stopped and removed BEFORE we proceed runtime.EXPECT().StopContainer(mock.Anything, "orphan-container").Return(nil) runtime.EXPECT().RemoveContainer(mock.Anything, "orphan-container", true).Return(nil) @@ -1700,10 +1853,7 @@ func 
TestService_Deploy_OrphanCleanupRemovesStaleNewContainer(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -1746,7 +1896,8 @@ func TestService_Deploy_OrphanCleanupRemovesStaleNewContainer(t *testing.T) { return cfg.Name == "gordon-test.example.com-new" })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", @@ -1772,10 +1923,7 @@ func TestService_Deploy_TrackedTempContainerUsesAlternateTempName(t *testing.T) eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -1811,7 +1959,8 @@ func TestService_Deploy_TrackedTempContainerUsesAlternateTempName(t *testing.T) return cfg.Name == "gordon-test.example.com-next" })).Return(created, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, 
"new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", @@ -1961,10 +2110,7 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -1982,6 +2128,9 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { // First deploy: no existing container → creates gordon-test.example.com // Second deploy: sees first container → creates gordon-test.example.com-new + // First deploy resolves from runtime (no container in memory yet) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil).Once() + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil).Times(2) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil).Times(2) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil).Times(2) @@ -2005,8 +2154,8 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { }) runtime.EXPECT().StartContainer(mock.Anything, mock.AnythingOfType("string")).Return(nil).Times(2) - // IsContainerRunning is called 2x per deploy in waitForReady (initial check + after delay) - runtime.EXPECT().IsContainerRunning(mock.Anything, mock.AnythingOfType("string")).Return(true, nil).Times(4) + // IsContainerRunning: 2x per deploy in waitForReady + 1x stabilization for the second deploy (which has existing) + runtime.EXPECT().IsContainerRunning(mock.Anything, mock.AnythingOfType("string")).Return(true, nil).Times(5) runtime.EXPECT().InspectContainer(mock.Anything, 
mock.AnythingOfType("string")). RunAndReturn(func(_ context.Context, id string) (*domain.Container, error) { @@ -2069,8 +2218,9 @@ func TestService_Deploy_ContextCancellation(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -2097,10 +2247,7 @@ func TestService_Deploy_CacheInvalidationBeforeOldContainerStop(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -2132,7 +2279,8 @@ func TestService_Deploy_CacheInvalidationBeforeOldContainerStop(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2176,10 +2324,7 @@ func TestService_Deploy_NilCacheInvalidator(t *testing.T) { envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - 
} + config := testMinDelayConfig() // Intentionally NOT setting cache invalidator svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2205,7 +2350,8 @@ func TestService_Deploy_NilCacheInvalidator(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2226,8 +2372,9 @@ func TestService_Deploy_SkipsRedundantDeploy(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2273,16 +2420,68 @@ func TestService_Deploy_SkipsRedundantDeploy(t *testing.T) { assert.Equal(t, "existing-container", tracked.ID) } -func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { +func TestService_Deploy_SkipsRedundantDeploy_WhenTrackedImageIDMissing(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + 
DrainDelayConfigured: true, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // Pre-populate with existing container missing ImageID (common after stale in-memory state). + existingContainer := &domain.Container{ + ID: "existing-container", + Name: "gordon-test.example.com", + Image: "myapp:latest", + Status: "running", } + svc.containers["test.example.com"] = existingContainer + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:latest", + } + + // prepareDeployResources + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil) + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:latest").Return([]string{}, nil) + + // Existing container image ID is recovered from runtime inspection, + // then redundancy check can skip the deploy. 
+ runtime.EXPECT().InspectContainer(mock.Anything, "existing-container").Return(&domain.Container{ + ID: "existing-container", + ImageID: "sha256:abc123", + Status: "running", + }, nil) + runtime.EXPECT().GetImageID(mock.Anything, "myapp:latest").Return("sha256:abc123", nil) + runtime.EXPECT().IsContainerRunning(mock.Anything, "existing-container").Return(true, nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "existing-container", result.ID) + + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "existing-container", tracked.ID) +} + +func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -2316,7 +2515,8 @@ func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2380,10 +2580,7 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { envLoader := 
mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2417,7 +2614,8 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2433,3 +2631,527 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "new-container", result.ID) } + +// TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical verifies that orphan +// cleanup never stops a running container with the canonical name. Only stopped +// canonical containers and temp containers (-new/-next) should be cleaned up. +// The active canonical container is resolved via resolveExistingContainer (it has the +// managed label), so the new container is created with the "-new" suffix — Docker +// disallows creating a container with the same name as an existing running container. 
+func TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := testMinDelayConfig() + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // The canonical container is running and has the managed label — it will be + // discovered by resolveExistingContainer's slow path (no in-memory state). + existingContainer := &domain.Container{ + ID: "active-canonical", + Name: "gordon-test.example.com", + Status: "running", + Labels: map[string]string{domain.LabelManaged: "true"}, + } + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // resolveExistingContainer: slow path finds the running canonical container via label. + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{existingContainer}, nil) + + // cleanupOrphanedContainers: finds the running canonical AND a stale -new leftover. + // The canonical container must NOT be killed because it's running. + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ + existingContainer, + { + ID: "stale-leftover", + Name: "gordon-test.example.com-new", + Status: "exited", + }, + }, nil) + + // Only the stale -new container should be stopped and removed during orphan cleanup. 
+ runtime.EXPECT().StopContainer(mock.Anything, "stale-leftover").Return(nil).Once() + runtime.EXPECT().RemoveContainer(mock.Anything, "stale-leftover", true).Return(nil).Once() + + // Image operations: pull new image (existing.Image is empty so redundancy check is skipped) + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create container — existing is the canonical container, so the new one gets "-new" suffix. + // Docker enforces unique container names; using "-new" avoids a name collision. + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Wait for ready (delay mode): pollContainerRunning (1×) + waitForReadyByDelay (1×) + // Post-switch stabilization: 1× — total 3 calls. + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Status: "running", + Ports: []int{8080}, + }, nil) + + // Rename new container to canonical name after stabilization. + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil).Once() + + // Stop and remove old canonical container. 
+ runtime.EXPECT().StopContainer(mock.Anything, "active-canonical").Return(nil).Once() + runtime.EXPECT().RemoveContainer(mock.Anything, "active-canonical", true).Return(nil).Once() + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.Equal(t, "new-container", result.ID) + + // Verify new container is tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "new-container", tracked.ID) +} + +// TestService_Deploy_RollbackOnPostSwitchCrash verifies that if the new container crashes +// during the post-switch stabilization window, traffic is automatically rolled back to the +// old container: the old container is restored as tracked, the proxy cache is re-invalidated, +// and the failed new container is cleaned up. +func TestService_Deploy_RollbackOnPostSwitchCrash(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // Pre-populate with existing container + existing := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existing + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + 
runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness: new container is running during readiness checks (2 calls) + // Stabilization: new container has crashed (returns false) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(false, nil).Once() + + // Rollback: verify old container is still running before restoring + runtime.EXPECT().IsContainerRunning(mock.Anything, "old-container").Return(true, nil).Once() + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Name: "gordon-test.example.com-new", + Status: "running", + }, nil) + + // Publish event (happens during activateDeployedContainer) + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // InvalidateTarget called TWICE: once during activate, once during rollback + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return().Times(2) + + // Cleanup failed new container during rollback + runtime.EXPECT().StopContainer(mock.Anything, "new-container").Return(nil) + 
runtime.EXPECT().RemoveContainer(mock.Anything, "new-container", true).Return(nil) + + // StopContainer on old-container should NOT be called (old stays running) + // (no expectation for StopContainer("old-container")) + + result, err := svc.Deploy(ctx, route) + + // Deploy returns the old (existing) container after rollback + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "old-container", result.ID, "should return old container after rollback") + + // Verify old container is restored as tracked + trackedRB, existsRB := svc.Get(ctx, "test.example.com") + assert.True(t, existsRB) + assert.Equal(t, "old-container", trackedRB.ID, "old container should be restored after rollback") +} + +// TestService_Deploy_StabilizationSuccess verifies that when the new container remains +// healthy during the stabilization window, deployment proceeds normally and the old +// container is finalized (stopped and removed). +func TestService_Deploy_StabilizationSuccess(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // Pre-populate with existing container + existing := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existing + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + 
runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness (2 calls) + stabilization check (1 call) — all return running + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Name: "gordon-test.example.com-new", + Status: "running", + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // Cache invalidation (once, during activate) + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() + + // Old container finalized normally + runtime.EXPECT().StopContainer(mock.Anything, "old-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "old-container", true).Return(nil) + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "new-container", result.ID, "should return new container on successful stabilization") + + // Verify new container is tracked + trackedSS, 
existsSS := svc.Get(ctx, "test.example.com") + assert.True(t, existsSS) + assert.Equal(t, "new-container", trackedSS.ID) +} + +// TestService_Deploy_ZeroDowntime_OldNeverStoppedBeforeNewReady verifies the strict +// ordering invariant: StopContainer("old-container") must NEVER happen before +// InspectContainer("new-container") completes (the last step of readiness). +// Uses testify's NotBefore() to enforce mock call ordering. +func TestService_Deploy_ZeroDowntime_OldNeverStoppedBeforeNewReady(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // Pre-populate with existing tracked container + existingContainer := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existingContainer + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container with -new suffix for zero-downtime + newContainer := &domain.Container{ID: 
"new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness: pollContainerRunning (1 call) + waitForReadyByDelay verification (1 call) + // Stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // KEY ORDERING CONSTRAINT: InspectContainer("new-container") is the last step + // of readiness inside createStartedContainer. StopContainer("old-container") + // must happen AFTER this call completes. + inspectCall := runtime.EXPECT().InspectContainer(mock.Anything, "new-container"). + Return(&domain.Container{ + ID: "new-container", + Name: "gordon-test.example.com-new", + Status: "running", + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // Synchronous cache invalidation + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() + + // STRICT ORDERING: StopContainer on old container must NOT happen before + // InspectContainer on new container (readiness completion) + runtime.EXPECT().StopContainer(mock.Anything, "old-container"). + Return(nil). 
+ NotBefore(inspectCall.Call) + + runtime.EXPECT().RemoveContainer(mock.Anything, "old-container", true).Return(nil) + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "new-container", result.ID) + + // Verify new container is tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "new-container", tracked.ID) +} + +// TestService_Deploy_ZeroDowntime_ReadinessFailure_OldUntouched verifies that when a +// new container fails readiness (IsContainerRunning returns false), the old container +// is completely untouched: never stopped, never removed. Deploy returns an error and +// the old container remains as the tracked container. +func TestService_Deploy_ZeroDowntime_ReadinessFailure_OldUntouched(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // Pre-populate with existing tracked container + existingContainer := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existingContainer + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, 
nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness FAILS: pollContainerRunning succeeds (container starts) but + // waitForReadyByDelay verification finds it not running. + // pollContainerRunning: returns true (container initially starts) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Once() + // waitForReadyByDelay post-delay verification: returns false (container crashed) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(false, nil).Once() + // Recovery window poll: return error to fail fast instead of waiting 30s + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container"). + Return(false, errors.New("container exited")).Once() + + // cleanupFailedContainer: stop and remove the failed new container + runtime.EXPECT().StopContainer(mock.Anything, "new-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "new-container", true).Return(nil) + + // NOTE: StopContainer("old-container") and RemoveContainer("old-container", true) + // are intentionally NOT mocked. If either is called, testify will panic with + // "unexpected method call" — verifying the old container is never touched. 
+ + result, err := svc.Deploy(ctx, route) + + // Deploy should return an error (readiness failure) + assert.Error(t, err) + assert.Nil(t, result) + assert.Contains(t, err.Error(), "readiness check") + + // Old container must still be tracked — completely untouched + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists, "old container should still be tracked after readiness failure") + assert.Equal(t, "old-container", tracked.ID, "old container ID should be unchanged") +} + +// TestService_Deploy_ZeroDowntime_NoExisting_FirstDeploy is a sanity check: +// first deploy with no existing container should work normally without any +// finalize/drain logic being triggered. +func TestService_Deploy_ZeroDowntime_NoExisting_FirstDeploy(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // NO existing container — this is a first deploy + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // resolveExistingContainer: no container in memory, runtime returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + + // Cleanup orphans — nothing found + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + 
runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create container — canonical name (no -new suffix since no existing container) + newContainer := &domain.Container{ID: "first-container", Name: "gordon-test.example.com", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "first-container").Return(nil) + + // Readiness: pollContainerRunning (1 call) + waitForReadyByDelay verification (1 call) + // No stabilization check for first deploy (hasExisting is false) + runtime.EXPECT().IsContainerRunning(mock.Anything, "first-container").Return(true, nil).Times(2) + + // Inspect after readiness + runtime.EXPECT().InspectContainer(mock.Anything, "first-container").Return(&domain.Container{ + ID: "first-container", + Name: "gordon-test.example.com", + Status: "running", + Ports: []int{8080}, + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // NOTE: No StopContainer, RemoveContainer, or RenameContainer expectations — + // first deploy has no previous container to finalize, and no stabilization check. 
+ + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "first-container", result.ID) + assert.Equal(t, "running", result.Status) + + // Verify container is tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "first-container", tracked.ID) +} diff --git a/internal/usecase/proxy/service_test.go b/internal/usecase/proxy/service_test.go index 841bb907..ddb1b606 100644 --- a/internal/usecase/proxy/service_test.go +++ b/internal/usecase/proxy/service_test.go @@ -539,14 +539,9 @@ func TestDrainRegistryInFlight(t *testing.T) { svc.registryInFlight.Add(2) - done := make(chan struct{}) + result := make(chan bool, 1) go func() { - drained := svc.DrainRegistryInFlight(50 * time.Millisecond) - if !drained { - // Signal failure via done channel by leaving it open — test will time out - return - } - close(done) + result <- svc.DrainRegistryInFlight(50 * time.Millisecond) }() time.Sleep(5 * time.Millisecond) @@ -554,10 +549,12 @@ func TestDrainRegistryInFlight(t *testing.T) { svc.registryInFlight.Add(-1) select { - case <-done: - // good — drained cleanly + case drained := <-result: + if !drained { + t.Fatalf("DrainRegistryInFlight returned false; expected true after all requests completed") + } case <-time.After(500 * time.Millisecond): - t.Fatal("DrainRegistryInFlight did not return true after requests completed") + t.Fatal("DrainRegistryInFlight did not return within timeout") } } diff --git a/internal/usecase/registry/service.go b/internal/usecase/registry/service.go index e1efff25..66602856 100644 --- a/internal/usecase/registry/service.go +++ b/internal/usecase/registry/service.go @@ -7,6 +7,8 @@ import ( "fmt" "io" "strings" + "sync" + "time" "github.com/bnema/zerowrap" "go.opentelemetry.io/otel" @@ -24,10 +26,11 @@ var registryTracer = otel.Tracer("gordon.registry") // Service implements the RegistryService interface. 
type Service struct { - blobStorage out.BlobStorage - manifestStorage out.ManifestStorage - eventBus out.EventPublisher - metrics *telemetry.Metrics + blobStorage out.BlobStorage + manifestStorage out.ManifestStorage + eventBus out.EventPublisher + metrics *telemetry.Metrics + suppressedImages sync.Map // imageName -> *time.Timer } // SetMetrics sets the telemetry metrics for the registry service. @@ -48,6 +51,85 @@ func NewService( } } +// SuppressDeployEvent marks an image name to skip image.pushed events. +// The suppression auto-expires after 2 minutes to prevent leaks. +func (s *Service) SuppressDeployEvent(imageName string) { + imageName = ExtractImageName(strings.TrimSpace(imageName)) + if imageName == "" { + return + } + + var timer *time.Timer + timer = time.AfterFunc(2*time.Minute, func() { + // Only delete if this timer is still the current one, preventing an + // old timer's callback from removing a newer suppression entry. + if v, ok := s.suppressedImages.Load(imageName); ok && v == timer { + s.suppressedImages.Delete(imageName) + } + }) + if existing, loaded := s.suppressedImages.LoadOrStore(imageName, timer); loaded { + existing.(*time.Timer).Stop() + s.suppressedImages.Store(imageName, timer) + } +} + +// ClearDeployEventSuppression removes event suppression for an image. +func (s *Service) ClearDeployEventSuppression(imageName string) { + imageName = ExtractImageName(strings.TrimSpace(imageName)) + if imageName == "" { + return + } + + if v, loaded := s.suppressedImages.LoadAndDelete(imageName); loaded { + v.(*time.Timer).Stop() + } +} + +// ExtractImageName returns just the repository path of a container image +// reference, stripping any registry host prefix, tag, and digest. 
+// Examples: +// +// "reg.example.com/team/my-app:latest" -> "team/my-app" +// "reg.example.com/my-app@sha256:abc" -> "my-app" +// "my-app:v1.2" -> "my-app" +func ExtractImageName(imageRef string) string { + name := imageRef + // Strip digest + if idx := strings.Index(name, "@"); idx != -1 { + name = name[:idx] + } + // Strip tag + // Find the last colon, but only strip it if it comes after any slash + // (to avoid treating a port number in the host as a tag). + if idx := strings.LastIndex(name, ":"); idx != -1 { + slashIdx := strings.LastIndex(name, "/") + if idx > slashIdx { + name = name[:idx] + } + } + // Strip registry host: if the first segment contains a dot or colon it is + // a registry hostname; remove it. + parts := strings.SplitN(name, "/", 2) + if len(parts) == 2 { + host := parts[0] + if strings.ContainsAny(host, ".:") || host == "localhost" { + name = parts[1] + } + } + return name +} + +// IsDeployEventSuppressed checks if deploy events are suppressed for an image. +func (s *Service) IsDeployEventSuppressed(imageName string) bool { + imageName = ExtractImageName(strings.TrimSpace(imageName)) + if imageName == "" { + return false + } + + _, exists := s.suppressedImages.Load(imageName) + return exists +} + // GetManifest retrieves a manifest by name and reference. func (s *Service) GetManifest(ctx context.Context, name, reference string) (*domain.Manifest, error) { ctx = zerowrap.CtxWithFields(ctx, map[string]any{ @@ -110,13 +192,17 @@ func (s *Service) PutManifest(ctx context.Context, manifest *domain.Manifest) (s // A docker push sends manifests by both digest and tag; firing only on // tag prevents duplicate deploy triggers for the same push. 
if s.eventBus != nil && !strings.HasPrefix(manifest.Reference, "sha256:") { - if err := s.eventBus.Publish(domain.EventImagePushed, domain.ImagePushedPayload{ - Name: manifest.Name, - Reference: manifest.Reference, - Manifest: manifest.Data, - Annotations: manifest.Annotations, - }); err != nil { - log.Warn().Err(err).Msg("failed to publish image pushed event") + if s.IsDeployEventSuppressed(manifest.Name) { + log.Info().Str("image", manifest.Name).Msg("skipping image.pushed event: CLI deploy intent active") + } else { + if err := s.eventBus.Publish(domain.EventImagePushed, domain.ImagePushedPayload{ + Name: manifest.Name, + Reference: manifest.Reference, + Manifest: manifest.Data, + Annotations: manifest.Annotations, + }); err != nil { + log.Warn().Err(err).Msg("failed to publish image pushed event") + } } } diff --git a/internal/usecase/registry/service_test.go b/internal/usecase/registry/service_test.go index 4b6efb16..b29f8401 100644 --- a/internal/usecase/registry/service_test.go +++ b/internal/usecase/registry/service_test.go @@ -11,6 +11,7 @@ import ( "github.com/bnema/zerowrap" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" "github.com/bnema/gordon/internal/boundaries/out/mocks" "github.com/bnema/gordon/internal/domain" @@ -437,3 +438,41 @@ func TestService_ListRepositories_Error(t *testing.T) { assert.Nil(t, repos) assert.Contains(t, err.Error(), "failed to list repositories") } + +func TestPutManifest_SkipsEventWhenDeployIntentSuppressed(t *testing.T) { + blobStorage := mocks.NewMockBlobStorage(t) + manifestStorage := mocks.NewMockManifestStorage(t) + eventBus := mocks.NewMockEventPublisher(t) + + svc := NewService(blobStorage, manifestStorage, eventBus) + + svc.SuppressDeployEvent("my-app") + + manifestStorage.EXPECT().PutManifest("my-app", "latest", "application/vnd.oci.image.manifest.v1+json", mock.Anything).Return(nil) + + manifest := &domain.Manifest{ + Name: "my-app", + Reference: 
"latest", + ContentType: "application/vnd.oci.image.manifest.v1+json", + Data: []byte(`{"schemaVersion":2}`), + } + + _, err := svc.PutManifest(context.Background(), manifest) + require.NoError(t, err) + + eventBus.AssertNotCalled(t, "Publish", mock.Anything, mock.Anything) +} + +func TestSuppressDeployEvent_ClearsCorrectly(t *testing.T) { + blobStorage := mocks.NewMockBlobStorage(t) + manifestStorage := mocks.NewMockManifestStorage(t) + eventBus := mocks.NewMockEventPublisher(t) + + svc := NewService(blobStorage, manifestStorage, eventBus) + + svc.SuppressDeployEvent("my-app") + assert.True(t, svc.IsDeployEventSuppressed("my-app")) + + svc.ClearDeployEventSuppression("my-app") + assert.False(t, svc.IsDeployEventSuppressed("my-app")) +} diff --git a/wiki/examples/production.md b/wiki/examples/production.md index 4957894e..bb802d83 100644 --- a/wiki/examples/production.md +++ b/wiki/examples/production.md @@ -58,7 +58,6 @@ preserve = true [network_isolation] enabled = true network_prefix = "prod" -dns_suffix = ".internal" # Application routes with pinned versions [routes]