From 84f57999d27263c1e99f5486d091dfa566d79cab Mon Sep 17 00:00:00 2001 From: brice Date: Sun, 1 Mar 2026 15:01:41 +0100 Subject: [PATCH 01/26] test: surface DrainRegistryInFlight failures immediately via result channel --- internal/usecase/proxy/service_test.go | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/internal/usecase/proxy/service_test.go b/internal/usecase/proxy/service_test.go index 841bb907..ddb1b606 100644 --- a/internal/usecase/proxy/service_test.go +++ b/internal/usecase/proxy/service_test.go @@ -539,14 +539,9 @@ func TestDrainRegistryInFlight(t *testing.T) { svc.registryInFlight.Add(2) - done := make(chan struct{}) + result := make(chan bool, 1) go func() { - drained := svc.DrainRegistryInFlight(50 * time.Millisecond) - if !drained { - // Signal failure via done channel by leaving it open — test will time out - return - } - close(done) + result <- svc.DrainRegistryInFlight(50 * time.Millisecond) }() time.Sleep(5 * time.Millisecond) @@ -554,10 +549,12 @@ func TestDrainRegistryInFlight(t *testing.T) { svc.registryInFlight.Add(-1) select { - case <-done: - // good — drained cleanly + case drained := <-result: + if !drained { + t.Fatalf("DrainRegistryInFlight returned false; expected true after all requests completed") + } case <-time.After(500 * time.Millisecond): - t.Fatal("DrainRegistryInFlight did not return true after requests completed") + t.Fatal("DrainRegistryInFlight did not return within timeout") } } From c7a2b36b57d5f4511ef0c79dba58e5981a02244d Mon Sep 17 00:00:00 2001 From: brice Date: Mon, 2 Mar 2026 16:46:15 +0100 Subject: [PATCH 02/26] fix: recover missing ImageID via inspect for redundant deploy skip When tracked container has no ImageID in memory (stale state after restart/sync), Deploy now inspects the running container to recover its ImageID before the redundancy check. This prevents unnecessary full redeploys when the same image is already running. --- internal/usecase/container/service.go | 32 +++++++++++-- internal/usecase/container/service_test.go | 54 ++++++++++++++++++++++ 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 0e5cf71f..a2694196 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -287,9 +287,15 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta // the exact same image (by Docker image ID), return it immediately. // This prevents the double-deploy caused by the event-based deploy // (triggered by image.pushed) racing with the explicit CLI deploy call. - if hasExisting && existing.ImageID != "" { - if skip, container := s.skipRedundantDeploy(ctx, existing, resources.actualImageRef); skip { - return container, nil + if hasExisting { + existingForSkip := existing + if existingForSkip.ImageID == "" && existingForSkip.Image != "" && normalizeImageRef(existingForSkip.Image) == normalizeImageRef(route.Image) { + existingForSkip = s.containerForRedundantCheck(ctx, existingForSkip) + } + if existingForSkip.ImageID != "" { + if skip, container := s.skipRedundantDeploy(ctx, existingForSkip, resources.actualImageRef); skip { + return container, nil + } } } @@ -347,6 +353,26 @@ func (s *Service) skipRedundantDeploy(ctx context.Context, existing *domain.Cont return true, existing } +func (s *Service) containerForRedundantCheck(ctx context.Context, existing *domain.Container) *domain.Container { + if existing == nil || existing.ImageID != "" || existing.ID == "" { + return existing + } + + log := zerowrap.FromCtx(ctx) + inspected, err := s.runtime.InspectContainer(ctx, existing.ID) + if err != nil { + log.Debug().Err(err).Str("container_id", existing.ID).Msg("cannot inspect existing container for redundancy check") + return existing + } + if inspected == nil || inspected.ImageID == "" { + return existing + } + + existingCopy := *existing + existingCopy.ImageID = inspected.ImageID + return &existingCopy +} + type deployResources struct { networkName string actualImageRef string diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index fae2eeb8..49b1e53a 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -2273,6 +2273,60 @@ func TestService_Deploy_SkipsRedundantDeploy(t *testing.T) { assert.Equal(t, "existing-container", tracked.ID) } +func TestService_Deploy_SkipsRedundantDeploy_WhenTrackedImageIDMissing(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // Pre-populate with existing container missing ImageID (common after stale in-memory state). + existingContainer := &domain.Container{ + ID: "existing-container", + Name: "gordon-test.example.com", + Image: "myapp:latest", + Status: "running", + } + svc.containers["test.example.com"] = existingContainer + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:latest", + } + + // prepareDeployResources + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil) + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:latest").Return([]string{}, nil) + + // Existing container image ID is recovered from runtime inspection, + // then redundancy check can skip the deploy. + runtime.EXPECT().InspectContainer(mock.Anything, "existing-container").Return(&domain.Container{ + ID: "existing-container", + ImageID: "sha256:abc123", + Status: "running", + }, nil) + runtime.EXPECT().GetImageID(mock.Anything, "myapp:latest").Return("sha256:abc123", nil) + runtime.EXPECT().IsContainerRunning(mock.Anything, "existing-container").Return(true, nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "existing-container", result.ID) + + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "existing-container", tracked.ID) +} + func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) envLoader := mocks.NewMockEnvLoader(t) From 5f1ff6a78c8aa6b1281049e1c06b647d3e916c98 Mon Sep 17 00:00:00 2001 From: brice Date: Mon, 2 Mar 2026 17:15:55 +0100 Subject: [PATCH 03/26] fix: resolve existing container from runtime when memory is stale Deploy now queries the runtime for a running container with matching labels when in-memory state is empty. This prevents orphan cleanup from killing the active container after Gordon restarts. --- internal/usecase/container/service.go | 45 ++++++- internal/usecase/container/service_test.go | 148 ++++++++++++++++++++- 2 files changed, 189 insertions(+), 4 deletions(-) diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index a2694196..b51d7772 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -276,7 +276,7 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta } }() - existing, hasExisting := s.getTrackedContainer(route.Domain) + existing, hasExisting := s.resolveExistingContainer(ctx, route.Domain) resources, err := s.prepareDeployResources(ctx, route, existing) if err != nil { @@ -388,6 +388,49 @@ func (s *Service) getTrackedContainer(domainName string) (*domain.Container, boo return container, ok } +// resolveExistingContainer returns the currently running container for a domain. +// It first checks the in-memory map (fast path). If not found, it queries the +// runtime for a running container with matching name and managed label. This +// handles cases where Gordon restarted and in-memory state is stale. +func (s *Service) resolveExistingContainer(ctx context.Context, domainName string) (*domain.Container, bool) { + // Fast path: check in-memory state + s.mu.RLock() + container, ok := s.containers[domainName] + s.mu.RUnlock() + if ok { + return container, true + } + + // Slow path: query runtime for running containers + log := zerowrap.FromCtx(ctx) + expectedName := fmt.Sprintf("gordon-%s", domainName) + + running, err := s.runtime.ListContainers(ctx, false) + if err != nil { + log.Warn().Err(err).Msg("failed to list running containers for existing container resolution") + return nil, false + } + + for _, c := range running { + if c.Name == expectedName && c.Labels[domain.LabelManaged] == "true" { + log.Info(). + Str("container_id", c.ID). + Str("container_name", c.Name). + Msg("resolved existing container from runtime (in-memory state was stale)") + + // Update in-memory state so subsequent lookups are fast + s.mu.Lock() + s.containers[domainName] = c + s.managedCount++ + s.mu.Unlock() + + return c, true + } + } + + return nil, false +} + func (s *Service) prepareDeployResources(ctx context.Context, route domain.Route, existing *domain.Container) (*deployResources, error) { log := zerowrap.FromCtx(ctx) diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index 49b1e53a..ba3a2d8c 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -198,6 +198,9 @@ func TestService_Deploy_Success(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Setup mocks - no orphaned containers runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) @@ -275,6 +278,9 @@ func TestService_Deploy_ReadinessRecoveryWindow_AllowsTransientFlap(t *testing.T Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) runtime.EXPECT().PullImage(mock.Anything, "myapp:latest").Return(nil) @@ -329,6 +335,9 @@ func TestService_Deploy_ImagePullFailure(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) runtime.EXPECT().PullImage(mock.Anything, "myapp:latest").Return(errors.New("image not found")) @@ -471,10 +480,109 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "new-container", result.ID) +} + +// TestService_Deploy_ResolvesExistingFromRuntime_WhenMemoryStale verifies that +// when in-memory state has no tracked container for a domain, Deploy queries +// the runtime and discovers the running container. This prevents orphan cleanup +// from killing the real active container after a Gordon restart. +func TestService_Deploy_ResolvesExistingFromRuntime_WhenMemoryStale(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // NO tracked container in memory — simulates Gordon restart + // (svc.containers is empty for "test.example.com") + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // resolveExistingContainer should query running containers from runtime + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{ + { + ID: "runtime-active-container", + Name: "gordon-test.example.com", + Status: "running", + Labels: map[string]string{ + domain.LabelDomain: "test.example.com", + domain.LabelManaged: "true", + }, + }, + }, nil) + + // Orphan cleanup lists ALL containers (including stopped) — the runtime-active + // container should be skipped because it is now recognized as the existing one + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ + { + ID: "runtime-active-container", + Name: "gordon-test.example.com", + Status: "running", + Labels: map[string]string{ + domain.LabelDomain: "test.example.com", + domain.LabelManaged: "true", + }, + }, + }, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) - // Ensure the service's tracked container entry now points to the new container. - tracked, ok := svc.Get(ctx, "test.example.com") - assert.True(t, ok) + // Create new container with -new suffix (zero-downtime: existing was resolved from runtime) + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Wait for ready + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Status: "running", + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // Synchronous cache invalidation + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() + + // AFTER new container is ready: stop and remove old runtime-discovered container + // This is the key assertion — StopContainer on the runtime-active container + // happens during finalizePreviousContainer, NOT during orphan cleanup + runtime.EXPECT().StopContainer(mock.Anything, "runtime-active-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "runtime-active-container", true).Return(nil) + + // Rename new container to canonical name + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.Equal(t, "new-container", result.ID) + + // Verify new container is now tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) assert.Equal(t, "new-container", tracked.ID) } @@ -555,6 +663,9 @@ func TestService_Deploy_WithNetworkIsolation(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil) @@ -607,6 +718,9 @@ func TestService_Deploy_WithVolumeAutoCreate(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil) @@ -1324,6 +1438,9 @@ func TestService_Deploy_InternalDeployForcesPull(t *testing.T) { Image: "myapp:latest", } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Setup mocks - no orphaned containers runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) @@ -1390,6 +1507,9 @@ func TestService_AutoStart_StartsNewContainers(t *testing.T) { {Domain: "app2.example.com", Image: "myapp2:latest"}, } + // No container in memory — runtime resolution returns nothing (one per route) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil).Times(2) + // Setup mocks for route deployments runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil).Times(2) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp1:latest", "myapp2:latest"}, nil).Times(2) @@ -1438,6 +1558,9 @@ func TestService_AutoStart_SkipsExistingContainers(t *testing.T) { {Domain: "app2.example.com", Image: "myapp2:latest"}, // New route } + // No container in memory for app2 — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil).Once() + // Only deploy for app2 (app1 is skipped). Readiness is skipped — no IsContainerRunning. runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil).Once() runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp2:latest"}, nil).Once() @@ -1472,6 +1595,9 @@ func TestService_AutoStart_HandlesDeployErrors(t *testing.T) { {Domain: "app1.example.com", Image: "myapp1:latest"}, } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Setup mocks for failure runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) @@ -1504,6 +1630,9 @@ func TestService_AutoStart_UsesInternalDeployContext(t *testing.T) { {Domain: "app1.example.com", Image: "reg.example.com/myapp:latest"}, } + // No container in memory — runtime resolution returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // Key assertion: PullImage should be called with localhost:5000 rewrite, // NOT the original reg.example.com/myapp:latest. // Readiness is skipped — no IsContainerRunning calls. @@ -1640,6 +1769,16 @@ func TestService_Deploy_OrphanCleanupRemovesTrueOrphans(t *testing.T) { Image: "myapp:v1", } + // No container in memory — runtime resolution finds no managed container + // (the orphan lacks LabelManaged so it won't match) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{ + { + ID: "orphan-container", + Name: "gordon-test.example.com", + Status: "running", + }, + }, nil) + // ListContainers returns an orphaned container (same name, but not tracked) // This could happen if Gordon crashed and restarted, or container was created manually runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ @@ -1982,6 +2121,9 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { // First deploy: no existing container → creates gordon-test.example.com // Second deploy: sees first container → creates gordon-test.example.com-new + // First deploy resolves from runtime (no container in memory yet) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil).Once() + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil).Times(2) runtime.EXPECT().ListImages(mock.Anything).Return([]string{"myapp:latest"}, nil).Times(2) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:latest").Return([]int{8080}, nil).Times(2) From dca4ad56d48e0b38187a210f9637a54d1ee62e45 Mon Sep 17 00:00:00 2001 From: brice Date: Mon, 2 Mar 2026 17:20:18 +0100 Subject: [PATCH 04/26] fix: orphan cleanup never kills running canonical container Running containers with the canonical name are now preserved during orphan cleanup even if not tracked in memory. Only stopped canonical containers and temp (-new/-next) containers are cleaned up. --- internal/usecase/container/service.go | 12 +++ internal/usecase/container/service_test.go | 111 ++++++++++++++++++--- 2 files changed, 109 insertions(+), 14 deletions(-) diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index b51d7772..8210b291 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -1656,6 +1656,18 @@ func (s *Service) cleanupOrphanedContainers(ctx context.Context, domainName stri for _, c := range allContainers { if (c.Name == expectedName || c.Name == expectedNewName || c.Name == expectedNextName) && c.ID != skipContainerID { + // Never kill a running container with the canonical name — it may be + // the active container serving traffic. Only temp (-new/-next) containers + // and stopped canonical containers are safe to remove. + isCanonical := c.Name == expectedName + if isCanonical && c.Status == "running" { + log.Debug(). + Str(zerowrap.FieldEntityID, c.ID). + Str(zerowrap.FieldStatus, c.Status). + Msg("skipping running canonical container during orphan cleanup") + continue + } + log.Info().Str(zerowrap.FieldEntityID, c.ID).Str(zerowrap.FieldStatus, c.Status).Msg("found orphaned container, removing") if err := s.runtime.StopContainer(ctx, c.ID); err != nil { diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index ba3a2d8c..a8e12d15 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -1748,8 +1748,9 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { assert.Equal(t, "new-container", tracked.ID) } -// TestService_Deploy_OrphanCleanupRemovesTrueOrphans verifies that containers -// with the same name but NOT tracked are properly removed as orphans. +// TestService_Deploy_OrphanCleanupRemovesTrueOrphans verifies that stopped canonical +// containers with the same name but NOT tracked are properly removed as orphans. +// Running canonical containers are preserved (see NeverKillsRunningCanonical test). func TestService_Deploy_OrphanCleanupRemovesTrueOrphans(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) envLoader := mocks.NewMockEnvLoader(t) @@ -1770,26 +1771,20 @@ func TestService_Deploy_OrphanCleanupRemovesTrueOrphans(t *testing.T) { } // No container in memory — runtime resolution finds no managed container - // (the orphan lacks LabelManaged so it won't match) - runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{ - { - ID: "orphan-container", - Name: "gordon-test.example.com", - Status: "running", - }, - }, nil) + // (the orphan is stopped so it won't appear in running-only query) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) - // ListContainers returns an orphaned container (same name, but not tracked) - // This could happen if Gordon crashed and restarted, or container was created manually + // ListContainers(true) returns a STOPPED orphaned container (same name, but not tracked) + // Stopped canonical containers are safe to remove during orphan cleanup runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ { ID: "orphan-container", Name: "gordon-test.example.com", - Status: "running", + Status: "exited", }, }, nil) - // Orphan should be stopped and removed BEFORE we proceed + // Stopped orphan should be stopped and removed BEFORE we proceed runtime.EXPECT().StopContainer(mock.Anything, "orphan-container").Return(nil) runtime.EXPECT().RemoveContainer(mock.Anything, "orphan-container", true).Return(nil) @@ -2629,3 +2624,91 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "new-container", result.ID) } + +// TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical verifies that orphan +// cleanup never stops a running container with the canonical name, even if it is not +// tracked in memory and was not resolved by resolveExistingContainer. Only stopped +// canonical containers and temp containers (-new/-next) should be cleaned up. +func TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // NO tracked container in memory. resolveExistingContainer will NOT find it either + // because the running container lacks the managed label in this edge case. + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // resolveExistingContainer: ListContainers(ctx, false) returns empty — + // the canonical container is running but not discovered (e.g., missing label) + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + + // cleanupOrphanedContainers: ListContainers(ctx, true) finds both the running + // canonical container AND a stale -new leftover. The canonical container should + // NOT be killed because it's running and has the canonical name. + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ + { + ID: "active-canonical", + Name: "gordon-test.example.com", + Status: "running", + }, + { + ID: "stale-leftover", + Name: "gordon-test.example.com-new", + Status: "exited", + }, + }, nil) + + // Only the stale -new container should be stopped and removed during orphan cleanup + runtime.EXPECT().StopContainer(mock.Anything, "stale-leftover").Return(nil).Once() + runtime.EXPECT().RemoveContainer(mock.Anything, "stale-leftover", true).Return(nil).Once() + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create container — no existing tracked, so canonical name is used + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Wait for ready + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Status: "running", + Ports: []int{8080}, + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.Equal(t, "new-container", result.ID) + + // Verify new container is tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "new-container", tracked.ID) +} From 1c38208c82c9f0645e43f613dc6be379a6baf70b Mon Sep 17 00:00:00 2001 From: brice Date: Mon, 2 Mar 2026 17:23:12 +0100 Subject: [PATCH 05/26] feat: add TCP and HTTP readiness probes TCP probe retries every 500ms until port accepts connections. HTTP probe retries GET every 1s, expects 2xx/3xx. Both are universal readiness checks for deploy pipeline. --- internal/usecase/container/readiness.go | 68 +++++++++ internal/usecase/container/readiness_test.go | 152 +++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 internal/usecase/container/readiness.go create mode 100644 internal/usecase/container/readiness_test.go diff --git a/internal/usecase/container/readiness.go b/internal/usecase/container/readiness.go new file mode 100644 index 00000000..529effb5 --- /dev/null +++ b/internal/usecase/container/readiness.go @@ -0,0 +1,68 @@ +package container + +import ( + "context" + "fmt" + "net" + "net/http" + "time" +) + +// tcpProbe attempts a TCP connection to addr, retrying every 500ms until +// success or timeout. This is the universal fallback readiness check — +// it verifies the process is at least accepting connections. +func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for { + if err := ctx.Err(); err != nil { + return err + } + if time.Now().After(deadline) { + return fmt.Errorf("TCP probe timeout after %s: %s not reachable", timeout, addr) + } + + conn, err := net.DialTimeout("tcp", addr, time.Second) + if err == nil { + conn.Close() + return nil + } + + select { + case <-time.After(500 * time.Millisecond): + case <-ctx.Done(): + return ctx.Err() + } + } +} + +// httpProbe performs HTTP GET requests to url, retrying every 1s until a +// 2xx/3xx response or timeout. Used when gordon.health label is set. +func httpProbe(ctx context.Context, url string, timeout time.Duration) error { + client := &http.Client{Timeout: 2 * time.Second} + deadline := time.Now().Add(timeout) + var lastStatus int + + for { + if err := ctx.Err(); err != nil { + return err + } + if time.Now().After(deadline) { + return fmt.Errorf("HTTP probe timeout after %s: last status %d from %s", timeout, lastStatus, url) + } + + resp, err := client.Get(url) + if err == nil { + lastStatus = resp.StatusCode + resp.Body.Close() + if lastStatus >= 200 && lastStatus < 400 { + return nil + } + } + + select { + case <-time.After(time.Second): + case <-ctx.Done(): + return ctx.Err() + } + } +} diff --git a/internal/usecase/container/readiness_test.go b/internal/usecase/container/readiness_test.go new file mode 100644 index 00000000..d3b9fe08 --- /dev/null +++ b/internal/usecase/container/readiness_test.go @@ -0,0 +1,152 @@ +package container + +import ( + "context" + "net" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --- TCP Probe Tests --- + +func TestTCPProbe_Success(t *testing.T) { + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + defer ln.Close() + + // Accept connections in background so the probe succeeds + go func() { + for { + conn, err := ln.Accept() + if err != nil { + return + } + conn.Close() + } + }() + + ctx := testContext() + err = tcpProbe(ctx, ln.Addr().String(), 5*time.Second) + assert.NoError(t, err) +} + +func TestTCPProbe_Timeout(t *testing.T) { + // Use a port that nothing listens on (port 1 is privileged, won't have a listener) + ctx := testContext() + err := tcpProbe(ctx, "127.0.0.1:1", 200*time.Millisecond) + assert.Error(t, err) + assert.Contains(t, err.Error(), "TCP probe timeout") +} + +func TestTCPProbe_ContextCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(testContext()) + cancel() + err := tcpProbe(ctx, "127.0.0.1:1", 5*time.Second) + assert.Error(t, err) +} + +func TestTCPProbe_DelayedListener(t *testing.T) { + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + // Close initially so probe fails at first + addr := ln.Addr().String() + ln.Close() + + // Re-open after 1 second + go func() { + time.Sleep(time.Second) + newLn, err := net.Listen("tcp", addr) + if err != nil { + return + } + defer newLn.Close() + for { + conn, err := newLn.Accept() + if err != nil { + return + } + conn.Close() + } + }() + + ctx := testContext() + err = tcpProbe(ctx, addr, 5*time.Second) + assert.NoError(t, err) +} + +// --- HTTP Probe Tests --- + +func TestHTTPProbe_Success(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/healthz" { + w.WriteHeader(200) + return + } + w.WriteHeader(404) + })) + defer srv.Close() + + ctx := testContext() + err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) + assert.NoError(t, err) +} + +func TestHTTPProbe_ServerError_ThenSuccess(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt32(&calls, 1) + if n < 3 { + w.WriteHeader(503) + return + } + w.WriteHeader(200) + })) + defer srv.Close() + + ctx := testContext() + err := httpProbe(ctx, srv.URL+"/healthz", 10*time.Second) + assert.NoError(t, err) + assert.GreaterOrEqual(t, atomic.LoadInt32(&calls), int32(3)) +} + +func TestHTTPProbe_Timeout(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(503) + })) + defer srv.Close() + + ctx := testContext() + err := httpProbe(ctx, srv.URL+"/healthz", 500*time.Millisecond) + assert.Error(t, err) + assert.Contains(t, err.Error(), "HTTP probe timeout") +} + +func TestHTTPProbe_ContextCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(503) + })) + defer srv.Close() + + ctx, cancel := context.WithCancel(testContext()) + cancel() + err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) + assert.Error(t, err) +} + +func TestHTTPProbe_RedirectIsSuccess(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(302) + })) + defer srv.Close() + + ctx := testContext() + // Disable follow redirects for the test — the probe should treat 3xx as success + err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) + assert.NoError(t, err) +} From 43c7dfb49f99328bec7fc5f5e62cacc2d62d0d23 Mon Sep 17 00:00:00 2001 From: brice Date: Mon, 2 Mar 2026 17:29:56 +0100 Subject: [PATCH 06/26] =?UTF-8?q?feat:=20readiness=20cascade=20=E2=80=94?= =?UTF-8?q?=20healthcheck=20>=20HTTP=20probe=20>=20TCP=20probe=20>=20delay?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit waitForReady now auto-detects the strongest available readiness signal in auto mode. Docker healthcheck is preferred, then HTTP probe via gordon.health label, then TCP connect probe to exposed port, then delay fallback as last resort. --- internal/usecase/container/service.go | 76 ++++++++- .../container/service_readiness_test.go | 152 +++++++++++++++++- 2 files changed, 219 insertions(+), 9 deletions(-) diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 8210b291..2e5818be 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -507,7 +507,7 @@ func (s *Service) createStartedContainer(ctx context.Context, route domain.Route return nil, log.WrapErr(err, "failed to start container") } if !domain.IsSkipReadiness(ctx) { - if err := s.waitForReady(ctx, newContainer.ID); err != nil { + if err := s.waitForReady(ctx, newContainer.ID, containerConfig); err != nil { s.cleanupFailedContainer(ctx, newContainer.ID) return nil, log.WrapErr(err, "container failed readiness check") } @@ -2146,7 +2146,7 @@ func (s *Service) findContainerByName(ctx context.Context, name string) *domain. return nil } -func (s *Service) waitForReady(ctx context.Context, containerID string) error { +func (s *Service) waitForReady(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig) error { if err := s.pollContainerRunning(ctx, containerID); err != nil { return err } @@ -2173,15 +2173,75 @@ func (s *Service) waitForReady(ctx context.Context, containerID string) error { } return s.waitForHealthy(ctx, containerID, cfg.HealthTimeout) default: // auto - _, hasHealthcheck, err := s.runtime.GetContainerHealthStatus(ctx, containerID) - if err != nil { - return err + return s.readinessCascade(ctx, containerID, containerConfig, cfg) + } +} + +// readinessCascade auto-detects the strongest available readiness signal: +// 1. Docker healthcheck (if present) → wait for healthy status +// 2. HTTP probe (if gordon.health label set) → GET until 2xx/3xx +// 3. TCP probe (if port info available) → connect until accepted +// 4. Delay fallback (last resort) → waitForReadyByDelay +func (s *Service) readinessCascade(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig, cfg Config) error { + log := zerowrap.FromCtx(ctx) + + // 1. Docker healthcheck + _, hasHealthcheck, err := s.runtime.GetContainerHealthStatus(ctx, containerID) + if err != nil { + return err + } + if hasHealthcheck { + log.Info().Msg("readiness cascade: using Docker healthcheck") + return s.waitForHealthy(ctx, containerID, cfg.HealthTimeout) + } + + // 2. HTTP probe via gordon.health label + if containerConfig != nil { + if healthPath, ok := containerConfig.Labels[domain.LabelHealth]; ok && healthPath != "" { + ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID, containerConfig) + if probeErr == nil && ip != "" && port > 0 { + url := fmt.Sprintf("http://%s:%d%s", ip, port, healthPath) + timeout := cfg.HealthTimeout + if timeout == 0 { + timeout = 60 * time.Second + } + log.Info().Str("url", url).Dur("timeout", timeout).Msg("readiness cascade: using HTTP probe") + return httpProbe(ctx, url, timeout) + } + // Could not resolve endpoint; fall through to next level + log.Debug().Err(probeErr).Msg("readiness cascade: HTTP probe skipped, could not resolve container endpoint") } - if hasHealthcheck { - return s.waitForHealthy(ctx, containerID, cfg.HealthTimeout) + } + + // 3. TCP probe (if port info available) + if containerConfig != nil && len(containerConfig.Ports) > 0 { + ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID, containerConfig) + if probeErr == nil && ip != "" && port > 0 { + addr := fmt.Sprintf("%s:%d", ip, port) + timeout := cfg.HealthTimeout + if timeout == 0 { + timeout = 30 * time.Second + } + log.Info().Str("addr", addr).Dur("timeout", timeout).Msg("readiness cascade: using TCP probe") + return tcpProbe(ctx, addr, timeout) } - return s.waitForReadyByDelay(ctx, containerID) + // Could not resolve endpoint; fall through to delay + log.Debug().Err(probeErr).Msg("readiness cascade: TCP probe skipped, could not resolve container endpoint") + } + + // 4. Delay fallback + log.Info().Msg("readiness cascade: using delay fallback") + return s.waitForReadyByDelay(ctx, containerID) +} + +// resolveContainerEndpoint retrieves the container's IP and first port via +// GetContainerNetworkInfo. The result can be used for HTTP or TCP probes. +func (s *Service) resolveContainerEndpoint(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig) (string, int, error) { + ip, port, err := s.runtime.GetContainerNetworkInfo(ctx, containerID) + if err != nil { + return "", 0, err } + return ip, port, nil } // waitForReadyByDelay waits using the legacy running+delay strategy. diff --git a/internal/usecase/container/service_readiness_test.go b/internal/usecase/container/service_readiness_test.go index 1e613376..bc2e5fbf 100644 --- a/internal/usecase/container/service_readiness_test.go +++ b/internal/usecase/container/service_readiness_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/mock" "github.com/bnema/gordon/internal/boundaries/out/mocks" + "github.com/bnema/gordon/internal/domain" ) func TestService_WaitForReady_AutoFallsBackToDelayWhenNoHealthcheck(t *testing.T) { @@ -28,7 +29,156 @@ func TestService_WaitForReady_AutoFallsBackToDelayWhenNoHealthcheck(t *testing.T // Delay-mode fallback verification runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() - err := svc.waitForReady(ctx, containerID) + err := svc.waitForReady(ctx, containerID, nil) + assert.NoError(t, err) +} + +func TestService_WaitForReady_AutoCascadeUsesTCPWhenNoHealthcheckAndNoHealthLabel(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + }) + + ctx := testContext() + containerID := "container-1" + + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{}, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No Docker healthcheck + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() + // Cascade resolves container endpoint for TCP probe + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return("172.17.0.5", 8080, nil).Once() + + // TCP probe will try to connect to 172.17.0.5:8080 — which will fail since + // there's nothing listening. Use a short health timeout so the test doesn't hang. + svc.mu.Lock() + svc.config.HealthTimeout = 50 * time.Millisecond + svc.mu.Unlock() + + err := svc.waitForReady(ctx, containerID, containerConfig) + // Expect TCP probe timeout (no server listening at 172.17.0.5:8080) + assert.Error(t, err) + assert.Contains(t, err.Error(), "TCP probe timeout") +} + +func TestService_WaitForReady_AutoCascadeUsesHTTPProbeWhenHealthLabelSet(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + HealthTimeout: 50 * time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{ + domain.LabelHealth: "/healthz", + }, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No Docker healthcheck + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() + // Cascade resolves container endpoint for HTTP probe + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return("172.17.0.5", 8080, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) + // Expect HTTP probe timeout (no server listening) + assert.Error(t, err) + assert.Contains(t, err.Error(), "HTTP probe timeout") +} + +func TestService_WaitForReady_AutoCascadeUsesHealthcheckWhenPresent(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + HealthTimeout: 50 * time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{ + domain.LabelHealth: "/healthz", + }, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // Docker healthcheck IS present — cascade detects it, then waitForHealthy polls it + // First call: cascade detection (hasHealthcheck=true) + // Second call: waitForHealthy loop (status=healthy) + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("starting", true, nil).Once() + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("healthy", true, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) + assert.NoError(t, err) +} + +func TestService_WaitForReady_AutoCascadeFallsToDelayWhenNoEndpoint(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "auto", + ReadinessDelay: time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + // Container has ports but network info is unavailable + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{}, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No Docker healthcheck + runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() + // Network info fails — can't do TCP probe, fall through to delay + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return("", 0, assert.AnError).Once() + // Delay-mode fallback verification + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) + assert.NoError(t, err) +} + +func TestService_WaitForReady_ExplicitDelayModeIgnoresCascade(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + svc := NewService(runtime, nil, nil, nil, Config{ + ReadinessMode: "delay", + ReadinessDelay: time.Millisecond, + }) + + ctx := testContext() + containerID := "container-1" + + // Even with health label and ports, "delay" mode skips cascade entirely + containerConfig := &domain.ContainerConfig{ + Ports: []int{8080}, + Labels: map[string]string{ + domain.LabelHealth: "/healthz", + }, + } + + // Initial running poll + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + // No GetContainerHealthStatus call — delay mode skips it + // Delay-mode verification + runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() + + err := svc.waitForReady(ctx, containerID, containerConfig) assert.NoError(t, err) } From b3c40513c879c2b542016fb7276a1697664a73aa Mon Sep 17 00:00:00 2001 From: brice Date: Mon, 2 Mar 2026 17:40:55 +0100 Subject: [PATCH 07/26] feat: post-switch stabilization window with automatic rollback After switching traffic to new container, Deploy monitors it briefly. If new container crashes during stabilization, traffic is switched back to old container and the failed new container is cleaned up. --- internal/usecase/container/service.go | 64 +++++ internal/usecase/container/service_test.go | 268 ++++++++++++++++++--- 2 files changed, 294 insertions(+), 38 deletions(-) diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 2e5818be..e1a5c879 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -48,6 +48,7 @@ type Config struct { DrainDelayConfigured bool // True when deploy.drain_delay was explicitly configured DrainMode string // Drain strategy: auto, inflight, delay DrainTimeout time.Duration // Max wait for in-flight requests to drain + StabilizationDelay time.Duration // Pause after traffic switch before verifying new container health } var tracer = otel.Tracer("gordon.container") @@ -305,6 +306,15 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta } invalidated := s.activateDeployedContainer(ctx, route.Domain, newContainer) + + // Post-switch stabilization: verify new container stays running + if hasExisting { + if !s.stabilizeNewContainer(ctx, route.Domain, newContainer, existing) { + // Rollback performed — old container is restored + return existing, nil + } + } + s.finalizePreviousContainer(ctx, route.Domain, existing, hasExisting, invalidated, newContainer.ID) // Start container log collection (non-blocking, errors don't fail deployment) @@ -549,6 +559,60 @@ func (s *Service) activateDeployedContainer(ctx context.Context, domainName stri return false } +// stabilizeNewContainer monitors the new container briefly after traffic switch. +// If it crashes during this window, rolls back to old container. +// Returns true if stabilization succeeded, false if rollback was performed. +func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, newContainer, oldContainer *domain.Container) bool { + log := zerowrap.FromCtx(ctx) + + if oldContainer == nil { + return true + } + + s.mu.RLock() + delay := s.config.StabilizationDelay + s.mu.RUnlock() + if delay == 0 { + delay = 2 * time.Second + } + + // Brief stabilization: verify new container is still running after delay + select { + case <-time.After(delay): + case <-ctx.Done(): + return true + } + + running, err := s.runtime.IsContainerRunning(ctx, newContainer.ID) + if err != nil || !running { + log.Error(). + Str("new_container", newContainer.ID). + Str("old_container", oldContainer.ID). + Msg("new container crashed during stabilization, rolling back to old") + + // Rollback: restore old container as tracked + s.mu.Lock() + s.containers[domainName] = oldContainer + s.mu.Unlock() + + // Re-invalidate proxy cache to point back to old + s.mu.RLock() + inv := s.cacheInvalidator + s.mu.RUnlock() + if inv != nil { + inv.InvalidateTarget(ctx, domainName) + } + + // Cleanup failed new container + _ = s.runtime.StopContainer(ctx, newContainer.ID) + _ = s.runtime.RemoveContainer(ctx, newContainer.ID, true) + + return false + } + + return true +} + func (s *Service) finalizePreviousContainer(ctx context.Context, domainName string, existing *domain.Container, hasExisting, invalidated bool, newContainerID string) { if !hasExisting { return diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index a8e12d15..dd060416 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -415,8 +415,9 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -454,8 +455,8 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - // Wait for ready - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) // Inspect after ready runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ @@ -469,7 +470,7 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { // Synchronous cache invalidation before old container cleanup cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() - // Now cleanup old container (after new one is ready + cache invalidated + drain delay) + // Now cleanup old container (after new one is ready + cache invalidated + stabilization + drain delay) runtime.EXPECT().StopContainer(mock.Anything, "old-container").Return(nil) runtime.EXPECT().RemoveContainer(mock.Anything, "old-container", true).Return(nil) @@ -493,8 +494,9 @@ func TestService_Deploy_ResolvesExistingFromRuntime_WhenMemoryStale(t *testing.T cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -551,8 +553,8 @@ func TestService_Deploy_ResolvesExistingFromRuntime_WhenMemoryStale(t *testing.T })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - // Wait for ready - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) // Inspect after ready runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ @@ -592,8 +594,9 @@ func TestService_Deploy_SkipRedundantDeploy_GetImageIDError(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -627,7 +630,8 @@ func TestService_Deploy_SkipRedundantDeploy_GetImageIDError(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -1667,8 +1671,9 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -1714,8 +1719,8 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - // Wait for ready - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) // Inspect after ready runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ @@ -1835,8 +1840,9 @@ func TestService_Deploy_OrphanCleanupRemovesStaleNewContainer(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -1880,7 +1886,8 @@ func TestService_Deploy_OrphanCleanupRemovesStaleNewContainer(t *testing.T) { return cfg.Name == "gordon-test.example.com-new" })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", @@ -1907,8 +1914,9 @@ func TestService_Deploy_TrackedTempContainerUsesAlternateTempName(t *testing.T) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -1945,7 +1953,8 @@ func TestService_Deploy_TrackedTempContainerUsesAlternateTempName(t *testing.T) return cfg.Name == "gordon-test.example.com-next" })).Return(created, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", @@ -2096,8 +2105,9 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -2142,8 +2152,8 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { }) runtime.EXPECT().StartContainer(mock.Anything, mock.AnythingOfType("string")).Return(nil).Times(2) - // IsContainerRunning is called 2x per deploy in waitForReady (initial check + after delay) - runtime.EXPECT().IsContainerRunning(mock.Anything, mock.AnythingOfType("string")).Return(true, nil).Times(4) + // IsContainerRunning: 2x per deploy in waitForReady + 1x stabilization for the second deploy (which has existing) + runtime.EXPECT().IsContainerRunning(mock.Anything, mock.AnythingOfType("string")).Return(true, nil).Times(5) runtime.EXPECT().InspectContainer(mock.Anything, mock.AnythingOfType("string")). RunAndReturn(func(_ context.Context, id string) (*domain.Container, error) { @@ -2235,8 +2245,9 @@ func TestService_Deploy_CacheInvalidationBeforeOldContainerStop(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -2269,7 +2280,8 @@ func TestService_Deploy_CacheInvalidationBeforeOldContainerStop(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2314,8 +2326,9 @@ func TestService_Deploy_NilCacheInvalidator(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } // Intentionally NOT setting cache invalidator svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -2342,7 +2355,8 @@ func TestService_Deploy_NilCacheInvalidator(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2471,8 +2485,9 @@ func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -2507,7 +2522,8 @@ func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2572,8 +2588,9 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2608,7 +2625,8 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.AnythingOfType("*domain.ContainerConfig")).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (2 calls) + stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", Status: "running", }, nil) @@ -2712,3 +2730,177 @@ func TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical(t *testing.T) { assert.True(t, exists) assert.Equal(t, "new-container", tracked.ID) } + +// TestService_Deploy_RollbackOnPostSwitchCrash verifies that if the new container crashes +// during the post-switch stabilization window, traffic is automatically rolled back to the +// old container: the old container is restored as tracked, the proxy cache is re-invalidated, +// and the failed new container is cleaned up. +func TestService_Deploy_RollbackOnPostSwitchCrash(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // Pre-populate with existing container + existing := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existing + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness: new container is running during readiness checks (2 calls) + // Stabilization: new container has crashed (returns false) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(false, nil).Once() + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Name: "gordon-test.example.com-new", + Status: "running", + }, nil) + + // Publish event (happens during activateDeployedContainer) + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // InvalidateTarget called TWICE: once during activate, once during rollback + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return().Times(2) + + // Cleanup failed new container during rollback + runtime.EXPECT().StopContainer(mock.Anything, "new-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "new-container", true).Return(nil) + + // StopContainer on old-container should NOT be called (old stays running) + // (no expectation for StopContainer("old-container")) + + result, err := svc.Deploy(ctx, route) + + // Deploy returns the old (existing) container after rollback + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "old-container", result.ID, "should return old container after rollback") + + // Verify old container is restored as tracked + trackedRB, existsRB := svc.Get(ctx, "test.example.com") + assert.True(t, existsRB) + assert.Equal(t, "old-container", trackedRB.ID, "old container should be restored after rollback") +} + +// TestService_Deploy_StabilizationSuccess verifies that when the new container remains +// healthy during the stabilization window, deployment proceeds normally and the old +// container is finalized (stopped and removed). +func TestService_Deploy_StabilizationSuccess(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // Pre-populate with existing container + existing := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existing + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness (2 calls) + stabilization check (1 call) — all return running + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // Inspect after ready + runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ + ID: "new-container", + Name: "gordon-test.example.com-new", + Status: "running", + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // Cache invalidation (once, during activate) + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() + + // Old container finalized normally + runtime.EXPECT().StopContainer(mock.Anything, "old-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "old-container", true).Return(nil) + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "new-container", result.ID, "should return new container on successful stabilization") + + // Verify new container is tracked + trackedSS, existsSS := svc.Get(ctx, "test.example.com") + assert.True(t, existsSS) + assert.Equal(t, "new-container", trackedSS.ID) +} From e6981dedb8c77a6a229082e2faa2c7cf5df40e46 Mon Sep 17 00:00:00 2001 From: brice Date: Mon, 2 Mar 2026 17:44:25 +0100 Subject: [PATCH 08/26] feat: add deploy config for stabilization delay and probe timeouts New config options: deploy.stabilization_delay (2s), deploy.tcp_probe_timeout (30s), deploy.http_probe_timeout (60s). --- internal/app/run.go | 6 ++++++ internal/usecase/container/service.go | 8 +++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/internal/app/run.go b/internal/app/run.go index 83b5174a..8293ca21 100644 --- a/internal/app/run.go +++ b/internal/app/run.go @@ -1193,6 +1193,9 @@ func createContainerService(ctx context.Context, v *viper.Viper, cfg Config, svc ReadinessDelay: v.GetDuration("deploy.readiness_delay"), ReadinessMode: v.GetString("deploy.readiness_mode"), HealthTimeout: v.GetDuration("deploy.health_timeout"), + StabilizationDelay: v.GetDuration("deploy.stabilization_delay"), + TCPProbeTimeout: v.GetDuration("deploy.tcp_probe_timeout"), + HTTPProbeTimeout: v.GetDuration("deploy.http_probe_timeout"), DrainDelay: v.GetDuration("deploy.drain_delay"), DrainMode: v.GetString("deploy.drain_mode"), DrainTimeout: v.GetDuration("deploy.drain_timeout"), @@ -2374,6 +2377,9 @@ func loadConfig(v *viper.Viper, configPath string) error { v.SetDefault("deploy.readiness_delay", "5s") v.SetDefault("deploy.readiness_mode", "auto") v.SetDefault("deploy.health_timeout", "90s") + v.SetDefault("deploy.stabilization_delay", "2s") + v.SetDefault("deploy.tcp_probe_timeout", "30s") + v.SetDefault("deploy.http_probe_timeout", "60s") v.SetDefault("deploy.drain_mode", "auto") v.SetDefault("deploy.drain_timeout", "30s") diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index e1a5c879..2d798de6 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -48,7 +48,9 @@ type Config struct { DrainDelayConfigured bool // True when deploy.drain_delay was explicitly configured DrainMode string // Drain strategy: auto, inflight, delay DrainTimeout time.Duration // Max wait for in-flight requests to drain - StabilizationDelay time.Duration // Pause after traffic switch before verifying new container health + StabilizationDelay time.Duration // Post-switch monitoring window (default 2s) + TCPProbeTimeout time.Duration // TCP probe timeout (default 30s) + HTTPProbeTimeout time.Duration // HTTP probe timeout (default 60s) } var tracer = otel.Tracer("gordon.container") @@ -2265,7 +2267,7 @@ func (s *Service) readinessCascade(ctx context.Context, containerID string, cont ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID, containerConfig) if probeErr == nil && ip != "" && port > 0 { url := fmt.Sprintf("http://%s:%d%s", ip, port, healthPath) - timeout := cfg.HealthTimeout + timeout := cfg.HTTPProbeTimeout if timeout == 0 { timeout = 60 * time.Second } @@ -2282,7 +2284,7 @@ func (s *Service) readinessCascade(ctx context.Context, containerID string, cont ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID, containerConfig) if probeErr == nil && ip != "" && port > 0 { addr := fmt.Sprintf("%s:%d", ip, port) - timeout := cfg.HealthTimeout + timeout := cfg.TCPProbeTimeout if timeout == 0 { timeout = 30 * time.Second } From 2a109aabedfa9cedb5b458e313ad6f19bec98e29 Mon Sep 17 00:00:00 2001 From: brice Date: Mon, 2 Mar 2026 18:21:07 +0100 Subject: [PATCH 09/26] test: add strict zero-downtime ordering tests Verify old container is never stopped before new readiness completes. Verify readiness failure leaves old container untouched. Verify first deploy works without existing container. --- internal/usecase/container/service_test.go | 251 +++++++++++++++++++++ 1 file changed, 251 insertions(+) diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index dd060416..c3517608 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -2904,3 +2904,254 @@ func TestService_Deploy_StabilizationSuccess(t *testing.T) { assert.True(t, existsSS) assert.Equal(t, "new-container", trackedSS.ID) } + +// TestService_Deploy_ZeroDowntime_OldNeverStoppedBeforeNewReady verifies the strict +// ordering invariant: StopContainer("old-container") must NEVER happen before +// InspectContainer("new-container") completes (the last step of readiness). +// Uses testify's NotBefore() to enforce mock call ordering. +func TestService_Deploy_ZeroDowntime_OldNeverStoppedBeforeNewReady(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + svc.SetProxyCacheInvalidator(cacheInvalidator) + ctx := testContext() + + // Pre-populate with existing tracked container + existingContainer := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existingContainer + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container with -new suffix for zero-downtime + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness: pollContainerRunning (1 call) + waitForReadyByDelay verification (1 call) + // Stabilization check (1 call) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) + + // KEY ORDERING CONSTRAINT: InspectContainer("new-container") is the last step + // of readiness inside createStartedContainer. StopContainer("old-container") + // must happen AFTER this call completes. + inspectCall := runtime.EXPECT().InspectContainer(mock.Anything, "new-container"). + Return(&domain.Container{ + ID: "new-container", + Name: "gordon-test.example.com-new", + Status: "running", + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // Synchronous cache invalidation + cacheInvalidator.EXPECT().InvalidateTarget(mock.Anything, "test.example.com").Return() + + // STRICT ORDERING: StopContainer on old container must NOT happen before + // InspectContainer on new container (readiness completion) + runtime.EXPECT().StopContainer(mock.Anything, "old-container"). + Return(nil). + NotBefore(inspectCall.Call) + + runtime.EXPECT().RemoveContainer(mock.Anything, "old-container", true).Return(nil) + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil) + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "new-container", result.ID) + + // Verify new container is tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "new-container", tracked.ID) +} + +// TestService_Deploy_ZeroDowntime_ReadinessFailure_OldUntouched verifies that when a +// new container fails readiness (IsContainerRunning returns false), the old container +// is completely untouched: never stopped, never removed. Deploy returns an error and +// the old container remains as the tracked container. +func TestService_Deploy_ZeroDowntime_ReadinessFailure_OldUntouched(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // Pre-populate with existing tracked container + existingContainer := &domain.Container{ + ID: "old-container", + Name: "gordon-test.example.com", + Status: "running", + } + svc.containers["test.example.com"] = existingContainer + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // Cleanup orphans + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create new container + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com-new" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) + + // Readiness FAILS: pollContainerRunning succeeds (container starts) but + // waitForReadyByDelay verification finds it not running. + // pollContainerRunning: returns true (container initially starts) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Once() + // waitForReadyByDelay post-delay verification: returns false (container crashed) + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(false, nil).Once() + // Recovery window poll: return error to fail fast instead of waiting 30s + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container"). + Return(false, errors.New("container exited")).Once() + + // cleanupFailedContainer: stop and remove the failed new container + runtime.EXPECT().StopContainer(mock.Anything, "new-container").Return(nil) + runtime.EXPECT().RemoveContainer(mock.Anything, "new-container", true).Return(nil) + + // NOTE: StopContainer("old-container") and RemoveContainer("old-container", true) + // are intentionally NOT mocked. If either is called, testify will panic with + // "unexpected method call" — verifying the old container is never touched. + + result, err := svc.Deploy(ctx, route) + + // Deploy should return an error (readiness failure) + assert.Error(t, err) + assert.Nil(t, result) + assert.Contains(t, err.Error(), "readiness check") + + // Old container must still be tracked — completely untouched + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists, "old container should still be tracked after readiness failure") + assert.Equal(t, "old-container", tracked.ID, "old container ID should be unchanged") +} + +// TestService_Deploy_ZeroDowntime_NoExisting_FirstDeploy is a sanity check: +// first deploy with no existing container should work normally without any +// finalize/drain logic being triggered. +func TestService_Deploy_ZeroDowntime_NoExisting_FirstDeploy(t *testing.T) { + runtime := mocks.NewMockContainerRuntime(t) + envLoader := mocks.NewMockEnvLoader(t) + eventBus := mocks.NewMockEventPublisher(t) + + config := Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, + } + svc := NewService(runtime, envLoader, eventBus, nil, config) + ctx := testContext() + + // NO existing container — this is a first deploy + + route := domain.Route{ + Domain: "test.example.com", + Image: "myapp:v2", + } + + // resolveExistingContainer: no container in memory, runtime returns nothing + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + + // Cleanup orphans — nothing found + runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{}, nil) + + // Image operations + runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) + runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) + runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) + + // Environment + envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) + runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) + + // Create container — canonical name (no -new suffix since no existing container) + newContainer := &domain.Container{ID: "first-container", Name: "gordon-test.example.com", Status: "created"} + runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { + return cfg.Name == "gordon-test.example.com" + })).Return(newContainer, nil) + runtime.EXPECT().StartContainer(mock.Anything, "first-container").Return(nil) + + // Readiness: pollContainerRunning (1 call) + waitForReadyByDelay verification (1 call) + // No stabilization check for first deploy (hasExisting is false) + runtime.EXPECT().IsContainerRunning(mock.Anything, "first-container").Return(true, nil).Times(2) + + // Inspect after readiness + runtime.EXPECT().InspectContainer(mock.Anything, "first-container").Return(&domain.Container{ + ID: "first-container", + Name: "gordon-test.example.com", + Status: "running", + Ports: []int{8080}, + }, nil) + + // Publish event + eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) + + // NOTE: No StopContainer, RemoveContainer, or RenameContainer expectations — + // first deploy has no previous container to finalize, and no stabilization check. + + result, err := svc.Deploy(ctx, route) + + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, "first-container", result.ID) + assert.Equal(t, "running", result.Status) + + // Verify container is tracked + tracked, exists := svc.Get(ctx, "test.example.com") + assert.True(t, exists) + assert.Equal(t, "first-container", tracked.ID) +} From 5d857a7b713c62f7cdcd9218efdaa85a607ffdf1 Mon Sep 17 00:00:00 2001 From: brice Date: Tue, 3 Mar 2026 04:35:33 +0100 Subject: [PATCH 10/26] fix: address code review findings for zero-downtime deploy - httpProbe: add CheckRedirect to prevent following redirects, treating 3xx responses as successful readiness signals instead of chasing them - stabilizeNewContainer: verify old container is still running before rollback; return error when both containers are dead instead of silently returning a dead container reference - resolveExistingContainer: guard managedCount increment to prevent double-counting when domain is already tracked --- internal/usecase/container/readiness.go | 8 ++++- internal/usecase/container/readiness_test.go | 4 +-- internal/usecase/container/service.go | 38 ++++++++++++++++---- internal/usecase/container/service_test.go | 3 ++ 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/internal/usecase/container/readiness.go b/internal/usecase/container/readiness.go index 529effb5..45307ff8 100644 --- a/internal/usecase/container/readiness.go +++ b/internal/usecase/container/readiness.go @@ -38,7 +38,13 @@ func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { // httpProbe performs HTTP GET requests to url, retrying every 1s until a // 2xx/3xx response or timeout. Used when gordon.health label is set. func httpProbe(ctx context.Context, url string, timeout time.Duration) error { - client := &http.Client{Timeout: 2 * time.Second} + client := &http.Client{ + Timeout: 2 * time.Second, + CheckRedirect: func(req *http.Request, via []*http.Request) error { + // Don't follow redirects — treat 3xx as a successful response. + return http.ErrUseLastResponse + }, + } deadline := time.Now().Add(timeout) var lastStatus int diff --git a/internal/usecase/container/readiness_test.go b/internal/usecase/container/readiness_test.go index d3b9fe08..2e867634 100644 --- a/internal/usecase/container/readiness_test.go +++ b/internal/usecase/container/readiness_test.go @@ -141,12 +141,12 @@ func TestHTTPProbe_ContextCancelled(t *testing.T) { func TestHTTPProbe_RedirectIsSuccess(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(302) + http.Redirect(w, r, "/other", http.StatusFound) })) defer srv.Close() ctx := testContext() - // Disable follow redirects for the test — the probe should treat 3xx as success + // The probe should treat 3xx as success without following the redirect err := httpProbe(ctx, srv.URL+"/healthz", 5*time.Second) assert.NoError(t, err) } diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 2d798de6..0c6bcd13 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -311,7 +311,12 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta // Post-switch stabilization: verify new container stays running if hasExisting { - if !s.stabilizeNewContainer(ctx, route.Domain, newContainer, existing) { + stable, stabilizeErr := s.stabilizeNewContainer(ctx, route.Domain, newContainer, existing) + if stabilizeErr != nil { + // Both old and new containers are dead + return nil, stabilizeErr + } + if !stable { // Rollback performed — old container is restored return existing, nil } @@ -432,8 +437,10 @@ func (s *Service) resolveExistingContainer(ctx context.Context, domainName strin // Update in-memory state so subsequent lookups are fast s.mu.Lock() + if _, alreadyTracked := s.containers[domainName]; !alreadyTracked { + s.managedCount++ + } s.containers[domainName] = c - s.managedCount++ s.mu.Unlock() return c, true @@ -564,11 +571,12 @@ func (s *Service) activateDeployedContainer(ctx context.Context, domainName stri // stabilizeNewContainer monitors the new container briefly after traffic switch. // If it crashes during this window, rolls back to old container. // Returns true if stabilization succeeded, false if rollback was performed. -func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, newContainer, oldContainer *domain.Container) bool { +// Returns an error only when both new and old containers are dead. +func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, newContainer, oldContainer *domain.Container) (bool, error) { log := zerowrap.FromCtx(ctx) if oldContainer == nil { - return true + return true, nil } s.mu.RLock() @@ -582,7 +590,7 @@ func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, select { case <-time.After(delay): case <-ctx.Done(): - return true + return true, nil } running, err := s.runtime.IsContainerRunning(ctx, newContainer.ID) @@ -592,6 +600,22 @@ func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, Str("old_container", oldContainer.ID). Msg("new container crashed during stabilization, rolling back to old") + // Verify old container is still running before restoring it. + // If both old and new are dead (e.g., OOM), returning a dead + // container as "existing" would leave the domain in a broken state. + oldRunning, oldErr := s.runtime.IsContainerRunning(ctx, oldContainer.ID) + if oldErr != nil || !oldRunning { + log.Error(). + Str("old_container", oldContainer.ID). + Msg("old container is also not running, cannot rollback") + + // Cleanup failed new container + _ = s.runtime.StopContainer(ctx, newContainer.ID) + _ = s.runtime.RemoveContainer(ctx, newContainer.ID, true) + + return false, fmt.Errorf("stabilization failed: new container crashed and old container %s is also not running", oldContainer.ID) + } + // Rollback: restore old container as tracked s.mu.Lock() s.containers[domainName] = oldContainer @@ -609,10 +633,10 @@ func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, _ = s.runtime.StopContainer(ctx, newContainer.ID) _ = s.runtime.RemoveContainer(ctx, newContainer.ID, true) - return false + return false, nil } - return true + return true, nil } func (s *Service) finalizePreviousContainer(ctx context.Context, domainName string, existing *domain.Container, hasExisting, invalidated bool, newContainerID string) { diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index c3517608..339bc65c 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -2788,6 +2788,9 @@ func TestService_Deploy_RollbackOnPostSwitchCrash(t *testing.T) { runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(false, nil).Once() + // Rollback: verify old container is still running before restoring + runtime.EXPECT().IsContainerRunning(mock.Anything, "old-container").Return(true, nil).Once() + // Inspect after ready runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ ID: "new-container", From 0fd48ee82e9fb0b96c2319f8b82d82aa3bf4b352 Mon Sep 17 00:00:00 2001 From: brice Date: Tue, 3 Mar 2026 04:47:48 +0100 Subject: [PATCH 11/26] refactor: address CodeRabbit review nitpicks - Remove unused containerConfig parameter from resolveContainerEndpoint and update both call sites in readinessCascade - Add t.Logf in TestTCPProbe_DelayedListener to surface re-listen failures instead of silently swallowing them - Times(5) for IsContainerRunning in concurrent test is correct: 2x per deploy (pollContainerRunning + waitForReadyByDelay) + 1x stabilization for the second deploy = 5 total; left as-is --- internal/usecase/container/readiness_test.go | 1 + internal/usecase/container/service.go | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/usecase/container/readiness_test.go b/internal/usecase/container/readiness_test.go index 2e867634..153ce955 100644 --- a/internal/usecase/container/readiness_test.go +++ b/internal/usecase/container/readiness_test.go @@ -63,6 +63,7 @@ func TestTCPProbe_DelayedListener(t *testing.T) { time.Sleep(time.Second) newLn, err := net.Listen("tcp", addr) if err != nil { + t.Logf("delayed listener failed to bind: %v", err) return } defer newLn.Close() diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 0c6bcd13..5ad38233 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -2288,7 +2288,7 @@ func (s *Service) readinessCascade(ctx context.Context, containerID string, cont // 2. HTTP probe via gordon.health label if containerConfig != nil { if healthPath, ok := containerConfig.Labels[domain.LabelHealth]; ok && healthPath != "" { - ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID, containerConfig) + ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID) if probeErr == nil && ip != "" && port > 0 { url := fmt.Sprintf("http://%s:%d%s", ip, port, healthPath) timeout := cfg.HTTPProbeTimeout @@ -2305,7 +2305,7 @@ func (s *Service) readinessCascade(ctx context.Context, containerID string, cont // 3. TCP probe (if port info available) if containerConfig != nil && len(containerConfig.Ports) > 0 { - ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID, containerConfig) + ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID) if probeErr == nil && ip != "" && port > 0 { addr := fmt.Sprintf("%s:%d", ip, port) timeout := cfg.TCPProbeTimeout @@ -2326,7 +2326,7 @@ func (s *Service) readinessCascade(ctx context.Context, containerID string, cont // resolveContainerEndpoint retrieves the container's IP and first port via // GetContainerNetworkInfo. The result can be used for HTTP or TCP probes. -func (s *Service) resolveContainerEndpoint(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig) (string, int, error) { +func (s *Service) resolveContainerEndpoint(ctx context.Context, containerID string) (string, int, error) { ip, port, err := s.runtime.GetContainerNetworkInfo(ctx, containerID) if err != nil { return "", 0, err From 5c94f338f0403ab8a0bb32be6be562881fcd3e0e Mon Sep 17 00:00:00 2001 From: brice Date: Tue, 3 Mar 2026 04:57:11 +0100 Subject: [PATCH 12/26] chore: remove dead dns_suffix config field DNSSuffix was set from network_isolation.dns_suffix but never read anywhere in the codebase. Remove the struct field, config wiring, viper default, and all documentation references across docs/ and wiki/. --- docs/config/index.md | 1 - docs/config/network-isolation.md | 16 ---------------- docs/config/reference.md | 2 -- internal/app/run.go | 2 -- internal/usecase/container/service.go | 1 - wiki/examples/production.md | 1 - 6 files changed, 23 deletions(-) diff --git a/docs/config/index.md b/docs/config/index.md index a1d7aa87..f8f0e5c1 100644 --- a/docs/config/index.md +++ b/docs/config/index.md @@ -106,7 +106,6 @@ preserve = true # Keep volumes on container removal [network_isolation] enabled = true # Per-app isolated networks network_prefix = "gordon" # Network name prefix -dns_suffix = ".internal" # DNS suffix for services # Auto-route [auto_route] diff --git a/docs/config/network-isolation.md b/docs/config/network-isolation.md index 798fb374..a41bdb8a 100644 --- a/docs/config/network-isolation.md +++ b/docs/config/network-isolation.md @@ -8,7 +8,6 @@ Isolate applications in separate Docker networks for enhanced security. [network_isolation] enabled = true network_prefix = "gordon" -dns_suffix = ".internal" ``` ## Options @@ -17,7 +16,6 @@ dns_suffix = ".internal" |--------|------|---------|-------------| | `enabled` | bool | `false` | Enable per-app network isolation | | `network_prefix` | string | `"gordon"` | Prefix for created networks | -| `dns_suffix` | string | `".internal"` | DNS suffix for service discovery | ## How It Works @@ -99,19 +97,6 @@ db = connect("postgresql://postgres:5432/mydb") cache = connect("redis://redis:6379") ``` -## DNS Resolution - -The `dns_suffix` option adds a suffix for internal DNS resolution: - -```toml -[network_isolation] -dns_suffix = ".internal" -``` - -Services can be accessed as: -- `postgres` (short form) -- `postgres.internal` (with suffix) - ## Examples ### Basic Isolation @@ -137,7 +122,6 @@ Each app gets its own network with its own database. [network_isolation] enabled = true network_prefix = "prod" -dns_suffix = ".internal" [routes] "app.company.com" = "company-app:v2.1.0" diff --git a/docs/config/reference.md b/docs/config/reference.md index 2be23fcb..669434e9 100644 --- a/docs/config/reference.md +++ b/docs/config/reference.md @@ -105,7 +105,6 @@ enabled = false # Create routes from image labels a [network_isolation] enabled = false # Enable per-app Docker networks network_prefix = "gordon" # Prefix for created networks -dns_suffix = ".internal" # DNS suffix for internal resolution # ============================================================================= # VOLUMES @@ -215,7 +214,6 @@ keep_last = 3 # Keep N newest tags per repository | `auto_route.enabled` | `false` | Auto-route disabled | | `network_isolation.enabled` | `false` | Isolation disabled | | `network_isolation.network_prefix` | `"gordon"` | Network prefix | -| `network_isolation.dns_suffix` | `".internal"` | DNS suffix | | `volumes.auto_create` | `true` | Auto-create volumes | | `volumes.prefix` | `"gordon"` | Volume prefix | | `volumes.preserve` | `true` | Keep volumes | diff --git a/internal/app/run.go b/internal/app/run.go index 8293ca21..ce3712dc 100644 --- a/internal/app/run.go +++ b/internal/app/run.go @@ -1187,7 +1187,6 @@ func createContainerService(ctx context.Context, v *viper.Viper, cfg Config, svc VolumePreserve: v.GetBool("volumes.preserve"), NetworkIsolation: v.GetBool("network_isolation.enabled"), NetworkPrefix: v.GetString("network_isolation.network_prefix"), - DNSSuffix: v.GetString("network_isolation.dns_suffix"), NetworkGroups: svc.configSvc.GetNetworkGroups(), Attachments: svc.configSvc.GetAttachments(), ReadinessDelay: v.GetDuration("deploy.readiness_delay"), @@ -2349,7 +2348,6 @@ func loadConfig(v *viper.Viper, configPath string) error { v.SetDefault("auto_route.enabled", false) v.SetDefault("network_isolation.enabled", false) v.SetDefault("network_isolation.network_prefix", "gordon") - v.SetDefault("network_isolation.dns_suffix", ".internal") v.SetDefault("volumes.auto_create", true) v.SetDefault("volumes.prefix", "gordon") v.SetDefault("volumes.preserve", true) diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 5ad38233..ef72ae0c 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -38,7 +38,6 @@ type Config struct { VolumePreserve bool NetworkIsolation bool NetworkPrefix string - DNSSuffix string NetworkGroups map[string][]string Attachments map[string][]string ReadinessDelay time.Duration // Delay after container starts before considering it ready diff --git a/wiki/examples/production.md b/wiki/examples/production.md index 4957894e..bb802d83 100644 --- a/wiki/examples/production.md +++ b/wiki/examples/production.md @@ -58,7 +58,6 @@ preserve = true [network_isolation] enabled = true network_prefix = "prod" -dns_suffix = ".internal" # Application routes with pinned versions [routes] From 8147c672ef67241d54aff511b9c2216578feba78 Mon Sep 17 00:00:00 2001 From: bnema Date: Tue, 3 Mar 2026 21:28:41 +0000 Subject: [PATCH 13/26] refactor: address CodeRabbit review and fix lint gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract tryHTTPProbe/tryTCPProbe helpers from readinessCascade to reduce cyclomatic complexity (16 → ≤15, gocyclo) - Extract recordDeployMetrics helper from Deploy defer closure to reduce cyclomatic complexity (16 → ≤15, gocyclo) - Remove unused getTrackedContainer method (unused linter) - readiness.go: replace net.DialTimeout with net.Dialer.DialContext and client.Get with http.NewRequestWithContext; per-attempt timeouts clamped to caller deadline so neither probe can overrun the context - readiness_test.go: use ephemeral port (net.Listen :0 + close) instead of hardcoded port 1 for TestTCPProbe_Timeout - service_readiness_test.go: use loopback ephemeral ports for TCP/HTTP probe timeout mocks instead of hardcoded 172.17.0.5 - service_test.go: fix TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical to reflect real Docker behavior — existing canonical container is resolved via managed label so new container gets the -new suffix, avoiding a name collision that Docker would reject at runtime --- internal/usecase/container/readiness.go | 25 +++- internal/usecase/container/readiness_test.go | 10 +- internal/usecase/container/service.go | 127 ++++++++++-------- .../container/service_readiness_test.go | 36 +++-- internal/usecase/container/service_test.go | 60 +++++---- 5 files changed, 168 insertions(+), 90 deletions(-) diff --git a/internal/usecase/container/readiness.go b/internal/usecase/container/readiness.go index 45307ff8..e9cb9423 100644 --- a/internal/usecase/container/readiness.go +++ b/internal/usecase/container/readiness.go @@ -13,6 +13,7 @@ import ( // it verifies the process is at least accepting connections. func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { deadline := time.Now().Add(timeout) + dialer := &net.Dialer{} for { if err := ctx.Err(); err != nil { return err @@ -21,7 +22,14 @@ func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { return fmt.Errorf("TCP probe timeout after %s: %s not reachable", timeout, addr) } - conn, err := net.DialTimeout("tcp", addr, time.Second) + remaining := time.Until(deadline) + attemptTimeout := time.Second + if remaining < attemptTimeout { + attemptTimeout = remaining + } + attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout) + conn, err := dialer.DialContext(attemptCtx, "tcp", addr) + cancel() if err == nil { conn.Close() return nil @@ -39,7 +47,6 @@ func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { // 2xx/3xx response or timeout. Used when gordon.health label is set. func httpProbe(ctx context.Context, url string, timeout time.Duration) error { client := &http.Client{ - Timeout: 2 * time.Second, CheckRedirect: func(req *http.Request, via []*http.Request) error { // Don't follow redirects — treat 3xx as a successful response. return http.ErrUseLastResponse @@ -56,7 +63,19 @@ func httpProbe(ctx context.Context, url string, timeout time.Duration) error { return fmt.Errorf("HTTP probe timeout after %s: last status %d from %s", timeout, lastStatus, url) } - resp, err := client.Get(url) + remaining := time.Until(deadline) + attemptTimeout := 2 * time.Second + if remaining < attemptTimeout { + attemptTimeout = remaining + } + req, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if reqErr != nil { + return reqErr + } + attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout) + req = req.WithContext(attemptCtx) + resp, err := client.Do(req) + cancel() if err == nil { lastStatus = resp.StatusCode resp.Body.Close() diff --git a/internal/usecase/container/readiness_test.go b/internal/usecase/container/readiness_test.go index 153ce955..6f995f95 100644 --- a/internal/usecase/container/readiness_test.go +++ b/internal/usecase/container/readiness_test.go @@ -37,9 +37,15 @@ func TestTCPProbe_Success(t *testing.T) { } func TestTCPProbe_Timeout(t *testing.T) { - // Use a port that nothing listens on (port 1 is privileged, won't have a listener) + // Reserve an ephemeral port and close it immediately to get a deterministic + // unreachable target, avoiding assumptions about privileged ports. + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + addr := ln.Addr().String() + ln.Close() + ctx := testContext() - err := tcpProbe(ctx, "127.0.0.1:1", 200*time.Millisecond) + err = tcpProbe(ctx, addr, 200*time.Millisecond) assert.Error(t, err) assert.Contains(t, err.Error(), "TCP probe timeout") } diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index ef72ae0c..9d7b93b0 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -259,24 +259,7 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta log := zerowrap.FromCtx(ctx) // Record deploy metrics and trace status - defer func() { - if err != nil { - span.RecordError(err) - span.SetStatus(codes.Error, err.Error()) - } - if s.metrics == nil { - return - } - attrs := metric.WithAttributes( - attribute.String("domain", route.Domain), - attribute.String("image", route.Image), - ) - s.metrics.DeployTotal.Add(ctx, 1, attrs) - s.metrics.DeployDuration.Record(ctx, time.Since(deployStart).Seconds(), attrs) - if err != nil { - s.metrics.DeployErrors.Add(ctx, 1, attrs) - } - }() + defer s.recordDeployMetrics(ctx, span, route, deployStart, &err) existing, hasExisting := s.resolveExistingContainer(ctx, route.Domain) @@ -337,6 +320,29 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta return newContainer, nil } +// recordDeployMetrics records span error status and deploy metrics at the end of a Deploy call. +// It is called via defer and receives a pointer to the named return error so it can observe +// the final error value after all deferred functions have run. +func (s *Service) recordDeployMetrics(ctx context.Context, span trace.Span, route domain.Route, start time.Time, errPtr *error) { + err := *errPtr + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + if s.metrics == nil { + return + } + attrs := metric.WithAttributes( + attribute.String("domain", route.Domain), + attribute.String("image", route.Image), + ) + s.metrics.DeployTotal.Add(ctx, 1, attrs) + s.metrics.DeployDuration.Record(ctx, time.Since(start).Seconds(), attrs) + if err != nil { + s.metrics.DeployErrors.Add(ctx, 1, attrs) + } +} + // skipRedundantDeploy checks whether the existing container is already running // the same image (by Docker image ID) as the one we are about to deploy. // When a push triggers both an event-based deploy and an explicit CLI deploy, @@ -397,13 +403,6 @@ type deployResources struct { volumes map[string]string } -func (s *Service) getTrackedContainer(domainName string) (*domain.Container, bool) { - s.mu.RLock() - container, ok := s.containers[domainName] - s.mu.RUnlock() - return container, ok -} - // resolveExistingContainer returns the currently running container for a domain. // It first checks the in-memory map (fast path). If not found, it queries the // runtime for a running container with matching name and managed label. This @@ -2285,37 +2284,13 @@ func (s *Service) readinessCascade(ctx context.Context, containerID string, cont } // 2. HTTP probe via gordon.health label - if containerConfig != nil { - if healthPath, ok := containerConfig.Labels[domain.LabelHealth]; ok && healthPath != "" { - ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID) - if probeErr == nil && ip != "" && port > 0 { - url := fmt.Sprintf("http://%s:%d%s", ip, port, healthPath) - timeout := cfg.HTTPProbeTimeout - if timeout == 0 { - timeout = 60 * time.Second - } - log.Info().Str("url", url).Dur("timeout", timeout).Msg("readiness cascade: using HTTP probe") - return httpProbe(ctx, url, timeout) - } - // Could not resolve endpoint; fall through to next level - log.Debug().Err(probeErr).Msg("readiness cascade: HTTP probe skipped, could not resolve container endpoint") - } + if probed, probeErr := s.tryHTTPProbe(ctx, containerID, containerConfig, cfg); probed { + return probeErr } // 3. TCP probe (if port info available) - if containerConfig != nil && len(containerConfig.Ports) > 0 { - ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID) - if probeErr == nil && ip != "" && port > 0 { - addr := fmt.Sprintf("%s:%d", ip, port) - timeout := cfg.TCPProbeTimeout - if timeout == 0 { - timeout = 30 * time.Second - } - log.Info().Str("addr", addr).Dur("timeout", timeout).Msg("readiness cascade: using TCP probe") - return tcpProbe(ctx, addr, timeout) - } - // Could not resolve endpoint; fall through to delay - log.Debug().Err(probeErr).Msg("readiness cascade: TCP probe skipped, could not resolve container endpoint") + if probed, probeErr := s.tryTCPProbe(ctx, containerID, containerConfig, cfg); probed { + return probeErr } // 4. Delay fallback @@ -2323,6 +2298,52 @@ func (s *Service) readinessCascade(ctx context.Context, containerID string, cont return s.waitForReadyByDelay(ctx, containerID) } +// tryHTTPProbe attempts an HTTP probe if the gordon.health label is set. +// Returns (true, err) if the probe was attempted, (false, nil) if skipped. +func (s *Service) tryHTTPProbe(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig, cfg Config) (bool, error) { + log := zerowrap.FromCtx(ctx) + if containerConfig == nil { + return false, nil + } + healthPath, ok := containerConfig.Labels[domain.LabelHealth] + if !ok || healthPath == "" { + return false, nil + } + ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID) + if probeErr != nil || ip == "" || port <= 0 { + log.Debug().Err(probeErr).Msg("readiness cascade: HTTP probe skipped, could not resolve container endpoint") + return false, nil + } + url := fmt.Sprintf("http://%s:%d%s", ip, port, healthPath) + timeout := cfg.HTTPProbeTimeout + if timeout == 0 { + timeout = 60 * time.Second + } + log.Info().Str("url", url).Dur("timeout", timeout).Msg("readiness cascade: using HTTP probe") + return true, httpProbe(ctx, url, timeout) +} + +// tryTCPProbe attempts a TCP probe if port info is available. +// Returns (true, err) if the probe was attempted, (false, nil) if skipped. +func (s *Service) tryTCPProbe(ctx context.Context, containerID string, containerConfig *domain.ContainerConfig, cfg Config) (bool, error) { + log := zerowrap.FromCtx(ctx) + if containerConfig == nil || len(containerConfig.Ports) == 0 { + return false, nil + } + ip, port, probeErr := s.resolveContainerEndpoint(ctx, containerID) + if probeErr != nil || ip == "" || port <= 0 { + log.Debug().Err(probeErr).Msg("readiness cascade: TCP probe skipped, could not resolve container endpoint") + return false, nil + } + addr := fmt.Sprintf("%s:%d", ip, port) + timeout := cfg.TCPProbeTimeout + if timeout == 0 { + timeout = 30 * time.Second + } + log.Info().Str("addr", addr).Dur("timeout", timeout).Msg("readiness cascade: using TCP probe") + return true, tcpProbe(ctx, addr, timeout) +} + // resolveContainerEndpoint retrieves the container's IP and first port via // GetContainerNetworkInfo. The result can be used for HTTP or TCP probes. func (s *Service) resolveContainerEndpoint(ctx context.Context, containerID string) (string, int, error) { diff --git a/internal/usecase/container/service_readiness_test.go b/internal/usecase/container/service_readiness_test.go index bc2e5fbf..2dc70429 100644 --- a/internal/usecase/container/service_readiness_test.go +++ b/internal/usecase/container/service_readiness_test.go @@ -1,17 +1,31 @@ package container import ( + "net" "strings" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" "github.com/bnema/gordon/internal/boundaries/out/mocks" "github.com/bnema/gordon/internal/domain" ) +// reserveEphemeralAddr binds on 127.0.0.1:0, captures the assigned port, closes +// the listener, and returns the address string. The port is guaranteed to be +// unreachable (no process listening) when the probe runs. +func reserveEphemeralAddr(t *testing.T) (ip string, port int) { + t.Helper() + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + addr := ln.Addr().(*net.TCPAddr) + ln.Close() + return addr.IP.String(), addr.Port +} + func TestService_WaitForReady_AutoFallsBackToDelayWhenNoHealthcheck(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) svc := NewService(runtime, nil, nil, nil, Config{ @@ -47,21 +61,24 @@ func TestService_WaitForReady_AutoCascadeUsesTCPWhenNoHealthcheckAndNoHealthLabe Labels: map[string]string{}, } + // Reserve an ephemeral loopback port that is guaranteed to be unreachable. + probeIP, probePort := reserveEphemeralAddr(t) + // Initial running poll runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() // No Docker healthcheck runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() // Cascade resolves container endpoint for TCP probe - runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return("172.17.0.5", 8080, nil).Once() + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return(probeIP, probePort, nil).Once() - // TCP probe will try to connect to 172.17.0.5:8080 — which will fail since - // there's nothing listening. Use a short health timeout so the test doesn't hang. + // TCP probe will try to connect — which will fail since the port is closed. + // Use a short health timeout so the test doesn't hang. svc.mu.Lock() - svc.config.HealthTimeout = 50 * time.Millisecond + svc.config.TCPProbeTimeout = 50 * time.Millisecond svc.mu.Unlock() err := svc.waitForReady(ctx, containerID, containerConfig) - // Expect TCP probe timeout (no server listening at 172.17.0.5:8080) + // Expect TCP probe timeout (no server listening) assert.Error(t, err) assert.Contains(t, err.Error(), "TCP probe timeout") } @@ -69,8 +86,8 @@ func TestService_WaitForReady_AutoCascadeUsesTCPWhenNoHealthcheckAndNoHealthLabe func TestService_WaitForReady_AutoCascadeUsesHTTPProbeWhenHealthLabelSet(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) svc := NewService(runtime, nil, nil, nil, Config{ - ReadinessMode: "auto", - HealthTimeout: 50 * time.Millisecond, + ReadinessMode: "auto", + HTTPProbeTimeout: 50 * time.Millisecond, }) ctx := testContext() @@ -83,12 +100,15 @@ func TestService_WaitForReady_AutoCascadeUsesHTTPProbeWhenHealthLabelSet(t *test }, } + // Reserve an ephemeral loopback port that is guaranteed to be unreachable. + probeIP, probePort := reserveEphemeralAddr(t) + // Initial running poll runtime.EXPECT().IsContainerRunning(mock.Anything, containerID).Return(true, nil).Once() // No Docker healthcheck runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() // Cascade resolves container endpoint for HTTP probe - runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return("172.17.0.5", 8080, nil).Once() + runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return(probeIP, probePort, nil).Once() err := svc.waitForReady(ctx, containerID, containerConfig) // Expect HTTP probe timeout (no server listening) diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index 339bc65c..da71d893 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -2644,42 +2644,45 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { } // TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical verifies that orphan -// cleanup never stops a running container with the canonical name, even if it is not -// tracked in memory and was not resolved by resolveExistingContainer. Only stopped +// cleanup never stops a running container with the canonical name. Only stopped // canonical containers and temp containers (-new/-next) should be cleaned up. +// The active canonical container is resolved via resolveExistingContainer (it has the +// managed label), so the new container is created with the "-new" suffix — Docker +// disallows creating a container with the same name as an existing running container. func TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical(t *testing.T) { runtime := mocks.NewMockContainerRuntime(t) envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() - // NO tracked container in memory. resolveExistingContainer will NOT find it either - // because the running container lacks the managed label in this edge case. + // The canonical container is running and has the managed label — it will be + // discovered by resolveExistingContainer's slow path (no in-memory state). + existingContainer := &domain.Container{ + ID: "active-canonical", + Name: "gordon-test.example.com", + Status: "running", + Labels: map[string]string{domain.LabelManaged: "true"}, + } route := domain.Route{ Domain: "test.example.com", Image: "myapp:v2", } - // resolveExistingContainer: ListContainers(ctx, false) returns empty — - // the canonical container is running but not discovered (e.g., missing label) - runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{}, nil) + // resolveExistingContainer: slow path finds the running canonical container via label. + runtime.EXPECT().ListContainers(mock.Anything, false).Return([]*domain.Container{existingContainer}, nil) - // cleanupOrphanedContainers: ListContainers(ctx, true) finds both the running - // canonical container AND a stale -new leftover. The canonical container should - // NOT be killed because it's running and has the canonical name. + // cleanupOrphanedContainers: finds the running canonical AND a stale -new leftover. + // The canonical container must NOT be killed because it's running. runtime.EXPECT().ListContainers(mock.Anything, true).Return([]*domain.Container{ - { - ID: "active-canonical", - Name: "gordon-test.example.com", - Status: "running", - }, + existingContainer, { ID: "stale-leftover", Name: "gordon-test.example.com-new", @@ -2687,11 +2690,11 @@ func TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical(t *testing.T) { }, }, nil) - // Only the stale -new container should be stopped and removed during orphan cleanup + // Only the stale -new container should be stopped and removed during orphan cleanup. runtime.EXPECT().StopContainer(mock.Anything, "stale-leftover").Return(nil).Once() runtime.EXPECT().RemoveContainer(mock.Anything, "stale-leftover", true).Return(nil).Once() - // Image operations + // Image operations: pull new image (existing.Image is empty so redundancy check is skipped) runtime.EXPECT().ListImages(mock.Anything).Return([]string{}, nil) runtime.EXPECT().PullImage(mock.Anything, "myapp:v2").Return(nil) runtime.EXPECT().GetImageExposedPorts(mock.Anything, "myapp:v2").Return([]int{8080}, nil) @@ -2700,15 +2703,17 @@ func TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical(t *testing.T) { envLoader.EXPECT().LoadEnv(mock.Anything, "test.example.com").Return([]string{}, nil) runtime.EXPECT().InspectImageEnv(mock.Anything, "myapp:v2").Return([]string{}, nil) - // Create container — no existing tracked, so canonical name is used - newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com", Status: "created"} + // Create container — existing is the canonical container, so the new one gets "-new" suffix. + // Docker enforces unique container names; using "-new" avoids a name collision. + newContainer := &domain.Container{ID: "new-container", Name: "gordon-test.example.com-new", Status: "created"} runtime.EXPECT().CreateContainer(mock.Anything, mock.MatchedBy(func(cfg *domain.ContainerConfig) bool { - return cfg.Name == "gordon-test.example.com" + return cfg.Name == "gordon-test.example.com-new" })).Return(newContainer, nil) runtime.EXPECT().StartContainer(mock.Anything, "new-container").Return(nil) - // Wait for ready - runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(2) + // Wait for ready (delay mode): pollContainerRunning (1×) + waitForReadyByDelay (1×) + // Post-switch stabilization: 1× — total 3 calls. + runtime.EXPECT().IsContainerRunning(mock.Anything, "new-container").Return(true, nil).Times(3) // Inspect after ready runtime.EXPECT().InspectContainer(mock.Anything, "new-container").Return(&domain.Container{ @@ -2717,6 +2722,13 @@ func TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical(t *testing.T) { Ports: []int{8080}, }, nil) + // Rename new container to canonical name after stabilization. + runtime.EXPECT().RenameContainer(mock.Anything, "new-container", "gordon-test.example.com").Return(nil).Once() + + // Stop and remove old canonical container. + runtime.EXPECT().StopContainer(mock.Anything, "active-canonical").Return(nil).Once() + runtime.EXPECT().RemoveContainer(mock.Anything, "active-canonical", true).Return(nil).Once() + // Publish event eventBus.EXPECT().Publish(domain.EventContainerDeployed, mock.AnythingOfType("*domain.ContainerEventPayload")).Return(nil) From 2eb86f53cb809cd754b055e4a0756e15587b76ec Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 07:38:53 +0100 Subject: [PATCH 14/26] feat(registry): add deploy event suppression for CLI-managed pushes --- internal/usecase/registry/service.go | 54 ++++++++++++++++++----- internal/usecase/registry/service_test.go | 39 ++++++++++++++++ 2 files changed, 82 insertions(+), 11 deletions(-) diff --git a/internal/usecase/registry/service.go b/internal/usecase/registry/service.go index e1efff25..3c86f873 100644 --- a/internal/usecase/registry/service.go +++ b/internal/usecase/registry/service.go @@ -7,6 +7,8 @@ import ( "fmt" "io" "strings" + "sync" + "time" "github.com/bnema/zerowrap" "go.opentelemetry.io/otel" @@ -24,10 +26,11 @@ var registryTracer = otel.Tracer("gordon.registry") // Service implements the RegistryService interface. type Service struct { - blobStorage out.BlobStorage - manifestStorage out.ManifestStorage - eventBus out.EventPublisher - metrics *telemetry.Metrics + blobStorage out.BlobStorage + manifestStorage out.ManifestStorage + eventBus out.EventPublisher + metrics *telemetry.Metrics + suppressedImages sync.Map // imageName -> *time.Timer } // SetMetrics sets the telemetry metrics for the registry service. @@ -48,6 +51,31 @@ func NewService( } } +// SuppressDeployEvent marks an image name to skip image.pushed events. +// The suppression auto-expires after 2 minutes to prevent leaks. +func (s *Service) SuppressDeployEvent(imageName string) { + timer := time.AfterFunc(2*time.Minute, func() { + s.suppressedImages.Delete(imageName) + }) + if existing, loaded := s.suppressedImages.LoadOrStore(imageName, timer); loaded { + existing.(*time.Timer).Stop() + s.suppressedImages.Store(imageName, timer) + } +} + +// ClearDeployEventSuppression removes event suppression for an image. +func (s *Service) ClearDeployEventSuppression(imageName string) { + if v, loaded := s.suppressedImages.LoadAndDelete(imageName); loaded { + v.(*time.Timer).Stop() + } +} + +// IsDeployEventSuppressed checks if deploy events are suppressed for an image. +func (s *Service) IsDeployEventSuppressed(imageName string) bool { + _, exists := s.suppressedImages.Load(imageName) + return exists +} + // GetManifest retrieves a manifest by name and reference. func (s *Service) GetManifest(ctx context.Context, name, reference string) (*domain.Manifest, error) { ctx = zerowrap.CtxWithFields(ctx, map[string]any{ @@ -110,13 +138,17 @@ func (s *Service) PutManifest(ctx context.Context, manifest *domain.Manifest) (s // A docker push sends manifests by both digest and tag; firing only on // tag prevents duplicate deploy triggers for the same push. if s.eventBus != nil && !strings.HasPrefix(manifest.Reference, "sha256:") { - if err := s.eventBus.Publish(domain.EventImagePushed, domain.ImagePushedPayload{ - Name: manifest.Name, - Reference: manifest.Reference, - Manifest: manifest.Data, - Annotations: manifest.Annotations, - }); err != nil { - log.Warn().Err(err).Msg("failed to publish image pushed event") + if s.IsDeployEventSuppressed(manifest.Name) { + log.Info().Str("image", manifest.Name).Msg("skipping image.pushed event: CLI deploy intent active") + } else { + if err := s.eventBus.Publish(domain.EventImagePushed, domain.ImagePushedPayload{ + Name: manifest.Name, + Reference: manifest.Reference, + Manifest: manifest.Data, + Annotations: manifest.Annotations, + }); err != nil { + log.Warn().Err(err).Msg("failed to publish image pushed event") + } } } diff --git a/internal/usecase/registry/service_test.go b/internal/usecase/registry/service_test.go index 4b6efb16..b29f8401 100644 --- a/internal/usecase/registry/service_test.go +++ b/internal/usecase/registry/service_test.go @@ -11,6 +11,7 @@ import ( "github.com/bnema/zerowrap" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" "github.com/bnema/gordon/internal/boundaries/out/mocks" "github.com/bnema/gordon/internal/domain" @@ -437,3 +438,41 @@ func TestService_ListRepositories_Error(t *testing.T) { assert.Nil(t, repos) assert.Contains(t, err.Error(), "failed to list repositories") } + +func TestPutManifest_SkipsEventWhenDeployIntentSuppressed(t *testing.T) { + blobStorage := mocks.NewMockBlobStorage(t) + manifestStorage := mocks.NewMockManifestStorage(t) + eventBus := mocks.NewMockEventPublisher(t) + + svc := NewService(blobStorage, manifestStorage, eventBus) + + svc.SuppressDeployEvent("my-app") + + manifestStorage.EXPECT().PutManifest("my-app", "latest", "application/vnd.oci.image.manifest.v1+json", mock.Anything).Return(nil) + + manifest := &domain.Manifest{ + Name: "my-app", + Reference: "latest", + ContentType: "application/vnd.oci.image.manifest.v1+json", + Data: []byte(`{"schemaVersion":2}`), + } + + _, err := svc.PutManifest(context.Background(), manifest) + require.NoError(t, err) + + eventBus.AssertNotCalled(t, "Publish", mock.Anything, mock.Anything) +} + +func TestSuppressDeployEvent_ClearsCorrectly(t *testing.T) { + blobStorage := mocks.NewMockBlobStorage(t) + manifestStorage := mocks.NewMockManifestStorage(t) + eventBus := mocks.NewMockEventPublisher(t) + + svc := NewService(blobStorage, manifestStorage, eventBus) + + svc.SuppressDeployEvent("my-app") + assert.True(t, svc.IsDeployEventSuppressed("my-app")) + + svc.ClearDeployEventSuppression("my-app") + assert.False(t, svc.IsDeployEventSuppressed("my-app")) +} From 5c9a3136cbe919caa373627f6544679b0443c010 Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 07:40:11 +0100 Subject: [PATCH 15/26] feat(admin): add deploy-intent endpoint to suppress event-based deploys --- internal/adapters/in/http/admin/handler.go | 26 +++++ .../in/mocks/mock_registry_service.go | 80 +++++++++++++++ internal/boundaries/in/registry.go | 4 + internal/usecase/container/readiness.go | 13 ++- internal/usecase/container/service.go | 98 ++++++++++++++----- .../container/service_readiness_test.go | 4 + 6 files changed, 201 insertions(+), 24 deletions(-) diff --git a/internal/adapters/in/http/admin/handler.go b/internal/adapters/in/http/admin/handler.go index 012d7548..81e44e85 100644 --- a/internal/adapters/in/http/admin/handler.go +++ b/internal/adapters/in/http/admin/handler.go @@ -213,6 +213,7 @@ func (h *Handler) matchRoute(path string) (routeHandler, bool) { {"/routes/by-image", h.handleRoutesByImage}, {"/routes", h.handleRoutes}, {"/secrets", h.handleSecrets}, + {"/deploy-intent", h.handleDeployIntent}, {"/deploy", h.handleDeploy}, {"/restart", h.handleRestart}, {"/tags", h.handleTags}, @@ -228,6 +229,31 @@ func (h *Handler) matchRoute(path string) (routeHandler, bool) { return nil, false } +// handleDeployIntent handles /admin/deploy-intent/:image endpoint. +// It registers a deploy intent, suppressing event-based deploys for the image. +func (h *Handler) handleDeployIntent(w http.ResponseWriter, r *http.Request, path string) { + if r.Method != http.MethodPost { + h.sendError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + imageName := strings.TrimPrefix(path, "/deploy-intent/") + if imageName == "" || imageName == "/deploy-intent" { + h.sendError(w, http.StatusBadRequest, "image name required") + return + } + + log := zerowrap.FromCtx(r.Context()) + log.Info().Str("image", imageName).Msg("deploy intent registered, suppressing image.pushed events") + + h.registrySvc.SuppressDeployEvent(imageName) + + h.sendJSON(w, http.StatusOK, map[string]string{ + "status": "ok", + "image": imageName, + }) +} + // sendJSON sends a JSON response. func (h *Handler) sendJSON(w http.ResponseWriter, status int, data any) { w.Header().Set("Content-Type", "application/json") diff --git a/internal/boundaries/in/mocks/mock_registry_service.go b/internal/boundaries/in/mocks/mock_registry_service.go index 7ee610f4..52821cb6 100644 --- a/internal/boundaries/in/mocks/mock_registry_service.go +++ b/internal/boundaries/in/mocks/mock_registry_service.go @@ -231,6 +231,46 @@ func (_c *MockRegistryService_CancelUpload_Call) RunAndReturn(run func(ctx conte return _c } +// ClearDeployEventSuppression provides a mock function for the type MockRegistryService +func (_mock *MockRegistryService) ClearDeployEventSuppression(imageName string) { + _mock.Called(imageName) + return +} + +// MockRegistryService_ClearDeployEventSuppression_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ClearDeployEventSuppression' +type MockRegistryService_ClearDeployEventSuppression_Call struct { + *mock.Call +} + +// ClearDeployEventSuppression is a helper method to define mock.On call +// - imageName string +func (_e *MockRegistryService_Expecter) ClearDeployEventSuppression(imageName interface{}) *MockRegistryService_ClearDeployEventSuppression_Call { + return &MockRegistryService_ClearDeployEventSuppression_Call{Call: _e.mock.On("ClearDeployEventSuppression", imageName)} +} + +func (_c *MockRegistryService_ClearDeployEventSuppression_Call) Run(run func(imageName string)) *MockRegistryService_ClearDeployEventSuppression_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run( + arg0, + ) + }) + return _c +} + +func (_c *MockRegistryService_ClearDeployEventSuppression_Call) Return() *MockRegistryService_ClearDeployEventSuppression_Call { + _c.Call.Return() + return _c +} + +func (_c *MockRegistryService_ClearDeployEventSuppression_Call) RunAndReturn(run func(imageName string)) *MockRegistryService_ClearDeployEventSuppression_Call { + _c.Run(run) + return _c +} + // DeleteManifest provides a mock function for the type MockRegistryService func (_mock *MockRegistryService) DeleteManifest(ctx context.Context, name string, reference string) error { ret := _mock.Called(ctx, name, reference) @@ -895,3 +935,43 @@ func (_c *MockRegistryService_StartUpload_Call) RunAndReturn(run func(ctx contex _c.Call.Return(run) return _c } + +// SuppressDeployEvent provides a mock function for the type MockRegistryService +func (_mock *MockRegistryService) SuppressDeployEvent(imageName string) { + _mock.Called(imageName) + return +} + +// MockRegistryService_SuppressDeployEvent_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'SuppressDeployEvent' +type MockRegistryService_SuppressDeployEvent_Call struct { + *mock.Call +} + +// SuppressDeployEvent is a helper method to define mock.On call +// - imageName string +func (_e *MockRegistryService_Expecter) SuppressDeployEvent(imageName interface{}) *MockRegistryService_SuppressDeployEvent_Call { + return &MockRegistryService_SuppressDeployEvent_Call{Call: _e.mock.On("SuppressDeployEvent", imageName)} +} + +func (_c *MockRegistryService_SuppressDeployEvent_Call) Run(run func(imageName string)) *MockRegistryService_SuppressDeployEvent_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run( + arg0, + ) + }) + return _c +} + +func (_c *MockRegistryService_SuppressDeployEvent_Call) Return() *MockRegistryService_SuppressDeployEvent_Call { + _c.Call.Return() + return _c +} + +func (_c *MockRegistryService_SuppressDeployEvent_Call) RunAndReturn(run func(imageName string)) *MockRegistryService_SuppressDeployEvent_Call { + _c.Run(run) + return _c +} diff --git a/internal/boundaries/in/registry.go b/internal/boundaries/in/registry.go index 9eb8746c..c65c8993 100644 --- a/internal/boundaries/in/registry.go +++ b/internal/boundaries/in/registry.go @@ -29,4 +29,8 @@ type RegistryService interface { // Tag operations ListTags(ctx context.Context, name string) ([]string, error) ListRepositories(ctx context.Context) ([]string, error) + + // Deploy event suppression (prevents double-deploy when CLI manages the push) + SuppressDeployEvent(imageName string) + ClearDeployEventSuppression(imageName string) } diff --git a/internal/usecase/container/readiness.go b/internal/usecase/container/readiness.go index e9cb9423..76b85125 100644 --- a/internal/usecase/container/readiness.go +++ b/internal/usecase/container/readiness.go @@ -6,20 +6,25 @@ import ( "net" "net/http" "time" + + "github.com/bnema/zerowrap" ) // tcpProbe attempts a TCP connection to addr, retrying every 500ms until // success or timeout. This is the universal fallback readiness check — // it verifies the process is at least accepting connections. func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { + log := zerowrap.FromCtx(ctx) deadline := time.Now().Add(timeout) dialer := &net.Dialer{} + attempts := 0 + var lastErr error for { if err := ctx.Err(); err != nil { return err } if time.Now().After(deadline) { - return fmt.Errorf("TCP probe timeout after %s: %s not reachable", timeout, addr) + return fmt.Errorf("TCP probe timeout after %s: %s not reachable (attempts=%d, last_error=%v)", timeout, addr, attempts, lastErr) } remaining := time.Until(deadline) @@ -30,10 +35,16 @@ func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout) conn, err := dialer.DialContext(attemptCtx, "tcp", addr) cancel() + attempts++ if err == nil { conn.Close() + log.Debug().Str("addr", addr).Int("attempts", attempts).Msg("TCP probe connected") return nil } + lastErr = err + if attempts <= 3 || attempts%10 == 0 { + log.Debug().Err(err).Str("addr", addr).Int("attempt", attempts).Msg("TCP probe attempt failed") + } select { case <-time.After(500 * time.Millisecond): diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 9d7b93b0..719d8399 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -407,6 +407,11 @@ type deployResources struct { // It first checks the in-memory map (fast path). If not found, it queries the // runtime for a running container with matching name and managed label. This // handles cases where Gordon restarted and in-memory state is stale. +// +// The slow path checks canonical, -new, and -next names because a previous +// deploy may have been interrupted (e.g. eventbus timeout, Gordon restart) +// leaving the active container under a temp name. Canonical is preferred when +// multiple matches exist. func (s *Service) resolveExistingContainer(ctx context.Context, domainName string) (*domain.Container, bool) { // Fast path: check in-memory state s.mu.RLock() @@ -418,7 +423,12 @@ func (s *Service) resolveExistingContainer(ctx context.Context, domainName strin // Slow path: query runtime for running containers log := zerowrap.FromCtx(ctx) - expectedName := fmt.Sprintf("gordon-%s", domainName) + canonicalName := fmt.Sprintf("gordon-%s", domainName) + candidateNames := map[string]bool{ + canonicalName: true, + canonicalName + "-new": true, + canonicalName + "-next": true, + } running, err := s.runtime.ListContainers(ctx, false) if err != nil { @@ -426,23 +436,32 @@ func (s *Service) resolveExistingContainer(ctx context.Context, domainName strin return nil, false } + // Prefer canonical name; fall back to any running managed temp container. + var best *domain.Container for _, c := range running { - if c.Name == expectedName && c.Labels[domain.LabelManaged] == "true" { - log.Info(). - Str("container_id", c.ID). - Str("container_name", c.Name). - Msg("resolved existing container from runtime (in-memory state was stale)") - - // Update in-memory state so subsequent lookups are fast - s.mu.Lock() - if _, alreadyTracked := s.containers[domainName]; !alreadyTracked { - s.managedCount++ - } - s.containers[domainName] = c - s.mu.Unlock() + if !candidateNames[c.Name] || c.Labels[domain.LabelManaged] != "true" { + continue + } + if best == nil || c.Name == canonicalName { + best = c + } + } + + if best != nil { + log.Info(). + Str("container_id", best.ID). + Str("container_name", best.Name). + Msg("resolved existing container from runtime (in-memory state was stale)") - return c, true + // Update in-memory state so subsequent lookups are fast + s.mu.Lock() + if _, alreadyTracked := s.containers[domainName]; !alreadyTracked { + s.managedCount++ } + s.containers[domainName] = best + s.mu.Unlock() + + return best, true } return nil, false @@ -1744,19 +1763,26 @@ func (s *Service) cleanupOrphanedContainers(ctx context.Context, domainName stri for _, c := range allContainers { if (c.Name == expectedName || c.Name == expectedNewName || c.Name == expectedNextName) && c.ID != skipContainerID { - // Never kill a running container with the canonical name — it may be - // the active container serving traffic. Only temp (-new/-next) containers - // and stopped canonical containers are safe to remove. + // The skipContainerID is the container we resolved as the active + // one serving traffic. Any other container with a matching name is + // an orphan from a previous (possibly interrupted) deploy. + // + // Running containers with the canonical name are protected — they + // could be legitimate if resolveExistingContainer found a temp + // container instead. Running temp containers (-new/-next) that are + // NOT the skip container are safe to remove: they are leftovers + // from failed deploys. isCanonical := c.Name == expectedName if isCanonical && c.Status == "running" { log.Debug(). Str(zerowrap.FieldEntityID, c.ID). + Str("container_name", c.Name). Str(zerowrap.FieldStatus, c.Status). Msg("skipping running canonical container during orphan cleanup") continue } - log.Info().Str(zerowrap.FieldEntityID, c.ID).Str(zerowrap.FieldStatus, c.Status).Msg("found orphaned container, removing") + log.Info().Str(zerowrap.FieldEntityID, c.ID).Str("container_name", c.Name).Str(zerowrap.FieldStatus, c.Status).Msg("found orphaned container, removing") if err := s.runtime.StopContainer(ctx, c.ID); err != nil { log.WrapErrWithFields(err, "failed to stop orphaned container", map[string]any{zerowrap.FieldEntityID: c.ID}) @@ -2344,14 +2370,40 @@ func (s *Service) tryTCPProbe(ctx context.Context, containerID string, container return true, tcpProbe(ctx, addr, timeout) } -// resolveContainerEndpoint retrieves the container's IP and first port via -// GetContainerNetworkInfo. The result can be used for HTTP or TCP probes. +// resolveContainerEndpoint returns a host-reachable address for probing. +// In rootless podman/Docker setups, container internal IPs are not routable +// from the host. We use the host port binding (127.0.0.1:) +// which is always reachable. Falls back to the container's internal IP +// only if no host port mapping exists (e.g. host-network mode). func (s *Service) resolveContainerEndpoint(ctx context.Context, containerID string) (string, int, error) { - ip, port, err := s.runtime.GetContainerNetworkInfo(ctx, containerID) + log := zerowrap.FromCtx(ctx) + + // First, get the container's internal port from network info + _, internalPort, err := s.runtime.GetContainerNetworkInfo(ctx, containerID) if err != nil { return "", 0, err } - return ip, port, nil + + // Try to resolve via host port binding (works in rootless podman/Docker) + hostPort, hostErr := s.runtime.GetContainerPort(ctx, containerID, internalPort) + if hostErr == nil && hostPort > 0 { + log.Debug(). + Int("internal_port", internalPort). + Int("host_port", hostPort). + Msg("resolved container endpoint via host port binding") + return "127.0.0.1", hostPort, nil + } + + // Fallback: use internal IP (works in Docker rootful / host network) + ip, _, fallbackErr := s.runtime.GetContainerNetworkInfo(ctx, containerID) + if fallbackErr != nil { + return "", 0, fallbackErr + } + log.Debug(). + Str("ip", ip). + Int("port", internalPort). + Msg("resolved container endpoint via internal IP (no host port binding)") + return ip, internalPort, nil } // waitForReadyByDelay waits using the legacy running+delay strategy. diff --git a/internal/usecase/container/service_readiness_test.go b/internal/usecase/container/service_readiness_test.go index 2dc70429..a7c3e2b6 100644 --- a/internal/usecase/container/service_readiness_test.go +++ b/internal/usecase/container/service_readiness_test.go @@ -70,6 +70,8 @@ func TestService_WaitForReady_AutoCascadeUsesTCPWhenNoHealthcheckAndNoHealthLabe runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() // Cascade resolves container endpoint for TCP probe runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return(probeIP, probePort, nil).Once() + // Host port binding resolution — return the same loopback addr so probe hits it + runtime.EXPECT().GetContainerPort(mock.Anything, containerID, probePort).Return(probePort, nil).Once() // TCP probe will try to connect — which will fail since the port is closed. // Use a short health timeout so the test doesn't hang. @@ -109,6 +111,8 @@ func TestService_WaitForReady_AutoCascadeUsesHTTPProbeWhenHealthLabelSet(t *test runtime.EXPECT().GetContainerHealthStatus(mock.Anything, containerID).Return("", false, nil).Once() // Cascade resolves container endpoint for HTTP probe runtime.EXPECT().GetContainerNetworkInfo(mock.Anything, containerID).Return(probeIP, probePort, nil).Once() + // Host port binding resolution — return the same loopback addr so probe hits it + runtime.EXPECT().GetContainerPort(mock.Anything, containerID, probePort).Return(probePort, nil).Once() err := svc.waitForReady(ctx, containerID, containerConfig) // Expect HTTP probe timeout (no server listening) From 5ba38cffb391aaa5e5de90480df9cd6d5da5c183 Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 07:41:44 +0100 Subject: [PATCH 16/26] feat(cli): call deploy-intent before push to prevent event-based double deploy --- internal/adapters/in/cli/controlplane.go | 1 + internal/adapters/in/cli/controlplane_local.go | 7 +++++++ internal/adapters/in/cli/controlplane_remote.go | 4 ++++ internal/adapters/in/cli/push.go | 9 +++++++++ internal/adapters/in/cli/remote/client.go | 14 ++++++++++++++ 5 files changed, 35 insertions(+) diff --git a/internal/adapters/in/cli/controlplane.go b/internal/adapters/in/cli/controlplane.go index 3926c65e..296aa84e 100644 --- a/internal/adapters/in/cli/controlplane.go +++ b/internal/adapters/in/cli/controlplane.go @@ -34,6 +34,7 @@ type ControlPlane interface { GetStatus(ctx context.Context) (*remote.Status, error) Reload(ctx context.Context) error + DeployIntent(ctx context.Context, imageName string) error Deploy(ctx context.Context, deployDomain string) (*remote.DeployResult, error) Restart(ctx context.Context, restartDomain string, withAttachments bool) (*remote.RestartResult, error) ListTags(ctx context.Context, repository string) ([]string, error) diff --git a/internal/adapters/in/cli/controlplane_local.go b/internal/adapters/in/cli/controlplane_local.go index 60c87577..e60941b4 100644 --- a/internal/adapters/in/cli/controlplane_local.go +++ b/internal/adapters/in/cli/controlplane_local.go @@ -223,6 +223,13 @@ func (l *localControlPlane) Reload(_ context.Context) error { return app.SendReloadSignal() } +func (l *localControlPlane) DeployIntent(_ context.Context, imageName string) error { + if l.registrySvc != nil { + l.registrySvc.SuppressDeployEvent(imageName) + } + return nil +} + func (l *localControlPlane) Deploy(ctx context.Context, deployDomain string) (*remote.DeployResult, error) { if l.containerSvc != nil && l.configSvc != nil { route, err := l.configSvc.GetRoute(ctx, deployDomain) diff --git a/internal/adapters/in/cli/controlplane_remote.go b/internal/adapters/in/cli/controlplane_remote.go index 9b35f90a..a37b123f 100644 --- a/internal/adapters/in/cli/controlplane_remote.go +++ b/internal/adapters/in/cli/controlplane_remote.go @@ -88,6 +88,10 @@ func (r *remoteControlPlane) Reload(ctx context.Context) error { return r.client.Reload(ctx) } +func (r *remoteControlPlane) DeployIntent(ctx context.Context, imageName string) error { + return r.client.DeployIntent(ctx, imageName) +} + func (r *remoteControlPlane) Deploy(ctx context.Context, deployDomain string) (*remote.DeployResult, error) { return r.client.Deploy(ctx, deployDomain) } diff --git a/internal/adapters/in/cli/push.go b/internal/adapters/in/cli/push.go index d0e2f76e..3a20e6c5 100644 --- a/internal/adapters/in/cli/push.go +++ b/internal/adapters/in/cli/push.go @@ -154,6 +154,15 @@ func runPush(ctx context.Context, imageArg, domainFlag, tag string, build bool, } fmt.Printf("Domain: %s\n", styles.Theme.Bold.Render(pushDomain)) + // Signal the server to suppress event-based deploys for this image. + // The CLI will trigger an explicit deploy after push completes. + if !noDeploy { + if err := handle.plane.DeployIntent(ctx, imageName); err != nil { + // Non-fatal: worst case we get a redundant deploy via event + fmt.Fprintf(os.Stderr, "warning: failed to register deploy intent: %v\n", err) + } + } + if build { if err := buildAndPush(ctx, version, platform, dockerfile, buildArgs, versionRef, latestRef); err != nil { return err diff --git a/internal/adapters/in/cli/remote/client.go b/internal/adapters/in/cli/remote/client.go index 9ea2ca17..49644eaa 100644 --- a/internal/adapters/in/cli/remote/client.go +++ b/internal/adapters/in/cli/remote/client.go @@ -724,6 +724,20 @@ func (c *Client) Deploy(ctx context.Context, deployDomain string) (*DeployResult return &result, nil } +// DeployIntent tells the server that a CLI-managed push is about to happen, +// suppressing event-based deploys for this image. +func (c *Client) DeployIntent(ctx context.Context, imageName string) error { + resp, err := c.requestWithRetry(ctx, http.MethodPost, "/deploy-intent/"+url.PathEscape(imageName), nil) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("deploy-intent failed: %s", resp.Status) + } + return nil +} + // Restart API // RestartResult contains the result of a restart. From f5d1226cf6ea934eb51bcbcebdfb81a82c79472f Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 07:42:32 +0100 Subject: [PATCH 17/26] feat(admin): clear deploy event suppression after successful deploy --- internal/adapters/in/http/admin/handler.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/internal/adapters/in/http/admin/handler.go b/internal/adapters/in/http/admin/handler.go index 81e44e85..d5bf4a8b 100644 --- a/internal/adapters/in/http/admin/handler.go +++ b/internal/adapters/in/http/admin/handler.go @@ -1104,6 +1104,22 @@ func (h *Handler) handleDeploy(w http.ResponseWriter, r *http.Request, path stri return } + // Clear deploy event suppression now that the explicit deploy has completed. + // This re-enables event-based deploys for future direct docker pushes. + if route.Image != "" { + // Extract image name (without registry prefix and tag) for suppression key + imageName := route.Image + // The suppression key is just the image name as stored in the registry + // (e.g., "my-app" not "reg.example.com/my-app:latest") + if parts := strings.SplitN(imageName, "/", 2); len(parts) == 2 { + imageName = parts[1] + } + if idx := strings.LastIndex(imageName, ":"); idx != -1 { + imageName = imageName[:idx] + } + h.registrySvc.ClearDeployEventSuppression(imageName) + } + log.Info().Str("domain", deployDomain).Str("container_id", container.ID).Msg("container deployed via admin API") h.sendJSON(w, http.StatusOK, dto.DeployResponse{ Status: "deployed", From b5f1ca32a214ae175588a5b1b5950d551f7aa459 Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 09:42:58 +0100 Subject: [PATCH 18/26] fix: address code review findings from CR - registry: fix timer identity check in SuppressDeployEvent to prevent stale AfterFunc callbacks from evicting a newer suppression entry - registry: add ExtractImageName helper for canonical image ref parsing - admin/handler: add auth check, registrySvc nil-guard, and URL-decode for imageName in handleDeployIntent - admin/handler: use ExtractImageName and registrySvc nil-guard when clearing suppression after explicit deploy - container: assign stabilizeErr to named return err so deferred recordDeployMetrics and span see stabilization failures - container: skip all running containers (not just canonical) in cleanupOrphanedContainers to protect active temp containers - container: log StopContainer/RemoveContainer errors in rollback paths instead of silently discarding them --- internal/adapters/in/http/admin/handler.go | 39 +++++++++++++------- internal/usecase/container/service.go | 39 +++++++++++--------- internal/usecase/registry/service.go | 43 +++++++++++++++++++++- 3 files changed, 87 insertions(+), 34 deletions(-) diff --git a/internal/adapters/in/http/admin/handler.go b/internal/adapters/in/http/admin/handler.go index d5bf4a8b..5aa8079c 100644 --- a/internal/adapters/in/http/admin/handler.go +++ b/internal/adapters/in/http/admin/handler.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "net/http" + "net/url" "strconv" "strings" "time" @@ -237,13 +238,30 @@ func (h *Handler) handleDeployIntent(w http.ResponseWriter, r *http.Request, pat return } - imageName := strings.TrimPrefix(path, "/deploy-intent/") - if imageName == "" || imageName == "/deploy-intent" { + ctx := r.Context() + if !HasAccess(ctx, domain.AdminResourceConfig, domain.AdminActionWrite) { + h.sendError(w, http.StatusForbidden, "insufficient permissions for config:write") + return + } + + if h.registrySvc == nil { + h.sendError(w, http.StatusServiceUnavailable, "registry service unavailable") + return + } + + rawName := strings.TrimPrefix(path, "/deploy-intent/") + if rawName == "" || rawName == "/deploy-intent" { h.sendError(w, http.StatusBadRequest, "image name required") return } - log := zerowrap.FromCtx(r.Context()) + imageName, err := url.PathUnescape(rawName) + if err != nil { + h.sendError(w, http.StatusBadRequest, "invalid image name encoding") + return + } + + log := zerowrap.FromCtx(ctx) log.Info().Str("image", imageName).Msg("deploy intent registered, suppressing image.pushed events") h.registrySvc.SuppressDeployEvent(imageName) @@ -1106,17 +1124,10 @@ func (h *Handler) handleDeploy(w http.ResponseWriter, r *http.Request, path stri // Clear deploy event suppression now that the explicit deploy has completed. // This re-enables event-based deploys for future direct docker pushes. - if route.Image != "" { - // Extract image name (without registry prefix and tag) for suppression key - imageName := route.Image - // The suppression key is just the image name as stored in the registry - // (e.g., "my-app" not "reg.example.com/my-app:latest") - if parts := strings.SplitN(imageName, "/", 2); len(parts) == 2 { - imageName = parts[1] - } - if idx := strings.LastIndex(imageName, ":"); idx != -1 { - imageName = imageName[:idx] - } + if route.Image != "" && h.registrySvc != nil { + // Use the registry package's image name normaliser so digest-form refs + // and multi-segment paths are handled correctly. + imageName := registry.ExtractImageName(route.Image) h.registrySvc.ClearDeployEventSuppression(imageName) } diff --git a/internal/usecase/container/service.go b/internal/usecase/container/service.go index 719d8399..7aebc1fd 100644 --- a/internal/usecase/container/service.go +++ b/internal/usecase/container/service.go @@ -295,8 +295,10 @@ func (s *Service) Deploy(ctx context.Context, route domain.Route) (*domain.Conta if hasExisting { stable, stabilizeErr := s.stabilizeNewContainer(ctx, route.Domain, newContainer, existing) if stabilizeErr != nil { - // Both old and new containers are dead - return nil, stabilizeErr + // Both old and new containers are dead; assign to named return so + // the deferred recordDeployMetrics and span see the failure. + err = stabilizeErr + return nil, err } if !stable { // Rollback performed — old container is restored @@ -627,8 +629,12 @@ func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, Msg("old container is also not running, cannot rollback") // Cleanup failed new container - _ = s.runtime.StopContainer(ctx, newContainer.ID) - _ = s.runtime.RemoveContainer(ctx, newContainer.ID, true) + if stopErr := s.runtime.StopContainer(ctx, newContainer.ID); stopErr != nil { + log.WrapErrWithFields(stopErr, "failed to stop failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + if removeErr := s.runtime.RemoveContainer(ctx, newContainer.ID, true); removeErr != nil { + log.WrapErrWithFields(removeErr, "failed to remove failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } return false, fmt.Errorf("stabilization failed: new container crashed and old container %s is also not running", oldContainer.ID) } @@ -647,8 +653,12 @@ func (s *Service) stabilizeNewContainer(ctx context.Context, domainName string, } // Cleanup failed new container - _ = s.runtime.StopContainer(ctx, newContainer.ID) - _ = s.runtime.RemoveContainer(ctx, newContainer.ID, true) + if stopErr := s.runtime.StopContainer(ctx, newContainer.ID); stopErr != nil { + log.WrapErrWithFields(stopErr, "failed to stop failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } + if removeErr := s.runtime.RemoveContainer(ctx, newContainer.ID, true); removeErr != nil { + log.WrapErrWithFields(removeErr, "failed to remove failed new container during rollback", map[string]any{zerowrap.FieldEntityID: newContainer.ID}) + } return false, nil } @@ -1763,22 +1773,15 @@ func (s *Service) cleanupOrphanedContainers(ctx context.Context, domainName stri for _, c := range allContainers { if (c.Name == expectedName || c.Name == expectedNewName || c.Name == expectedNextName) && c.ID != skipContainerID { - // The skipContainerID is the container we resolved as the active - // one serving traffic. Any other container with a matching name is - // an orphan from a previous (possibly interrupted) deploy. - // - // Running containers with the canonical name are protected — they - // could be legitimate if resolveExistingContainer found a temp - // container instead. Running temp containers (-new/-next) that are - // NOT the skip container are safe to remove: they are leftovers - // from failed deploys. - isCanonical := c.Name == expectedName - if isCanonical && c.Status == "running" { + // Skip any running container regardless of name — a running + // temp container may be actively serving traffic while the + // new container stabilizes, or may be from a concurrent deploy. + if c.Status == "running" { log.Debug(). Str(zerowrap.FieldEntityID, c.ID). Str("container_name", c.Name). Str(zerowrap.FieldStatus, c.Status). - Msg("skipping running canonical container during orphan cleanup") + Msg("skipping running container during orphan cleanup") continue } diff --git a/internal/usecase/registry/service.go b/internal/usecase/registry/service.go index 3c86f873..e8e90815 100644 --- a/internal/usecase/registry/service.go +++ b/internal/usecase/registry/service.go @@ -54,8 +54,13 @@ func NewService( // SuppressDeployEvent marks an image name to skip image.pushed events. // The suppression auto-expires after 2 minutes to prevent leaks. func (s *Service) SuppressDeployEvent(imageName string) { - timer := time.AfterFunc(2*time.Minute, func() { - s.suppressedImages.Delete(imageName) + var timer *time.Timer + timer = time.AfterFunc(2*time.Minute, func() { + // Only delete if this timer is still the current one, preventing an + // old timer's callback from removing a newer suppression entry. + if v, ok := s.suppressedImages.Load(imageName); ok && v == timer { + s.suppressedImages.Delete(imageName) + } }) if existing, loaded := s.suppressedImages.LoadOrStore(imageName, timer); loaded { existing.(*time.Timer).Stop() @@ -70,6 +75,40 @@ func (s *Service) ClearDeployEventSuppression(imageName string) { } } +// ExtractImageName returns just the repository path of a container image +// reference, stripping any registry host prefix, tag, and digest. +// Examples: +// +// "reg.example.com/team/my-app:latest" -> "team/my-app" +// "reg.example.com/my-app@sha256:abc" -> "my-app" +// "my-app:v1.2" -> "my-app" +func ExtractImageName(imageRef string) string { + name := imageRef + // Strip digest + if idx := strings.Index(name, "@"); idx != -1 { + name = name[:idx] + } + // Strip tag + // Find the last colon, but only strip it if it comes after any slash + // (to avoid treating a port number in the host as a tag). + if idx := strings.LastIndex(name, ":"); idx != -1 { + slashIdx := strings.LastIndex(name, "/") + if idx > slashIdx { + name = name[:idx] + } + } + // Strip registry host: if the first segment contains a dot or colon it is + // a registry hostname; remove it. + parts := strings.SplitN(name, "/", 2) + if len(parts) == 2 { + host := parts[0] + if strings.ContainsAny(host, ".:") || host == "localhost" { + name = parts[1] + } + } + return name +} + // IsDeployEventSuppressed checks if deploy events are suppressed for an image. func (s *Service) IsDeployEventSuppressed(imageName string) bool { _, exists := s.suppressedImages.Load(imageName) From 7d1dad7e4ec59fa3ddf459fc4f81a11d0983034b Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 09:59:08 +0100 Subject: [PATCH 19/26] fix: normalize suppression keys in registry suppression methods --- internal/usecase/registry/service.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/internal/usecase/registry/service.go b/internal/usecase/registry/service.go index e8e90815..66602856 100644 --- a/internal/usecase/registry/service.go +++ b/internal/usecase/registry/service.go @@ -54,6 +54,11 @@ func NewService( // SuppressDeployEvent marks an image name to skip image.pushed events. // The suppression auto-expires after 2 minutes to prevent leaks. func (s *Service) SuppressDeployEvent(imageName string) { + imageName = ExtractImageName(strings.TrimSpace(imageName)) + if imageName == "" { + return + } + var timer *time.Timer timer = time.AfterFunc(2*time.Minute, func() { // Only delete if this timer is still the current one, preventing an @@ -70,6 +75,11 @@ func (s *Service) SuppressDeployEvent(imageName string) { // ClearDeployEventSuppression removes event suppression for an image. func (s *Service) ClearDeployEventSuppression(imageName string) { + imageName = ExtractImageName(strings.TrimSpace(imageName)) + if imageName == "" { + return + } + if v, loaded := s.suppressedImages.LoadAndDelete(imageName); loaded { v.(*time.Timer).Stop() } @@ -111,6 +121,11 @@ func ExtractImageName(imageRef string) string { // IsDeployEventSuppressed checks if deploy events are suppressed for an image. func (s *Service) IsDeployEventSuppressed(imageName string) bool { + imageName = ExtractImageName(strings.TrimSpace(imageName)) + if imageName == "" { + return false + } + _, exists := s.suppressedImages.Load(imageName) return exists } From 87a4f062e666009729226ab902e3b9e931b1d743 Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 09:59:41 +0100 Subject: [PATCH 20/26] fix: context-aware sleep in TCP and HTTP probe retry loops --- internal/usecase/container/readiness.go | 26 +++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/internal/usecase/container/readiness.go b/internal/usecase/container/readiness.go index 76b85125..625f5dff 100644 --- a/internal/usecase/container/readiness.go +++ b/internal/usecase/container/readiness.go @@ -46,9 +46,20 @@ func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { log.Debug().Err(err).Str("addr", addr).Int("attempt", attempts).Msg("TCP probe attempt failed") } + sleepInterval := 500 * time.Millisecond + if deadline, ok := ctx.Deadline(); ok { + if rem := time.Until(deadline); rem < sleepInterval { + sleepInterval = rem + } + } + if sleepInterval <= 0 { + return ctx.Err() + } + t := time.NewTimer(sleepInterval) select { - case <-time.After(500 * time.Millisecond): + case <-t.C: case <-ctx.Done(): + t.Stop() return ctx.Err() } } @@ -95,9 +106,20 @@ func httpProbe(ctx context.Context, url string, timeout time.Duration) error { } } + sleepInterval := time.Second + if deadline, ok := ctx.Deadline(); ok { + if rem := time.Until(deadline); rem < sleepInterval { + sleepInterval = rem + } + } + if sleepInterval <= 0 { + return ctx.Err() + } + t := time.NewTimer(sleepInterval) select { - case <-time.After(time.Second): + case <-t.C: case <-ctx.Done(): + t.Stop() return ctx.Err() } } From 593697c919d230278097345c0bf3e9094ef1b6e9 Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 10:00:12 +0100 Subject: [PATCH 21/26] fix: use SO_REUSEADDR in delayed listener test to prevent TIME_WAIT flakiness --- internal/usecase/container/readiness_test.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/internal/usecase/container/readiness_test.go b/internal/usecase/container/readiness_test.go index 6f995f95..811ba9ff 100644 --- a/internal/usecase/container/readiness_test.go +++ b/internal/usecase/container/readiness_test.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httptest" "sync/atomic" + "syscall" "testing" "time" @@ -67,7 +68,14 @@ func TestTCPProbe_DelayedListener(t *testing.T) { // Re-open after 1 second go func() { time.Sleep(time.Second) - newLn, err := net.Listen("tcp", addr) + lc := net.ListenConfig{ + Control: func(network, address string, c syscall.RawConn) error { + return c.Control(func(fd uintptr) { + _ = syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_REUSEADDR, 1) + }) + }, + } + newLn, err := lc.Listen(context.Background(), "tcp", addr) if err != nil { t.Logf("delayed listener failed to bind: %v", err) return From fae472996e5b3edea32d1e693a5fb631329a868c Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 10:00:49 +0100 Subject: [PATCH 22/26] refactor: use parseResponse in DeployIntent for consistent error handling --- internal/adapters/in/cli/remote/client.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/internal/adapters/in/cli/remote/client.go b/internal/adapters/in/cli/remote/client.go index 49644eaa..8eaa3ba6 100644 --- a/internal/adapters/in/cli/remote/client.go +++ b/internal/adapters/in/cli/remote/client.go @@ -731,11 +731,7 @@ func (c *Client) DeployIntent(ctx context.Context, imageName string) error { if err != nil { return err } - defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("deploy-intent failed: %s", resp.Status) - } - return nil + return parseResponse(resp, nil) } // Restart API From a2cc8dcf4cfbccb83c0767f03a2f6ab12d127e36 Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 10:02:15 +0100 Subject: [PATCH 23/26] refactor: extract testMinDelayConfig helper in container service tests --- internal/usecase/container/service_test.go | 70 +++++++--------------- 1 file changed, 20 insertions(+), 50 deletions(-) diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index da71d893..9ae0fa4d 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -25,6 +25,16 @@ func testContext() context.Context { return zerowrap.WithCtx(context.Background(), zerowrap.Default()) } +// testMinDelayConfig returns a Config with all timing delays set to 1ms, +// suitable for unit tests that don't want to wait for real timeouts. +func testMinDelayConfig() Config { + return Config{ + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + StabilizationDelay: time.Millisecond, + } +} + func setupMetricsTest(t *testing.T) (*telemetry.Metrics, *sdkmetric.ManualReader) { t.Helper() @@ -493,11 +503,7 @@ func TestService_Deploy_ResolvesExistingFromRuntime_WhenMemoryStale(t *testing.T eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -593,11 +599,7 @@ func TestService_Deploy_SkipRedundantDeploy_GetImageIDError(t *testing.T) { envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1839,11 +1841,7 @@ func TestService_Deploy_OrphanCleanupRemovesStaleNewContainer(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -1913,11 +1911,7 @@ func TestService_Deploy_TrackedTempContainerUsesAlternateTempName(t *testing.T) eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -2104,11 +2098,7 @@ func TestService_Deploy_ConcurrentSameDomain(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -2244,11 +2234,7 @@ func TestService_Deploy_CacheInvalidationBeforeOldContainerStop(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -2325,11 +2311,7 @@ func TestService_Deploy_NilCacheInvalidator(t *testing.T) { envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() // Intentionally NOT setting cache invalidator svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2484,11 +2466,7 @@ func TestService_Deploy_DoesNotSkipWhenImageIDDiffers(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) ctx := testContext() @@ -2587,11 +2565,7 @@ func TestService_Deploy_SkipRedundantDeploy_ContainerNotRunning(t *testing.T) { envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2654,11 +2628,7 @@ func TestService_Deploy_OrphanCleanup_NeverKillsRunningCanonical(t *testing.T) { envLoader := mocks.NewMockEnvLoader(t) eventBus := mocks.NewMockEventPublisher(t) - config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, - } + config := testMinDelayConfig() svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() From bf45a073aa4223df1031c06d9f3c2f4026b56ef5 Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 14:05:33 +0100 Subject: [PATCH 24/26] refactor: split deploy coordination from registry boundary --- .../adapters/in/cli/controlplane_local.go | 16 ++- internal/adapters/in/http/admin/handler.go | 9 +- internal/boundaries/in/deploy.go | 10 ++ .../in/mocks/mock_deploy_coordinator.go | 110 ++++++++++++++++++ internal/boundaries/in/registry.go | 4 - 5 files changed, 140 insertions(+), 9 deletions(-) create mode 100644 internal/boundaries/in/deploy.go create mode 100644 internal/boundaries/in/mocks/mock_deploy_coordinator.go diff --git a/internal/adapters/in/cli/controlplane_local.go b/internal/adapters/in/cli/controlplane_local.go index e60941b4..2c051381 100644 --- a/internal/adapters/in/cli/controlplane_local.go +++ b/internal/adapters/in/cli/controlplane_local.go @@ -18,6 +18,7 @@ type localControlPlane struct { containerSvc in.ContainerService backupSvc in.BackupService registrySvc in.RegistryService + deployCoord in.DeployCoordinator healthSvc in.HealthService logSvc in.LogService } @@ -27,12 +28,21 @@ func NewLocalControlPlane(kernel *app.Kernel) ControlPlane { return &localControlPlane{} } + registrySvc := kernel.Registry() + var deployCoord in.DeployCoordinator + if registrySvc != nil { + if coordinator, ok := any(registrySvc).(in.DeployCoordinator); ok { + deployCoord = coordinator + } + } + return &localControlPlane{ configSvc: kernel.Config(), secretSvc: kernel.Secrets(), containerSvc: kernel.Container(), backupSvc: kernel.Backup(), - registrySvc: kernel.Registry(), + registrySvc: registrySvc, + deployCoord: deployCoord, healthSvc: kernel.Health(), logSvc: kernel.Logs(), } @@ -224,8 +234,8 @@ func (l *localControlPlane) Reload(_ context.Context) error { } func (l *localControlPlane) DeployIntent(_ context.Context, imageName string) error { - if l.registrySvc != nil { - l.registrySvc.SuppressDeployEvent(imageName) + if l.deployCoord != nil { + l.deployCoord.SuppressDeployEvent(imageName) } return nil } diff --git a/internal/adapters/in/http/admin/handler.go b/internal/adapters/in/http/admin/handler.go index 5aa8079c..443f1468 100644 --- a/internal/adapters/in/http/admin/handler.go +++ b/internal/adapters/in/http/admin/handler.go @@ -28,6 +28,11 @@ const maxAdminRequestSize = 1 << 20 // 1MB // maxLogLines is the maximum allowed number of log lines that can be requested. const maxLogLines = 10000 +type registryDeployService interface { + in.RegistryService + in.DeployCoordinator +} + // Handler implements the HTTP handler for the admin API. type Handler struct { configSvc in.ConfigService @@ -38,7 +43,7 @@ type Handler struct { healthSvc in.HealthService secretSvc in.SecretService logSvc in.LogService - registrySvc in.RegistryService + registrySvc registryDeployService eventBus out.EventPublisher log zerowrap.Logger } @@ -130,7 +135,7 @@ func NewHandler( healthSvc in.HealthService, secretSvc in.SecretService, logSvc in.LogService, - registrySvc in.RegistryService, + registrySvc registryDeployService, eventBus out.EventPublisher, log zerowrap.Logger, backupSvc in.BackupService, diff --git a/internal/boundaries/in/deploy.go b/internal/boundaries/in/deploy.go new file mode 100644 index 00000000..d75c4966 --- /dev/null +++ b/internal/boundaries/in/deploy.go @@ -0,0 +1,10 @@ +package in + +// DeployCoordinator defines deploy coordination operations. +// +// These methods let the CLI/admin API suppress image-pushed deploy events +// while an explicit deploy flow is in progress. +type DeployCoordinator interface { + SuppressDeployEvent(imageName string) + ClearDeployEventSuppression(imageName string) +} diff --git a/internal/boundaries/in/mocks/mock_deploy_coordinator.go b/internal/boundaries/in/mocks/mock_deploy_coordinator.go new file mode 100644 index 00000000..77f7350d --- /dev/null +++ b/internal/boundaries/in/mocks/mock_deploy_coordinator.go @@ -0,0 +1,110 @@ +// Code generated by mockery; DO NOT EDIT. +// github.com/vektra/mockery +// template: testify + +package mocks + +import ( + mock "github.com/stretchr/testify/mock" +) + +// NewMockDeployCoordinator creates a new instance of MockDeployCoordinator. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewMockDeployCoordinator(t interface { + mock.TestingT + Cleanup(func()) +}) *MockDeployCoordinator { + mock := &MockDeployCoordinator{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} + +// MockDeployCoordinator is an autogenerated mock type for the DeployCoordinator type. +type MockDeployCoordinator struct { + mock.Mock +} + +type MockDeployCoordinator_Expecter struct { + mock *mock.Mock +} + +func (_m *MockDeployCoordinator) EXPECT() *MockDeployCoordinator_Expecter { + return &MockDeployCoordinator_Expecter{mock: &_m.Mock} +} + +// ClearDeployEventSuppression provides a mock function for the type MockDeployCoordinator. +func (_mock *MockDeployCoordinator) ClearDeployEventSuppression(imageName string) { + _mock.Called(imageName) +} + +// MockDeployCoordinator_ClearDeployEventSuppression_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ClearDeployEventSuppression'. +type MockDeployCoordinator_ClearDeployEventSuppression_Call struct { + *mock.Call +} + +// ClearDeployEventSuppression is a helper method to define mock.On call. +// - imageName string +func (_e *MockDeployCoordinator_Expecter) ClearDeployEventSuppression(imageName interface{}) *MockDeployCoordinator_ClearDeployEventSuppression_Call { + return &MockDeployCoordinator_ClearDeployEventSuppression_Call{Call: _e.mock.On("ClearDeployEventSuppression", imageName)} +} + +func (_c *MockDeployCoordinator_ClearDeployEventSuppression_Call) Run(run func(imageName string)) *MockDeployCoordinator_ClearDeployEventSuppression_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run(arg0) + }) + return _c +} + +func (_c *MockDeployCoordinator_ClearDeployEventSuppression_Call) Return() *MockDeployCoordinator_ClearDeployEventSuppression_Call { + _c.Call.Return() + return _c +} + +func (_c *MockDeployCoordinator_ClearDeployEventSuppression_Call) RunAndReturn(run func(imageName string)) *MockDeployCoordinator_ClearDeployEventSuppression_Call { + _c.Run(run) + return _c +} + +// SuppressDeployEvent provides a mock function for the type MockDeployCoordinator. +func (_mock *MockDeployCoordinator) SuppressDeployEvent(imageName string) { + _mock.Called(imageName) +} + +// MockDeployCoordinator_SuppressDeployEvent_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'SuppressDeployEvent'. +type MockDeployCoordinator_SuppressDeployEvent_Call struct { + *mock.Call +} + +// SuppressDeployEvent is a helper method to define mock.On call. +// - imageName string +func (_e *MockDeployCoordinator_Expecter) SuppressDeployEvent(imageName interface{}) *MockDeployCoordinator_SuppressDeployEvent_Call { + return &MockDeployCoordinator_SuppressDeployEvent_Call{Call: _e.mock.On("SuppressDeployEvent", imageName)} +} + +func (_c *MockDeployCoordinator_SuppressDeployEvent_Call) Run(run func(imageName string)) *MockDeployCoordinator_SuppressDeployEvent_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + run(arg0) + }) + return _c +} + +func (_c *MockDeployCoordinator_SuppressDeployEvent_Call) Return() *MockDeployCoordinator_SuppressDeployEvent_Call { + _c.Call.Return() + return _c +} + +func (_c *MockDeployCoordinator_SuppressDeployEvent_Call) RunAndReturn(run func(imageName string)) *MockDeployCoordinator_SuppressDeployEvent_Call { + _c.Run(run) + return _c +} diff --git a/internal/boundaries/in/registry.go b/internal/boundaries/in/registry.go index c65c8993..9eb8746c 100644 --- a/internal/boundaries/in/registry.go +++ b/internal/boundaries/in/registry.go @@ -29,8 +29,4 @@ type RegistryService interface { // Tag operations ListTags(ctx context.Context, name string) ([]string, error) ListRepositories(ctx context.Context) ([]string, error) - - // Deploy event suppression (prevents double-deploy when CLI manages the push) - SuppressDeployEvent(imageName string) - ClearDeployEventSuppression(imageName string) } From 08d30e7015be71c80725c64e14a0a9fbda7f50ab Mon Sep 17 00:00:00 2001 From: brice Date: Wed, 4 Mar 2026 20:26:42 +0100 Subject: [PATCH 25/26] fix: address code review findings in DeployIntent, probe sleep, drain delay, and docs - DeployIntent: validate imageName is non-empty before sending request - tcpProbe/httpProbe: clamp sleepInterval to probe-specific deadline (not just ctx deadline) so probes cannot oversleep past their own timeout; return the probe timeout error (not ctx.Err()) when the probe deadline expires - testMinDelayConfig: set DrainDelayConfigured: true so waitForDrain honours the 1ms drain delay in tests instead of falling back to the 2s default - docs/reference/docker-labels.md: document gordon.health, gordon.domains, gordon.port, and gordon.env-file image labels alongside gordon.proxy.port --- docs/reference/docker-labels.md | 22 ++++++++++++++++- internal/adapters/in/cli/remote/client.go | 3 +++ internal/usecase/container/readiness.go | 28 +++++++++++++++------- internal/usecase/container/service_test.go | 7 +++--- 4 files changed, 48 insertions(+), 12 deletions(-) diff --git a/docs/reference/docker-labels.md b/docs/reference/docker-labels.md index f86ad315..a1bec497 100644 --- a/docs/reference/docker-labels.md +++ b/docs/reference/docker-labels.md @@ -41,7 +41,27 @@ Labels you can set in your Dockerfile: | Label | Example | Description | |-------|---------|-------------| -| `gordon.proxy.port` | `"3000"` | Port to proxy HTTP traffic to | +| `gordon.domains` | `"app.example.com,www.app.example.com"` | Comma-separated domains for auto-route | +| `gordon.port` | `"3000"` | Port to proxy HTTP traffic to | +| `gordon.proxy.port` | `"3000"` | Port to proxy HTTP traffic to (legacy alias for `gordon.port`) | +| `gordon.health` | `"/healthz"` | HTTP health check endpoint path for readiness probing | +| `gordon.env-file` | `"/app/.env.example"` | Path to env template file inside the image | + +### Health Check Label + +When `gordon.health` is set, Gordon performs HTTP GET requests to the specified +path during deployment and waits for a 2xx or 3xx response before routing traffic +to the new container: + +```dockerfile +FROM node:20-alpine +LABEL gordon.health="/api/health" +EXPOSE 3000 +CMD ["node", "server.js"] +``` + +Gordon probes `http://:3000/api/health` until it gets a successful +response or the `deploy.http_probe_timeout` is reached. ### Proxy Port Label diff --git a/internal/adapters/in/cli/remote/client.go b/internal/adapters/in/cli/remote/client.go index 8eaa3ba6..d6797012 100644 --- a/internal/adapters/in/cli/remote/client.go +++ b/internal/adapters/in/cli/remote/client.go @@ -727,6 +727,9 @@ func (c *Client) Deploy(ctx context.Context, deployDomain string) (*DeployResult // DeployIntent tells the server that a CLI-managed push is about to happen, // suppressing event-based deploys for this image. func (c *Client) DeployIntent(ctx context.Context, imageName string) error { + if strings.TrimSpace(imageName) == "" { + return fmt.Errorf("image name cannot be empty") + } resp, err := c.requestWithRetry(ctx, http.MethodPost, "/deploy-intent/"+url.PathEscape(imageName), nil) if err != nil { return err diff --git a/internal/usecase/container/readiness.go b/internal/usecase/container/readiness.go index 625f5dff..2ee260bf 100644 --- a/internal/usecase/container/readiness.go +++ b/internal/usecase/container/readiness.go @@ -47,13 +47,19 @@ func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { } sleepInterval := 500 * time.Millisecond - if deadline, ok := ctx.Deadline(); ok { - if rem := time.Until(deadline); rem < sleepInterval { - sleepInterval = rem + if remProbe := time.Until(deadline); remProbe < sleepInterval { + sleepInterval = remProbe + } + if ctxDeadline, ok := ctx.Deadline(); ok { + if remCtx := time.Until(ctxDeadline); remCtx < sleepInterval { + sleepInterval = remCtx } } if sleepInterval <= 0 { - return ctx.Err() + if ctx.Err() != nil { + return ctx.Err() + } + return fmt.Errorf("TCP probe timeout after %s: %s not reachable (attempts=%d, last_error=%v)", timeout, addr, attempts, lastErr) } t := time.NewTimer(sleepInterval) select { @@ -107,13 +113,19 @@ func httpProbe(ctx context.Context, url string, timeout time.Duration) error { } sleepInterval := time.Second - if deadline, ok := ctx.Deadline(); ok { - if rem := time.Until(deadline); rem < sleepInterval { - sleepInterval = rem + if remProbe := time.Until(deadline); remProbe < sleepInterval { + sleepInterval = remProbe + } + if ctxDeadline, ok := ctx.Deadline(); ok { + if remCtx := time.Until(ctxDeadline); remCtx < sleepInterval { + sleepInterval = remCtx } } if sleepInterval <= 0 { - return ctx.Err() + if ctx.Err() != nil { + return ctx.Err() + } + return fmt.Errorf("HTTP probe timeout after %s: last status %d from %s", timeout, lastStatus, url) } t := time.NewTimer(sleepInterval) select { diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index 9ae0fa4d..15a4f707 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -29,9 +29,10 @@ func testContext() context.Context { // suitable for unit tests that don't want to wait for real timeouts. func testMinDelayConfig() Config { return Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, } } From 11dcb09de6e771f23b900129d69b31b2898ae156 Mon Sep 17 00:00:00 2001 From: brice Date: Thu, 5 Mar 2026 06:08:51 +0100 Subject: [PATCH 26/26] refactor: trim DeployIntent input, extract probeLoop helper, fix test drain configs - DeployIntent: assign trimmed imageName before validation so url.PathEscape never encodes leading/trailing whitespace - readiness: extract probeLoop orchestration helper shared by tcpProbe and httpProbe, reducing cyclomatic complexity below the gocyclo threshold (15) - service_test: add DrainDelayConfigured: true to 14 test config blocks that were missing it, preventing silent 2s drain default --- internal/adapters/in/cli/remote/client.go | 3 +- internal/usecase/container/readiness.go | 135 ++++++++++----------- internal/usecase/container/service_test.go | 90 ++++++++------ 3 files changed, 121 insertions(+), 107 deletions(-) diff --git a/internal/adapters/in/cli/remote/client.go b/internal/adapters/in/cli/remote/client.go index d6797012..25c6a0c7 100644 --- a/internal/adapters/in/cli/remote/client.go +++ b/internal/adapters/in/cli/remote/client.go @@ -727,7 +727,8 @@ func (c *Client) Deploy(ctx context.Context, deployDomain string) (*DeployResult // DeployIntent tells the server that a CLI-managed push is about to happen, // suppressing event-based deploys for this image. func (c *Client) DeployIntent(ctx context.Context, imageName string) error { - if strings.TrimSpace(imageName) == "" { + imageName = strings.TrimSpace(imageName) + if imageName == "" { return fmt.Errorf("image name cannot be empty") } resp, err := c.requestWithRetry(ctx, http.MethodPost, "/deploy-intent/"+url.PathEscape(imageName), nil) diff --git a/internal/usecase/container/readiness.go b/internal/usecase/container/readiness.go index 2ee260bf..44dafe63 100644 --- a/internal/usecase/container/readiness.go +++ b/internal/usecase/container/readiness.go @@ -2,6 +2,7 @@ package container import ( "context" + "errors" "fmt" "net" "net/http" @@ -10,43 +11,40 @@ import ( "github.com/bnema/zerowrap" ) -// tcpProbe attempts a TCP connection to addr, retrying every 500ms until -// success or timeout. This is the universal fallback readiness check — -// it verifies the process is at least accepting connections. -func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { - log := zerowrap.FromCtx(ctx) - deadline := time.Now().Add(timeout) - dialer := &net.Dialer{} - attempts := 0 - var lastErr error +var errProbeTimeout = errors.New("probe timeout") + +func probeLoop( + ctx context.Context, + deadline time.Time, + baseSleep time.Duration, + defaultAttemptTimeout time.Duration, + work func(attemptCtx context.Context) (success bool, err error), +) error { for { if err := ctx.Err(); err != nil { return err } if time.Now().After(deadline) { - return fmt.Errorf("TCP probe timeout after %s: %s not reachable (attempts=%d, last_error=%v)", timeout, addr, attempts, lastErr) + return errProbeTimeout } remaining := time.Until(deadline) - attemptTimeout := time.Second + attemptTimeout := defaultAttemptTimeout if remaining < attemptTimeout { attemptTimeout = remaining } + attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout) - conn, err := dialer.DialContext(attemptCtx, "tcp", addr) + success, err := work(attemptCtx) cancel() - attempts++ - if err == nil { - conn.Close() - log.Debug().Str("addr", addr).Int("attempts", attempts).Msg("TCP probe connected") - return nil + if err != nil { + return err } - lastErr = err - if attempts <= 3 || attempts%10 == 0 { - log.Debug().Err(err).Str("addr", addr).Int("attempt", attempts).Msg("TCP probe attempt failed") + if success { + return nil } - sleepInterval := 500 * time.Millisecond + sleepInterval := baseSleep if remProbe := time.Until(deadline); remProbe < sleepInterval { sleepInterval = remProbe } @@ -59,8 +57,9 @@ func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { if ctx.Err() != nil { return ctx.Err() } - return fmt.Errorf("TCP probe timeout after %s: %s not reachable (attempts=%d, last_error=%v)", timeout, addr, attempts, lastErr) + return errProbeTimeout } + t := time.NewTimer(sleepInterval) select { case <-t.C: @@ -71,6 +70,36 @@ func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { } } +// tcpProbe attempts a TCP connection to addr, retrying every 500ms until +// success or timeout. This is the universal fallback readiness check — +// it verifies the process is at least accepting connections. +func tcpProbe(ctx context.Context, addr string, timeout time.Duration) error { + log := zerowrap.FromCtx(ctx) + deadline := time.Now().Add(timeout) + dialer := &net.Dialer{} + attempts := 0 + var lastErr error + err := probeLoop(ctx, deadline, 500*time.Millisecond, time.Second, func(attemptCtx context.Context) (bool, error) { + conn, err := dialer.DialContext(attemptCtx, "tcp", addr) + attempts++ + if err == nil { + conn.Close() + log.Debug().Str("addr", addr).Int("attempts", attempts).Msg("TCP probe connected") + return true, nil + } + + lastErr = err + if attempts <= 3 || attempts%10 == 0 { + log.Debug().Err(err).Str("addr", addr).Int("attempt", attempts).Msg("TCP probe attempt failed") + } + return false, nil + }) + if errors.Is(err, errProbeTimeout) { + return fmt.Errorf("TCP probe timeout after %s: %s not reachable (attempts=%d, last_error=%v)", timeout, addr, attempts, lastErr) + } + return err +} + // httpProbe performs HTTP GET requests to url, retrying every 1s until a // 2xx/3xx response or timeout. Used when gordon.health label is set. func httpProbe(ctx context.Context, url string, timeout time.Duration) error { @@ -82,57 +111,27 @@ func httpProbe(ctx context.Context, url string, timeout time.Duration) error { } deadline := time.Now().Add(timeout) var lastStatus int - - for { - if err := ctx.Err(); err != nil { - return err - } - if time.Now().After(deadline) { - return fmt.Errorf("HTTP probe timeout after %s: last status %d from %s", timeout, lastStatus, url) - } - - remaining := time.Until(deadline) - attemptTimeout := 2 * time.Second - if remaining < attemptTimeout { - attemptTimeout = remaining - } - req, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + err := probeLoop(ctx, deadline, time.Second, 2*time.Second, func(attemptCtx context.Context) (bool, error) { + req, reqErr := http.NewRequestWithContext(attemptCtx, http.MethodGet, url, nil) if reqErr != nil { - return reqErr + return false, reqErr } - attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout) - req = req.WithContext(attemptCtx) + resp, err := client.Do(req) - cancel() - if err == nil { - lastStatus = resp.StatusCode - resp.Body.Close() - if lastStatus >= 200 && lastStatus < 400 { - return nil - } + if err != nil { + return false, nil } - sleepInterval := time.Second - if remProbe := time.Until(deadline); remProbe < sleepInterval { - sleepInterval = remProbe - } - if ctxDeadline, ok := ctx.Deadline(); ok { - if remCtx := time.Until(ctxDeadline); remCtx < sleepInterval { - sleepInterval = remCtx - } - } - if sleepInterval <= 0 { - if ctx.Err() != nil { - return ctx.Err() - } - return fmt.Errorf("HTTP probe timeout after %s: last status %d from %s", timeout, lastStatus, url) - } - t := time.NewTimer(sleepInterval) - select { - case <-t.C: - case <-ctx.Done(): - t.Stop() - return ctx.Err() + lastStatus = resp.StatusCode + resp.Body.Close() + if lastStatus >= 200 && lastStatus < 400 { + return true, nil } + + return false, nil + }) + if errors.Is(err, errProbeTimeout) { + return fmt.Errorf("HTTP probe timeout after %s: last status %d from %s", timeout, lastStatus, url) } + return err } diff --git a/internal/usecase/container/service_test.go b/internal/usecase/container/service_test.go index 15a4f707..c1a9feb9 100644 --- a/internal/usecase/container/service_test.go +++ b/internal/usecase/container/service_test.go @@ -195,10 +195,11 @@ func TestService_Deploy_Success(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - NetworkIsolation: false, - VolumeAutoCreate: false, - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + NetworkIsolation: false, + VolumeAutoCreate: false, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -275,10 +276,11 @@ func TestService_Deploy_ReadinessRecoveryWindow_AllowsTransientFlap(t *testing.T eventBus := mocks.NewMockEventPublisher(t) config := Config{ - NetworkIsolation: false, - VolumeAutoCreate: false, - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + NetworkIsolation: false, + VolumeAutoCreate: false, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -426,9 +428,10 @@ func TestService_Deploy_ReplacesExistingContainer(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -657,10 +660,11 @@ func TestService_Deploy_WithNetworkIsolation(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - NetworkIsolation: true, - NetworkPrefix: "gordon", - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + NetworkIsolation: true, + NetworkPrefix: "gordon", + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -712,10 +716,11 @@ func TestService_Deploy_WithVolumeAutoCreate(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - VolumeAutoCreate: true, - VolumePrefix: "gordon", - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + VolumeAutoCreate: true, + VolumePrefix: "gordon", + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1433,6 +1438,7 @@ func TestService_Deploy_InternalDeployForcesPull(t *testing.T) { InternalRegistryPassword: "secret", ReadinessDelay: time.Millisecond, DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -1503,8 +1509,9 @@ func TestService_AutoStart_StartsNewContainers(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1549,8 +1556,9 @@ func TestService_AutoStart_SkipsExistingContainers(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -1623,10 +1631,11 @@ func TestService_AutoStart_UsesInternalDeployContext(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - RegistryDomain: "reg.example.com", - RegistryPort: 5000, - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + RegistryDomain: "reg.example.com", + RegistryPort: 5000, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -1674,9 +1683,10 @@ func TestService_Deploy_OrphanCleanupSkipsTrackedContainer(t *testing.T) { cacheInvalidator := mocks.NewMockProxyCacheInvalidator(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, - StabilizationDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, + StabilizationDelay: time.Millisecond, } svc := NewService(runtime, envLoader, eventBus, nil, config) svc.SetProxyCacheInvalidator(cacheInvalidator) @@ -1765,8 +1775,9 @@ func TestService_Deploy_OrphanCleanupRemovesTrueOrphans(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, // Minimal delay for tests - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, // Minimal delay for tests + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2207,8 +2218,9 @@ func TestService_Deploy_ContextCancellation(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) @@ -2360,8 +2372,9 @@ func TestService_Deploy_SkipsRedundantDeploy(t *testing.T) { eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext() @@ -2413,8 +2426,9 @@ func TestService_Deploy_SkipsRedundantDeploy_WhenTrackedImageIDMissing(t *testin eventBus := mocks.NewMockEventPublisher(t) config := Config{ - ReadinessDelay: time.Millisecond, - DrainDelay: time.Millisecond, + ReadinessDelay: time.Millisecond, + DrainDelay: time.Millisecond, + DrainDelayConfigured: true, } svc := NewService(runtime, envLoader, eventBus, nil, config) ctx := testContext()