From 3180b1dca7eec50e467f675de6c23835c321415c Mon Sep 17 00:00:00 2001
From: Hiro Tamada
Date: Wed, 21 Jan 2026 11:31:07 -0500
Subject: [PATCH 1/2] fix: wait for image conversion before reporting build ready (KERNEL-863)

Fixes a race condition where build status would transition to "ready"
before the image conversion completed, causing instance creation to
fail.

The registry's triggerConversion() runs asynchronously after returning
201 to the builder. This meant the builder could report success and the
build manager would set status="ready" while image conversion was still
in progress.

Changes:
- Add imageManager dependency to build manager
- Add waitForImageReady() that polls until image status is "ready"
- Call waitForImageReady() before setting build to "ready" status
- If image conversion fails/times out, mark build as failed
---
 cmd/api/wire_gen.go        |   2 +-
 lib/builds/manager.go      |  55 ++++++++
 lib/builds/manager_test.go |  82 +++++++++++-
 lib/builds/race_test.go    | 255 +++++++++++++++++++++++++++++++++++++
 lib/providers/providers.go |   4 +-
 5 files changed, 394 insertions(+), 4 deletions(-)
 create mode 100644 lib/builds/race_test.go
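
Reviewer note (below the "---" fold, so ignored by git am): the failing
caller sequence this patch addresses looked roughly like the sketch
below. CreateBuild/WaitForBuild/BuildStatusReady appear elsewhere in
this series; the instance-side names (CreateInstance, InstanceSpec) are
hypothetical, for illustration only.

	build, _ := buildMgr.CreateBuild(ctx, req, source)
	status, _ := buildMgr.WaitForBuild(ctx, build.ID)
	if status == BuildStatusReady {
		// The registry may still be running triggerConversion() at this
		// point, so the image lookup during instance creation raced with
		// conversion and could fail with "not found" or a pending status.
		_, err := instanceMgr.CreateInstance(ctx, InstanceSpec{Image: build.ImageRef})
		_ = err // KERNEL-863: failed here before this fix
	}
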
diff --git a/cmd/api/wire_gen.go b/cmd/api/wire_gen.go
index 68f0531..5ac1bda 100644
--- a/cmd/api/wire_gen.go
+++ b/cmd/api/wire_gen.go
@@ -55,7 +55,7 @@ func initializeApp() (*application, func(), error) {
 	if err != nil {
 		return nil, nil, err
 	}
-	buildsManager, err := providers.ProvideBuildManager(paths, config, instancesManager, volumesManager, logger)
+	buildsManager, err := providers.ProvideBuildManager(paths, config, instancesManager, volumesManager, manager, logger)
 	if err != nil {
 		return nil, nil, err
 	}
diff --git a/lib/builds/manager.go b/lib/builds/manager.go
index 266cacb..5315347 100644
--- a/lib/builds/manager.go
+++ b/lib/builds/manager.go
@@ -88,6 +88,7 @@ type manager struct {
 	queue           *BuildQueue
 	instanceManager instances.Manager
 	volumeManager   volumes.Manager
+	imageManager    images.Manager
 	secretProvider  SecretProvider
 	tokenGenerator  *RegistryTokenGenerator
 	logger          *slog.Logger
@@ -105,6 +106,7 @@ func NewManager(
 	config Config,
 	instanceMgr instances.Manager,
 	volumeMgr volumes.Manager,
+	imageMgr images.Manager,
 	secretProvider SecretProvider,
 	logger *slog.Logger,
 	meter metric.Meter,
@@ -119,6 +121,7 @@ func NewManager(
 		queue:           NewBuildQueue(config.MaxConcurrentBuilds),
 		instanceManager: instanceMgr,
 		volumeManager:   volumeMgr,
+		imageManager:    imageMgr,
 		secretProvider:  secretProvider,
 		tokenGenerator:  NewRegistryTokenGenerator(config.RegistrySecret),
 		logger:          logger,
@@ -294,6 +297,20 @@ func (m *manager) runBuild(ctx context.Context, id string, req CreateBuildReques
 	m.logger.Info("build succeeded", "id", id, "digest", result.ImageDigest, "duration", duration)
 
 	imageRef := fmt.Sprintf("%s/builds/%s", m.config.RegistryURL, id)
+
+	// Wait for image to be ready before reporting build as complete.
+	// This fixes the race condition (KERNEL-863) where build reports "ready"
+	// but image conversion hasn't finished yet.
+	if err := m.waitForImageReady(ctx, id); err != nil {
+		m.logger.Error("image conversion failed after build", "id", id, "error", err, "duration", duration)
+		errMsg := fmt.Sprintf("image conversion failed: %v", err)
+		m.updateBuildComplete(id, StatusFailed, nil, &errMsg, &result.Provenance, &durationMS)
+		if m.metrics != nil {
+			m.metrics.RecordBuild(ctx, "failed", duration)
+		}
+		return
+	}
+
 	m.updateBuildComplete(id, StatusReady, &result.ImageDigest, nil, &result.Provenance, &durationMS)
 
 	// Update with image ref
@@ -640,6 +657,44 @@ func (m *manager) updateBuildComplete(id string, status string, digest *string,
 	m.notifyStatusChange(id, status)
 }
 
+// waitForImageReady polls the image manager until the build's image is ready.
+// This ensures that when a build reports "ready", the image is actually usable
+// for instance creation (fixes KERNEL-863 race condition).
+func (m *manager) waitForImageReady(ctx context.Context, id string) error {
+	imageRef := fmt.Sprintf("%s/builds/%s", m.config.RegistryURL, id)
+
+	// Poll for up to 60 seconds (image conversion is typically fast)
+	const maxAttempts = 120
+	const pollInterval = 500 * time.Millisecond
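+	// 120 attempts at a 500ms interval gives the 60s ceiling noted above;
+	// conversion normally completes within a few seconds, so most builds
+	// exit the polling loop below after only a handful of iterations.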
+
+	m.logger.Debug("waiting for image to be ready", "id", id, "image_ref", imageRef)
+
+	for attempt := 0; attempt < maxAttempts; attempt++ {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		img, err := m.imageManager.GetImage(ctx, imageRef)
+		if err == nil {
+			switch img.Status {
+			case images.StatusReady:
+				m.logger.Debug("image is ready", "id", id, "image_ref", imageRef, "attempts", attempt+1)
+				return nil
+			case images.StatusFailed:
+				return fmt.Errorf("image conversion failed")
+			case images.StatusPending, images.StatusPulling, images.StatusConverting:
+				// Still processing, continue polling
+			}
+		}
+		// Image not found or still processing, wait and retry
+		time.Sleep(pollInterval)
+	}
+
+	return fmt.Errorf("timeout waiting for image to be ready after %v", time.Duration(maxAttempts)*pollInterval)
+}
+
 // subscribeToStatus adds a subscriber channel for status updates on a build
 func (m *manager) subscribeToStatus(buildID string, ch chan BuildEvent) {
 	m.subscriberMu.Lock()
diff --git a/lib/builds/manager_test.go b/lib/builds/manager_test.go
index 3d19af1..d47bfa3 100644
--- a/lib/builds/manager_test.go
+++ b/lib/builds/manager_test.go
@@ -10,6 +10,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/kernel/hypeman/lib/images"
 	"github.com/kernel/hypeman/lib/instances"
 	"github.com/kernel/hypeman/lib/paths"
 	"github.com/kernel/hypeman/lib/resources"
@@ -219,8 +220,85 @@ func (m *mockSecretProvider) GetSecrets(ctx context.Context, secretIDs []string)
 	return make(map[string]string), nil
 }
 
+// mockImageManager implements images.Manager for testing
+type mockImageManager struct {
+	images      map[string]*images.Image
+	getImageErr error
+}
+
+func newMockImageManager() *mockImageManager {
+	return &mockImageManager{
+		images: make(map[string]*images.Image),
+	}
+}
+
+func (m *mockImageManager) ListImages(ctx context.Context) ([]images.Image, error) {
+	var result []images.Image
+	for _, img := range m.images {
+		result = append(result, *img)
+	}
+	return result, nil
+}
+
+func (m *mockImageManager) CreateImage(ctx context.Context, req images.CreateImageRequest) (*images.Image, error) {
+	img := &images.Image{
+		Name:   req.Name,
+		Status: images.StatusPending,
+	}
+	m.images[req.Name] = img
+	return img, nil
+}
+
+func (m *mockImageManager) ImportLocalImage(ctx context.Context, repo, reference, digest string) (*images.Image, error) {
+	img := &images.Image{
+		Name:   repo + ":" + reference,
+		Status: images.StatusReady,
+	}
+	m.images[img.Name] = img
+	return img, nil
+}
+
+func (m *mockImageManager) GetImage(ctx context.Context, name string) (*images.Image, error) {
+	if m.getImageErr != nil {
+		return nil, m.getImageErr
+	}
+	if img, ok := m.images[name]; ok {
+		return img, nil
+	}
+	return nil, images.ErrNotFound
+}
+
+func (m *mockImageManager) DeleteImage(ctx context.Context, name string) error {
+	delete(m.images, name)
+	return nil
+}
+
+func (m *mockImageManager) RecoverInterruptedBuilds() {}
+
+func (m *mockImageManager) TotalImageBytes(ctx context.Context) (int64, error) {
+	return 0, nil
+}
+
+func (m *mockImageManager) TotalOCICacheBytes(ctx context.Context) (int64, error) {
+	return 0, nil
+}
+
+// SetImageReady sets an image to ready status for testing
+func (m *mockImageManager) SetImageReady(name string) {
+	m.images[name] = &images.Image{
+		Name:   name,
+		Status: images.StatusReady,
+	}
+}
+
 // Test helper to create a manager with test paths and mocks
 func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolumeManager, string) {
+	mgr, instanceMgr, volumeMgr, _, tempDir := setupTestManagerWithImageMgr(t)
+	return mgr, instanceMgr, volumeMgr, tempDir
+}
+
+// setupTestManagerWithImageMgr returns the image manager for tests that need it
+func setupTestManagerWithImageMgr(t *testing.T) (*manager, *mockInstanceManager, *mockVolumeManager, *mockImageManager, string) {
 	t.Helper()
 
 	// Create temp directory for test data
@@ -236,6 +314,7 @@ func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolume
 	// Create mocks
 	instanceMgr := newMockInstanceManager()
 	volumeMgr := newMockVolumeManager()
+	imageMgr := newMockImageManager()
 	secretProvider := &mockSecretProvider{}
 
 	// Create config
@@ -257,13 +336,14 @@ func setupTestManager(t *testing.T) (*manager, *mockInstanceManager, *mockVolume
 		queue:             NewBuildQueue(config.MaxConcurrentBuilds),
 		instanceManager:   instanceMgr,
 		volumeManager:     volumeMgr,
+		imageManager:      imageMgr,
 		secretProvider:    secretProvider,
 		tokenGenerator:    NewRegistryTokenGenerator(config.RegistrySecret),
 		logger:            logger,
 		statusSubscribers: make(map[string][]chan BuildEvent),
 	}
 
-	return mgr, instanceMgr, volumeMgr, tempDir
+	return mgr, instanceMgr, volumeMgr, imageMgr, tempDir
 }
 
 func TestCreateBuild_Success(t *testing.T) {
diff --git a/lib/builds/race_test.go b/lib/builds/race_test.go
new file mode 100644
index 0000000..9d31243
--- /dev/null
+++ b/lib/builds/race_test.go
@@ -0,0 +1,255 @@
+package builds
+
+import (
+	"context"
+	"os"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/kernel/hypeman/lib/images"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestBuildImageAvailabilityRace demonstrates the race condition described in KERNEL-863:
+// After WaitForBuild() returns with BuildStatusReady, the image may not be immediately
+// available for instance creation because:
+// 1. Registry returns 201 to builder
+// 2. Registry calls triggerConversion() asynchronously
+// 3. Builder reports success, build status becomes "ready"
+// 4. But image conversion may still be in progress
+//
+// This test simulates the scenario where a build completes but the image
+// is not yet ready when the client tries to use it.
+func TestBuildImageAvailabilityRace(t *testing.T) {
+	// This test demonstrates the conceptual race condition.
+	// The actual fix requires changes to either:
+	// 1. Wait for image conversion before reporting build success
+	// 2. Add an image availability check endpoint
+	// 3. Have the builder verify image is pullable before reporting success
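+	//
+	// This patch takes approach 1: the build manager's waitForImageReady
+	// polls the image status and only flips the build to "ready" once the
+	// image is usable (see lib/builds/manager.go in this diff).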
+
+	t.Run("demonstrates async conversion race", func(t *testing.T) {
+		// Simulate the race: build reports ready but image conversion is async
+		var (
+			buildReady      = make(chan struct{})
+			imageConverted  = make(chan struct{})
+			conversionDelay = 100 * time.Millisecond
+		)
+
+		// Simulate registry receiving image and starting async conversion
+		go func() {
+			// Registry returns 201 immediately
+			close(buildReady)
+			// But conversion happens asynchronously with some delay
+			time.Sleep(conversionDelay)
+			close(imageConverted)
+		}()
+
+		// Simulate client waiting for build to be ready
+		<-buildReady
+
+		// Build is "ready" but image might not be converted yet
+		select {
+		case <-imageConverted:
+			// Image already converted - no race in this run
+			t.Log("Image was converted before we checked (no race this time)")
+		default:
+			// This demonstrates the race condition:
+			// Build is ready but image is not yet available
+			t.Log("RACE CONDITION: Build ready but image not yet converted")
+
+			// In the real system, instance creation would fail here
+			// because imageManager.GetImage() would return not found or pending status
+		}
+
+		// Wait for conversion to complete
+		<-imageConverted
+		t.Log("Image conversion completed")
+	})
+}
+
+// TestWaitForImageReady_Success tests that waitForImageReady succeeds when image becomes ready
+func TestWaitForImageReady_Success(t *testing.T) {
+	mgr, _, _, imageMgr, tempDir := setupTestManagerWithImageMgr(t)
+	defer os.RemoveAll(tempDir)
+
+	ctx := context.Background()
+	buildID := "test-build-123"
+
+	// Set the image to ready in the mock
+	imageRef := mgr.config.RegistryURL + "/builds/" + buildID
+	imageMgr.SetImageReady(imageRef)
+
+	// waitForImageReady should succeed immediately
+	err := mgr.waitForImageReady(ctx, buildID)
+	require.NoError(t, err)
+}
+
+// TestWaitForImageReady_WaitsForConversion tests that waitForImageReady polls until ready
func TestWaitForImageReady_WaitsForConversion(t *testing.T) {
+	mgr, _, _, imageMgr, tempDir := setupTestManagerWithImageMgr(t)
+	defer os.RemoveAll(tempDir)
+
+	ctx := context.Background()
+	buildID := "test-build-456"
+	imageRef := mgr.config.RegistryURL + "/builds/" + buildID
+
+	// Start with image in pending status
+	imageMgr.images[imageRef] = &images.Image{
+		Name:   imageRef,
+		Status: images.StatusPending,
+	}
+
+	// Simulate conversion completing after a short delay
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		imageMgr.images[imageRef].Status = images.StatusConverting
+		time.Sleep(100 * time.Millisecond)
+		imageMgr.images[imageRef].Status = images.StatusReady
+	}()
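+
+	// Pending -> Converting lands at ~100ms and Converting -> Ready at
+	// ~200ms, so a successful wait should take at least 200ms in total.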
+
+	// waitForImageReady should poll and eventually succeed
+	start := time.Now()
+	err := mgr.waitForImageReady(ctx, buildID)
+	elapsed := time.Since(start)
+
+	require.NoError(t, err)
+	assert.GreaterOrEqual(t, elapsed, 200*time.Millisecond, "should have waited for conversion")
+}
+
+// TestWaitForImageReady_ContextCancelled tests that waitForImageReady returns the
+// context's error when the context expires before the image becomes ready
+func TestWaitForImageReady_ContextCancelled(t *testing.T) {
+	mgr, _, _, imageMgr, tempDir := setupTestManagerWithImageMgr(t)
+	defer os.RemoveAll(tempDir)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+	defer cancel()
+
+	buildID := "test-build-789"
+	imageRef := mgr.config.RegistryURL + "/builds/" + buildID
+
+	// Image stays in pending status forever
+	imageMgr.images[imageRef] = &images.Image{
+		Name:   imageRef,
+		Status: images.StatusPending,
+	}
+
+	// waitForImageReady should return context error
+	err := mgr.waitForImageReady(ctx, buildID)
+	require.Error(t, err)
+	assert.ErrorIs(t, err, context.DeadlineExceeded)
+}
+
+// TestWaitForImageReady_Failed tests that waitForImageReady returns error if image conversion fails
+func TestWaitForImageReady_Failed(t *testing.T) {
+	mgr, _, _, imageMgr, tempDir := setupTestManagerWithImageMgr(t)
+	defer os.RemoveAll(tempDir)
+
+	ctx := context.Background()
+	buildID := "test-build-failed"
+	imageRef := mgr.config.RegistryURL + "/builds/" + buildID
+
+	// Image is in failed status
+	imageMgr.images[imageRef] = &images.Image{
+		Name:   imageRef,
+		Status: images.StatusFailed,
+	}
+
+	// waitForImageReady should return error immediately
+	err := mgr.waitForImageReady(ctx, buildID)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "image conversion failed")
+}
+
+// TestImageAvailabilityAfterBuildComplete documents the fix options considered:
+// build should only report "ready" after verifying the image is available.
+func TestImageAvailabilityAfterBuildComplete(t *testing.T) {
+	t.Skip("documents the design options considered; the implemented behavior is covered by the TestWaitForImageReady_* tests above")
+
+	// The fix could have involved one of:
+	//
+	// Option 1: Synchronous conversion in registry
+	//   - Change `go r.triggerConversion()` to synchronous call
+	//   - Pros: Simple fix
+	//   - Cons: Increases latency for builder push response
+	//
+	// Option 2: Builder verifies image availability
+	//   - After pushing, builder pulls/verifies the image
+	//   - Only then reports success via vsock
+	//   - Pros: End-to-end verification
+	//   - Cons: Adds complexity to builder agent
+	//
+	// Option 3: Build manager waits for image (implemented by this patch)
+	//   - After receiving success from builder, poll image status
+	//   - Only set build to "ready" when image is "ready"
+	//   - Pros: Clean separation of concerns
+	//   - Cons: Adds polling overhead
+	//
+	// Option 4: Expose image availability endpoint
+	//   - Callers check image availability before creating instances
+	//   - Pros: Flexible for callers
+	//   - Cons: Pushes complexity to callers (previous workaround)
+}
+
+// Concurrent access test to verify thread safety of status updates
+func TestConcurrentStatusUpdates(t *testing.T) {
+	mgr, _, _, tempDir := setupTestManager(t)
+	defer os.RemoveAll(tempDir)
+
+	ctx := context.Background()
+
+	// Create a build
+	req := CreateBuildRequest{
+		Dockerfile: "FROM alpine",
+	}
+	build, err := mgr.CreateBuild(ctx, req, []byte("source"))
+	require.NoError(t, err)
+
+	// Concurrently subscribe and update status
+	var wg sync.WaitGroup
+	const numGoroutines = 10
+
+	for i := 0; i < numGoroutines; i++ {
+		wg.Add(1)
+		go func(id int) {
+			defer wg.Done()
+
+			// Subscribe
+			ch := make(chan BuildEvent, 10)
+			mgr.subscribeToStatus(build.ID, ch)
+			defer mgr.unsubscribeFromStatus(build.ID, ch)
+
+			// Small delay to interleave
+			time.Sleep(time.Duration(id) * time.Millisecond)
+
+			// Read any events
+			for {
+				select {
+				case <-ch:
+					// Got event
+				case <-time.After(50 * time.Millisecond):
+					return
+				}
+			}
+		}(i)
+	}
+
+	// Trigger status updates concurrently
+	for i := 0; i < numGoroutines; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			mgr.updateStatus(build.ID, StatusBuilding, nil)
+		}()
+	}
+
+	wg.Wait()
+
+	// Should not panic or deadlock
+	t.Log("Concurrent status updates completed without deadlock")
+}
diff --git a/lib/providers/providers.go b/lib/providers/providers.go
index 09d571c..9f0bcd0 100644
--- a/lib/providers/providers.go
+++ b/lib/providers/providers.go
@@ -230,7 +230,7 @@ func ProvideIngressManager(p *paths.Paths, cfg *config.Config, instanceManager i
 }
 
 // ProvideBuildManager provides the build manager
-func ProvideBuildManager(p *paths.Paths, cfg *config.Config, instanceManager instances.Manager, volumeManager volumes.Manager, log *slog.Logger) (builds.Manager, error) {
+func ProvideBuildManager(p *paths.Paths, cfg *config.Config, instanceManager instances.Manager, volumeManager volumes.Manager, imageManager images.Manager, log *slog.Logger) (builds.Manager, error) {
 	buildConfig := builds.Config{
 		MaxConcurrentBuilds: cfg.MaxConcurrentSourceBuilds,
 		BuilderImage:        cfg.BuilderImage,
@@ -263,5 +263,5 @@ func ProvideBuildManager(p *paths.Paths, cfg *config.Config, instanceManager ins
 	}
 
 	meter := otel.GetMeterProvider().Meter("hypeman")
-	return builds.NewManager(p, buildConfig, instanceManager, volumeManager, secretProvider, log, meter)
+	return builds.NewManager(p, buildConfig, instanceManager, volumeManager, imageManager, secretProvider, log, meter)
 }

From 450d496cd13caca697d709ff2b8b5e674741f7d3 Mon Sep 17 00:00:00 2001
From: Hiro Tamada
Date: Wed, 21 Jan 2026 12:43:15 -0500
Subject: [PATCH 2/2] fix: use buildCtx for image wait and recalculate duration

Addresses PR review feedback:

1. Use buildCtx instead of ctx for waitForImageReady to respect build
   timeout during image conversion wait
2. Recalculate duration after waitForImageReady completes to accurately
   report total build time including image conversion
---
 lib/builds/manager.go | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)
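
Reviewer note (below the fold, not part of the commit message): the
hunk assumes buildCtx is the deadline-bound context the build already
runs under. Its derivation is outside this diff, so the sketch below is
an assumption about its shape (the BuildTimeout field name is
hypothetical), not a quote of manager.go:

	buildCtx, cancel := context.WithTimeout(ctx, m.config.BuildTimeout)
	defer cancel()
	// With buildCtx, waitForImageReady's ctx.Done() check fires when the
	// overall build deadline expires, instead of only after the poller's
	// own 60s ceiling.
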
diff --git a/lib/builds/manager.go b/lib/builds/manager.go
index 5315347..72e1b0a 100644
--- a/lib/builds/manager.go
+++ b/lib/builds/manager.go
@@ -301,16 +301,24 @@ func (m *manager) runBuild(ctx context.Context, id string, req CreateBuildReques
 	// Wait for image to be ready before reporting build as complete.
 	// This fixes the race condition (KERNEL-863) where build reports "ready"
 	// but image conversion hasn't finished yet.
-	if err := m.waitForImageReady(ctx, id); err != nil {
+	// Use buildCtx to respect the build timeout during image wait.
+	if err := m.waitForImageReady(buildCtx, id); err != nil {
+		// Recalculate duration to include image wait time
+		duration = time.Since(start)
+		durationMS = duration.Milliseconds()
 		m.logger.Error("image conversion failed after build", "id", id, "error", err, "duration", duration)
 		errMsg := fmt.Sprintf("image conversion failed: %v", err)
 		m.updateBuildComplete(id, StatusFailed, nil, &errMsg, &result.Provenance, &durationMS)
 		if m.metrics != nil {
-			m.metrics.RecordBuild(ctx, "failed", duration)
+			m.metrics.RecordBuild(buildCtx, "failed", duration)
 		}
 		return
 	}
 
+	// Recalculate duration to include image wait time for accurate reporting
+	duration = time.Since(start)
+	durationMS = duration.Milliseconds()
+
 	m.updateBuildComplete(id, StatusReady, &result.ImageDigest, nil, &result.Provenance, &durationMS)
 
 	// Update with image ref
@@ -320,7 +328,7 @@ func (m *manager) runBuild(ctx context.Context, id string, req CreateBuildReques
 	}
 
 	if m.metrics != nil {
-		m.metrics.RecordBuild(ctx, "success", duration)
+		m.metrics.RecordBuild(buildCtx, "success", duration)
 	}
 }