From 226e19d00734b3a41110f94a6c691fc7929f78da Mon Sep 17 00:00:00 2001 From: Brooke Hamilton <45323234+brooke-hamilton@users.noreply.github.com> Date: Sun, 8 Feb 2026 13:26:52 -0500 Subject: [PATCH 1/4] consolidate preflight upgrade tests and adjust timeouts Signed-off-by: Brooke Hamilton <45323234+brooke-hamilton@users.noreply.github.com> --- .../upgrade/upgrade_test.go | 451 +++++++----------- 1 file changed, 178 insertions(+), 273 deletions(-) diff --git a/test/functional-portable/upgrade/upgrade_test.go b/test/functional-portable/upgrade/upgrade_test.go index 7081fee5b1..a7d7c428c0 100644 --- a/test/functional-portable/upgrade/upgrade_test.go +++ b/test/functional-portable/upgrade/upgrade_test.go @@ -31,256 +31,176 @@ import ( batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" ) const ( radiusNamespace = "radius-system" preUpgradeJobName = "pre-upgrade" - testTimeout = 5 * time.Minute + helmTimeout = "3m" relativeChartPath = "../../../deploy/Chart" + + // Polling intervals for waiting on Kubernetes state changes. + controlPlanePollInterval = 3 * time.Second + cleanupPollInterval = 3 * time.Second + jobPollInterval = 1 * time.Second + jobPollAttempts = 15 ) -func Test_PreflightContainer_FreshInstall(t *testing.T) { - ctx := testcontext.New(t) +// Test_PreflightContainer runs all preflight container upgrade tests as sequential +// subtests. These tests cannot run in parallel because they share the same Helm +// release name and Kubernetes namespace. Consolidating into subtests reduces the +// number of full install/uninstall cycles (from 4 to 2) and eliminates redundant +// test logic. +func Test_PreflightContainer(t *testing.T) { + t.Run("Enabled", testPreflightEnabled) + t.Run("Disabled", testPreflightDisabled) +} +// testPreflightEnabled verifies that when preflight is enabled: +// - The pre-upgrade Helm hook creates a job during upgrade +// - Custom job configuration (TTL, version check) is applied correctly +// - Job logs and status are accessible +func testPreflightEnabled(t *testing.T) { + ctx := testcontext.New(t) image, tag := getPreUpgradeImage() - // Clean up any existing installation using helm - t.Log("Cleaning up any existing Radius installation") - cleanupCommand := []string{"helm", "uninstall", "radius", "--namespace", radiusNamespace, "--ignore-not-found"} - _ = exec.Command(cleanupCommand[0], cleanupCommand[1:]...).Run() // Ignore errors during cleanup + cleanupAndWait(t, ctx) - // Use local registry image for testing the pre-upgrade functionality - t.Log("Installing Radius with preflight enabled using helm") - installCommand := []string{ - "helm", "install", "radius", relativeChartPath, - "--namespace", radiusNamespace, - "--create-namespace", - "--set", "preupgrade.enabled=true", - "--set", fmt.Sprintf("preupgrade.image=%s", image), - "--set", fmt.Sprintf("preupgrade.tag=%s", tag), - "--wait", + helmValues := map[string]string{ + "preupgrade.enabled": "true", + "preupgrade.ttlSecondsAfterFinished": "60", + "preupgrade.checks.version": "true", } - err := runHelmCommand(ctx, installCommand) - require.NoError(t, err, "Failed to install Radius using helm") - // Wait for the control plane to become available - var options rp.RPTestOptions - require.Eventually(t, func() bool { - func() { - defer func() { - if r := recover(); r != nil { - // NewRPTestOptions calls require.NoError internally, catch panics - t.Logf("Control plane not ready yet: %v", r) - } 
- }() - options = rp.NewRPTestOptions(t) - }() - // Test if we can make a simple API call to verify the control plane is ready - return options.ManagementClient != nil - }, 2*time.Minute, 5*time.Second, "Control plane did not become available within timeout") + t.Log("Installing Radius with preflight enabled and custom configuration") + err := helmInstall(ctx, image, tag, helmValues) + require.NoError(t, err, "Failed to install Radius") - t.Log("Attempting upgrade with local chart to trigger Helm pre-upgrade hook") - upgradeCommand := []string{ - "helm", "upgrade", "radius", relativeChartPath, - "--namespace", radiusNamespace, - "--set", "preupgrade.enabled=true", - "--set", fmt.Sprintf("preupgrade.image=%s", image), - "--set", fmt.Sprintf("preupgrade.tag=%s", tag), - "--wait", - } - // This might fail due to version issues, but should trigger the Helm hook - // The key test is that the job gets created - _ = runHelmCommand(ctx, upgradeCommand) // Ignore errors as upgrade may fail due to version issues + options := waitForControlPlane(t, ctx) + + t.Log("Upgrading to trigger pre-upgrade hook") + // Upgrade may fail due to version issues, but should trigger the Helm hook. + // The key assertion is that the job gets created. + _ = helmUpgrade(ctx, image, tag, helmValues) - t.Log("Verifying preflight job was created by Helm pre-upgrade hook") - verifyPreflightJobRan(t, ctx, options, true) + t.Log("Verifying preflight job was created and configured correctly") + job := findPreflightJob(t, ctx, options) + if job != nil { + logJobDetails(t, ctx, options, job) - t.Log("Cleaning up - uninstalling Radius using helm") - uninstallCommand := []string{"helm", "uninstall", "radius", "--namespace", radiusNamespace} - err = runHelmCommand(ctx, uninstallCommand) - require.NoError(t, err, "Failed to uninstall Radius using helm") + // Verify custom configuration was applied + require.NotNil(t, job.Spec.TTLSecondsAfterFinished, "TTLSecondsAfterFinished should be set") + require.Equal(t, int32(60), *job.Spec.TTLSecondsAfterFinished) + t.Log("Job configuration verified") + } else { + t.Log("Preflight job not found - upgrade likely failed before hooks triggered (acceptable in test environment)") + } + + helmUninstall(t, ctx) } -func Test_PreflightContainer_PreflightDisabled(t *testing.T) { +// testPreflightDisabled verifies that when preflight is disabled, the pre-upgrade +// Helm hook does not create a job during upgrade. 
+func testPreflightDisabled(t *testing.T) { ctx := testcontext.New(t) - image, tag := getPreUpgradeImage() - // Clean up any existing installation using helm - t.Log("Cleaning up any existing Radius installation") - cleanupCommand := []string{"helm", "uninstall", "radius", "--namespace", radiusNamespace, "--ignore-not-found"} - _ = exec.Command(cleanupCommand[0], cleanupCommand[1:]...).Run() // Ignore errors during cleanup - - // Wait for cleanup to complete - time.Sleep(3 * time.Second) + cleanupAndWait(t, ctx) - t.Log("Installing Radius with preflight disabled using helm") - installCommand := []string{ - "helm", "install", "radius", relativeChartPath, - "--namespace", radiusNamespace, - "--create-namespace", - "--set", "preupgrade.enabled=false", - "--set", fmt.Sprintf("preupgrade.image=%s", image), - "--set", fmt.Sprintf("preupgrade.tag=%s", tag), - "--wait", + helmValues := map[string]string{ + "preupgrade.enabled": "false", } - err := runHelmCommand(ctx, installCommand) - require.NoError(t, err, "Failed to install Radius using helm") - // Wait for the control plane to become available - var options rp.RPTestOptions - require.Eventually(t, func() bool { - func() { - defer func() { - if r := recover(); r != nil { - // NewRPTestOptions calls require.NoError internally, catch panics - t.Logf("Control plane not ready yet: %v", r) - } - }() - options = rp.NewRPTestOptions(t) - }() - // Test if we can make a simple API call to verify the control plane is ready - return options.ManagementClient != nil - }, 2*time.Minute, 5*time.Second, "Control plane did not become available within timeout") + t.Log("Installing Radius with preflight disabled") + err := helmInstall(ctx, image, tag, helmValues) + require.NoError(t, err, "Failed to install Radius") - t.Log("Attempting upgrade with preflight disabled using helm") + options := waitForControlPlane(t, ctx) // Ensure no leftover job exists before upgrade - _ = options.K8sClient.BatchV1().Jobs(radiusNamespace).Delete(ctx, preUpgradeJobName, metav1.DeleteOptions{}) // Ignore errors during cleanup + _ = options.K8sClient.BatchV1().Jobs(radiusNamespace).Delete(ctx, preUpgradeJobName, metav1.DeleteOptions{}) - upgradeCommand := []string{ - "helm", "upgrade", "radius", relativeChartPath, - "--namespace", radiusNamespace, - "--set", "preupgrade.enabled=false", - "--set", fmt.Sprintf("preupgrade.image=%s", image), - "--set", fmt.Sprintf("preupgrade.tag=%s", tag), - "--wait", - } - _ = runHelmCommand(ctx, upgradeCommand) // Ignore errors as upgrade might fail due to version issues + t.Log("Upgrading with preflight disabled") + _ = helmUpgrade(ctx, image, tag, helmValues) t.Log("Verifying no preflight job was created") - verifyPreflightJobRan(t, ctx, options, false) + // Brief wait to allow any unexpected job creation to surface + time.Sleep(5 * time.Second) + _, err = options.K8sClient.BatchV1().Jobs(radiusNamespace).Get(ctx, preUpgradeJobName, metav1.GetOptions{}) + if err != nil && strings.Contains(err.Error(), "not found") { + t.Log("Preflight job correctly not created when disabled") + } else { + t.Errorf("Expected preflight job to not exist when disabled, but found it or got unexpected error: %v", err) + } - t.Log("Cleaning up - uninstalling Radius using helm") - uninstallCommand := []string{"helm", "uninstall", "radius", "--namespace", radiusNamespace} - err = runHelmCommand(ctx, uninstallCommand) - require.NoError(t, err, "Failed to uninstall Radius using helm") + helmUninstall(t, ctx) } -func Test_PreflightContainer_JobConfiguration(t *testing.T) { - 
ctx := testcontext.New(t) - - image, tag := getPreUpgradeImage() - - // Clean up any existing installation using helm - t.Log("Cleaning up any existing Radius installation") - cleanupCommand := []string{"helm", "uninstall", "radius", "--namespace", radiusNamespace, "--ignore-not-found"} - _ = exec.Command(cleanupCommand[0], cleanupCommand[1:]...).Run() // Ignore errors during cleanup +// Helper functions - // Wait for cleanup to complete - time.Sleep(3 * time.Second) +// getPreUpgradeImage constructs the pre-upgrade container image name using the configured registry and tag. +func getPreUpgradeImage() (image string, tag string) { + registry, tag := testutil.SetDefault() + return fmt.Sprintf("%s/pre-upgrade", registry), tag +} - // Use local registry image for testing - t.Log("Installing Radius with custom preflight configuration using helm") - installCommand := []string{ +// helmInstall runs helm install with the given image, tag, and additional values. +func helmInstall(ctx context.Context, image, tag string, values map[string]string) error { + args := []string{ "helm", "install", "radius", relativeChartPath, "--namespace", radiusNamespace, "--create-namespace", - "--set", "preupgrade.enabled=true", - "--set", "preupgrade.ttlSecondsAfterFinished=60", - "--set", "preupgrade.checks.version=true", "--set", fmt.Sprintf("preupgrade.image=%s", image), "--set", fmt.Sprintf("preupgrade.tag=%s", tag), "--wait", + "--timeout", helmTimeout, } - err := runHelmCommand(ctx, installCommand) - require.NoError(t, err, "Failed to install Radius using helm") - - // Wait for the control plane to become available - var options rp.RPTestOptions - require.Eventually(t, func() bool { - func() { - defer func() { - if r := recover(); r != nil { - // NewRPTestOptions calls require.NoError internally, catch panics - t.Logf("Control plane not ready yet: %v", r) - } - }() - options = rp.NewRPTestOptions(t) - }() - // Test if we can make a simple API call to verify the control plane is ready - return options.ManagementClient != nil - }, 2*time.Minute, 5*time.Second, "Control plane did not become available within timeout") + for k, v := range values { + args = append(args, "--set", fmt.Sprintf("%s=%s", k, v)) + } + return runCommand(ctx, args) +} - t.Log("Attempting upgrade to trigger preflight checks using helm") - upgradeCommand := []string{ +// helmUpgrade runs helm upgrade with the given image, tag, and additional values. 
+func helmUpgrade(ctx context.Context, image, tag string, values map[string]string) error { + args := []string{ "helm", "upgrade", "radius", relativeChartPath, "--namespace", radiusNamespace, - "--set", "preupgrade.enabled=true", - "--set", "preupgrade.ttlSecondsAfterFinished=60", - "--set", "preupgrade.checks.version=true", "--set", fmt.Sprintf("preupgrade.image=%s", image), "--set", fmt.Sprintf("preupgrade.tag=%s", tag), "--wait", + "--timeout", helmTimeout, } - _ = runHelmCommand(ctx, upgradeCommand) // Ignore errors as upgrade might fail due to version issues - - t.Log("Verifying preflight job configuration") - // Try to get the job, but don't fail the test if upgrade failed before job creation - job, jobErr := func() (*batchv1.Job, error) { - for i := 0; i < 15; i++ { - job, err := options.K8sClient.BatchV1().Jobs(radiusNamespace).Get(ctx, preUpgradeJobName, metav1.GetOptions{}) - if err == nil { - return job, nil - } - time.Sleep(1 * time.Second) - } - return nil, fmt.Errorf("job not found") - }() - - if jobErr == nil { - t.Log("Found preflight job, checking configuration") - require.NotNil(t, job.Spec.TTLSecondsAfterFinished) - require.Equal(t, int32(60), *job.Spec.TTLSecondsAfterFinished) - t.Log("Job configuration is correct") - } else { - t.Log("Preflight job not found - upgrade likely failed before job creation due to version compatibility") - t.Log("This is acceptable in test environment") + for k, v := range values { + args = append(args, "--set", fmt.Sprintf("%s=%s", k, v)) } - - t.Log("Cleaning up - uninstalling Radius using helm") - uninstallCommand := []string{"helm", "uninstall", "radius", "--namespace", radiusNamespace} - err = runHelmCommand(ctx, uninstallCommand) - require.NoError(t, err, "Failed to uninstall Radius using helm") + return runCommand(ctx, args) } -func Test_PreflightContainer_PreflightOnly(t *testing.T) { - ctx := testcontext.New(t) - - image, tag := getPreUpgradeImage() - - // Clean up any existing installation using helm - t.Log("Cleaning up any existing Radius installation") - cleanupCommand := []string{"helm", "uninstall", "radius", "--namespace", radiusNamespace, "--ignore-not-found"} - _ = exec.Command(cleanupCommand[0], cleanupCommand[1:]...).Run() // Ignore errors during cleanup +// helmUninstall removes the Radius helm release. +func helmUninstall(t *testing.T, ctx context.Context) { + t.Helper() + t.Log("Uninstalling Radius") + err := runCommand(ctx, []string{"helm", "uninstall", "radius", "--namespace", radiusNamespace}) + require.NoError(t, err, "Failed to uninstall Radius") +} - // Use local registry image for testing - t.Log("Installing Radius using helm") - installCommand := []string{ - "helm", "install", "radius", relativeChartPath, - "--namespace", radiusNamespace, - "--create-namespace", - "--set", "preupgrade.enabled=true", - "--set", fmt.Sprintf("preupgrade.image=%s", image), - "--set", fmt.Sprintf("preupgrade.tag=%s", tag), - "--wait", +// runCommand executes a shell command and returns an error if it fails. +func runCommand(ctx context.Context, args []string) error { + cmd := exec.CommandContext(ctx, args[0], args[1:]...) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("command failed: %s, output: %s", err, string(output)) } - err := runHelmCommand(ctx, installCommand) - require.NoError(t, err, "Failed to install Radius using helm") + return nil +} - // Wait for the control plane to become available +// waitForControlPlane polls until the Radius control plane API is reachable. 
+func waitForControlPlane(t *testing.T, ctx context.Context) rp.RPTestOptions { + t.Helper() var options rp.RPTestOptions require.Eventually(t, func() bool { func() { @@ -292,98 +212,83 @@ func Test_PreflightContainer_PreflightOnly(t *testing.T) { }() options = rp.NewRPTestOptions(t) }() - // Test if we can make a simple API call to verify the control plane is ready return options.ManagementClient != nil - }, 2*time.Minute, 5*time.Second, "Control plane did not become available within timeout") + }, 2*time.Minute, controlPlanePollInterval, "Control plane did not become available within timeout") + return options +} - t.Log("Running upgrade to trigger preflight hooks using helm") - upgradeCommand := []string{ - "helm", "upgrade", "radius", relativeChartPath, - "--namespace", radiusNamespace, - "--set", "preupgrade.enabled=true", - "--set", fmt.Sprintf("preupgrade.image=%s", image), - "--set", fmt.Sprintf("preupgrade.tag=%s", tag), - "--wait", - } - _ = runHelmCommand(ctx, upgradeCommand) // Ignore errors as upgrade might fail due to version issues but should trigger hooks +// cleanupAndWait uninstalls Radius and waits for all pods in the namespace to be fully +// terminated before returning. This prevents aggregated API service conflicts when the +// next helm install runs before the previous resources are fully cleaned up. +func cleanupAndWait(t *testing.T, ctx context.Context) { + t.Helper() - t.Log("Verifying preflight job ran successfully") - verifyPreflightJobRan(t, ctx, options, true) + t.Log("Cleaning up any existing Radius installation") + _ = exec.CommandContext(ctx, "helm", "uninstall", "radius", + "--namespace", radiusNamespace, "--ignore-not-found", "--wait").Run() + + // Wait for all pods in the namespace to terminate. The Kubernetes aggregated API + // service needs time to deregister after pods are gone, so we must wait for a + // fully clean namespace before starting a new install. 
+ config, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + clientcmd.NewDefaultClientConfigLoadingRules(), nil).ClientConfig() + if err != nil { + t.Logf("Warning: could not create k8s client for cleanup wait: %v", err) + time.Sleep(10 * time.Second) + return + } - t.Log("Cleaning up - uninstalling Radius using helm") - uninstallCommand := []string{"helm", "uninstall", "radius", "--namespace", radiusNamespace} - err = runHelmCommand(ctx, uninstallCommand) - require.NoError(t, err, "Failed to uninstall Radius using helm") -} + k8sClient, err := kubernetes.NewForConfig(config) + if err != nil { + t.Logf("Warning: could not create k8s client for cleanup wait: %v", err) + time.Sleep(10 * time.Second) + return + } -// Helper functions + require.Eventually(t, func() bool { + pods, err := k8sClient.CoreV1().Pods(radiusNamespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return true // Namespace may not exist, which is fine + } + if len(pods.Items) == 0 { + return true + } + t.Logf("Waiting for %d pod(s) in %s to terminate...", len(pods.Items), radiusNamespace) + return false + }, 2*time.Minute, cleanupPollInterval, "Namespace %s did not become empty within timeout", radiusNamespace) -// getPreUpgradeImage constructs the pre-upgrade container image name using the configured registry and tag -func getPreUpgradeImage() (image string, tag string) { - registry, tag := testutil.SetDefault() - return fmt.Sprintf("%s/pre-upgrade", registry), tag + // Brief wait for Kubernetes aggregated API service deregistration + time.Sleep(3 * time.Second) } -// runHelmCommand executes a helm command and returns an error if it fails -func runHelmCommand(ctx context.Context, args []string) error { - cmd := exec.CommandContext(ctx, args[0], args[1:]...) - output, err := cmd.CombinedOutput() - if err != nil { - return fmt.Errorf("helm command failed: %s, output: %s", err, string(output)) +// findPreflightJob polls for the pre-upgrade job, returning it if found within the timeout. 
+func findPreflightJob(t *testing.T, ctx context.Context, options rp.RPTestOptions) *batchv1.Job { + t.Helper() + for range jobPollAttempts { + job, err := options.K8sClient.BatchV1().Jobs(radiusNamespace).Get(ctx, preUpgradeJobName, metav1.GetOptions{}) + if err == nil { + t.Log("Preflight job was created by Helm pre-upgrade hook") + return job + } + time.Sleep(jobPollInterval) } return nil } -func verifyPreflightJobRan(t *testing.T, ctx context.Context, options rp.RPTestOptions, shouldExist bool) { - if shouldExist { - // Look for the job, but be flexible about timing since upgrades might fail early - var jobExists bool - var finalJob *batchv1.Job - - // Try to find the job with a shorter timeout since upgrade might fail quickly - jobExists = func() bool { - for range 15 { // 15 seconds total - job, err := options.K8sClient.BatchV1().Jobs(radiusNamespace).Get(ctx, preUpgradeJobName, metav1.GetOptions{}) - if err == nil { - finalJob = job - return true - } - time.Sleep(1 * time.Second) - } - return false - }() - - if jobExists { - t.Log("Preflight job was created by Helm pre-upgrade hook") - - // If we found the job, let's check its logs regardless of status - pods, err := options.K8sClient.CoreV1().Pods(radiusNamespace).List(ctx, metav1.ListOptions{ - LabelSelector: "job-name=" + preUpgradeJobName, - }) - if err != nil { - t.Logf("Warning: Failed to list pods for job: %v", err) - } else if len(pods.Items) > 0 { - logs, logErr := options.K8sClient.CoreV1().Pods(radiusNamespace).GetLogs(pods.Items[0].Name, &corev1.PodLogOptions{}).DoRaw(ctx) - if logErr != nil { - t.Logf("Warning: Failed to get pod logs: %v", logErr) - } else { - t.Logf("Preflight job logs:\n%s", string(logs)) - } - } - - t.Logf("Preflight job status - Succeeded: %d, Failed: %d", finalJob.Status.Succeeded, finalJob.Status.Failed) - } else { - t.Log("Preflight job was not found - this might happen if upgrade failed before Helm hooks were triggered") - t.Log("This is acceptable in test environment where version jumps cause expected failures") - } - } else { - // Verify job does not exist (give it a moment in case there's delay) - time.Sleep(5 * time.Second) - _, err := options.K8sClient.BatchV1().Jobs(radiusNamespace).Get(ctx, preUpgradeJobName, metav1.GetOptions{}) - if err != nil && strings.Contains(err.Error(), "not found") { - t.Log("Preflight job correctly not created when disabled") - } else { - t.Errorf("Expected preflight job to not exist when disabled, but found it or got unexpected error: %v", err) +// logJobDetails retrieves and logs the job's pod logs and status. +func logJobDetails(t *testing.T, ctx context.Context, options rp.RPTestOptions, job *batchv1.Job) { + t.Helper() + + pods, err := options.K8sClient.CoreV1().Pods(radiusNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: "job-name=" + preUpgradeJobName, + }) + if err == nil && len(pods.Items) > 0 { + logs, logErr := options.K8sClient.CoreV1().Pods(radiusNamespace). 
+ GetLogs(pods.Items[0].Name, &corev1.PodLogOptions{}).DoRaw(ctx) + if logErr == nil { + t.Logf("Preflight job logs:\n%s", string(logs)) } } + + t.Logf("Preflight job status - Succeeded: %d, Failed: %d", job.Status.Succeeded, job.Status.Failed) } From e60da1e0592e55a9444bc8ca596733d56f57ab21 Mon Sep 17 00:00:00 2001 From: Brooke Hamilton <45323234+brooke-hamilton@users.noreply.github.com> Date: Sun, 8 Feb 2026 14:20:05 -0500 Subject: [PATCH 2/4] ignore contour pods on upgrade test Signed-off-by: Brooke Hamilton <45323234+brooke-hamilton@users.noreply.github.com> --- .../upgrade/upgrade_test.go | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/test/functional-portable/upgrade/upgrade_test.go b/test/functional-portable/upgrade/upgrade_test.go index a7d7c428c0..7fc8dff014 100644 --- a/test/functional-portable/upgrade/upgrade_test.go +++ b/test/functional-portable/upgrade/upgrade_test.go @@ -47,6 +47,15 @@ const ( cleanupPollInterval = 3 * time.Second jobPollInterval = 1 * time.Second jobPollAttempts = 15 + + // cleanupTimeout is the maximum time to wait for Radius pods to terminate + // after uninstalling the Helm release. + cleanupTimeout = 2 * time.Minute + + // radiusPodSelector selects only pods belonging to the Radius Helm release. + // Contour is deployed as a separate Helm release in the same namespace and + // must be excluded from cleanup checks — its pods will remain running. + radiusPodSelector = "app.kubernetes.io/part-of=radius" ) // Test_PreflightContainer runs all preflight container upgrade tests as sequential @@ -217,9 +226,14 @@ func waitForControlPlane(t *testing.T, ctx context.Context) rp.RPTestOptions { return options } -// cleanupAndWait uninstalls Radius and waits for all pods in the namespace to be fully -// terminated before returning. This prevents aggregated API service conflicts when the -// next helm install runs before the previous resources are fully cleaned up. +// cleanupAndWait uninstalls the Radius Helm release and waits for all Radius pods in +// the namespace to be fully terminated before returning. This prevents aggregated API +// service conflicts when the next helm install runs before previous resources are +// fully cleaned up. +// +// Only Radius-owned pods (labeled app.kubernetes.io/part-of=radius) are monitored. +// Contour is deployed as a separate Helm release in the same namespace and its pods +// are expected to remain running. func cleanupAndWait(t *testing.T, ctx context.Context) { t.Helper() @@ -227,9 +241,9 @@ func cleanupAndWait(t *testing.T, ctx context.Context) { _ = exec.CommandContext(ctx, "helm", "uninstall", "radius", "--namespace", radiusNamespace, "--ignore-not-found", "--wait").Run() - // Wait for all pods in the namespace to terminate. The Kubernetes aggregated API - // service needs time to deregister after pods are gone, so we must wait for a - // fully clean namespace before starting a new install. + // Wait for Radius pods to terminate. The Kubernetes aggregated API service needs + // time to deregister after pods are gone, so we must wait for Radius pods to be + // fully removed before starting a new install. 
config, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( clientcmd.NewDefaultClientConfigLoadingRules(), nil).ClientConfig() if err != nil { @@ -246,16 +260,18 @@ func cleanupAndWait(t *testing.T, ctx context.Context) { } require.Eventually(t, func() bool { - pods, err := k8sClient.CoreV1().Pods(radiusNamespace).List(ctx, metav1.ListOptions{}) + pods, err := k8sClient.CoreV1().Pods(radiusNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: radiusPodSelector, + }) if err != nil { - return true // Namespace may not exist, which is fine + return true } if len(pods.Items) == 0 { return true } - t.Logf("Waiting for %d pod(s) in %s to terminate...", len(pods.Items), radiusNamespace) + t.Logf("Waiting for %d Radius pod(s) in %s to terminate...", len(pods.Items), radiusNamespace) return false - }, 2*time.Minute, cleanupPollInterval, "Namespace %s did not become empty within timeout", radiusNamespace) + }, cleanupTimeout, cleanupPollInterval, "Radius pods in %s did not terminate within timeout", radiusNamespace) // Brief wait for Kubernetes aggregated API service deregistration time.Sleep(3 * time.Second) From 86ea08a8d36b33441df99fbfe6a6bb7b73447a81 Mon Sep 17 00:00:00 2001 From: Brooke Hamilton <45323234+brooke-hamilton@users.noreply.github.com> Date: Mon, 9 Feb 2026 09:52:12 -0500 Subject: [PATCH 3/4] code review fixes Signed-off-by: Brooke Hamilton <45323234+brooke-hamilton@users.noreply.github.com> --- test/functional-portable/upgrade/upgrade_test.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/functional-portable/upgrade/upgrade_test.go b/test/functional-portable/upgrade/upgrade_test.go index 7fc8dff014..ca11d05f49 100644 --- a/test/functional-portable/upgrade/upgrade_test.go +++ b/test/functional-portable/upgrade/upgrade_test.go @@ -105,7 +105,7 @@ func testPreflightEnabled(t *testing.T) { require.Equal(t, int32(60), *job.Spec.TTLSecondsAfterFinished) t.Log("Job configuration verified") } else { - t.Log("Preflight job not found - upgrade likely failed before hooks triggered (acceptable in test environment)") + t.Fatal("Preflight job not found - upgrade likely failed before hooks triggered") } helmUninstall(t, ctx) @@ -264,7 +264,8 @@ func cleanupAndWait(t *testing.T, ctx context.Context) { LabelSelector: radiusPodSelector, }) if err != nil { - return true + t.Logf("Warning: failed to list pods: %v", err) + return false } if len(pods.Items) == 0 { return true @@ -286,7 +287,11 @@ func findPreflightJob(t *testing.T, ctx context.Context, options rp.RPTestOption t.Log("Preflight job was created by Helm pre-upgrade hook") return job } - time.Sleep(jobPollInterval) + select { + case <-ctx.Done(): + return nil + case <-time.After(jobPollInterval): + } } return nil } From 1058b377a12a1c964ff5f2c8020aceb1dc539acc Mon Sep 17 00:00:00 2001 From: Brooke Hamilton <45323234+brooke-hamilton@users.noreply.github.com> Date: Mon, 9 Feb 2026 11:12:49 -0500 Subject: [PATCH 4/4] code review updates Signed-off-by: Brooke Hamilton <45323234+brooke-hamilton@users.noreply.github.com> --- .../upgrade/upgrade_test.go | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/test/functional-portable/upgrade/upgrade_test.go b/test/functional-portable/upgrade/upgrade_test.go index ca11d05f49..dc60ac3285 100644 --- a/test/functional-portable/upgrade/upgrade_test.go +++ b/test/functional-portable/upgrade/upgrade_test.go @@ -20,7 +20,6 @@ import ( "context" "fmt" "os/exec" - "strings" "testing" "time" @@ -30,6 +29,7 @@ import ( 
"github.com/stretchr/testify/require" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" @@ -52,6 +52,14 @@ const ( // after uninstalling the Helm release. cleanupTimeout = 2 * time.Minute + // cleanupFallbackWait is the fallback sleep duration when unable to create + // a Kubernetes client for cleanup polling. + cleanupFallbackWait = 10 * time.Second + + // apiServiceDeregistrationWait is the time to wait for Kubernetes aggregated + // API service deregistration after Radius pods are terminated. + apiServiceDeregistrationWait = 3 * time.Second + // radiusPodSelector selects only pods belonging to the Radius Helm release. // Contour is deployed as a separate Helm release in the same namespace and // must be excluded from cleanup checks — its pods will remain running. @@ -139,10 +147,13 @@ func testPreflightDisabled(t *testing.T) { // Brief wait to allow any unexpected job creation to surface time.Sleep(5 * time.Second) _, err = options.K8sClient.BatchV1().Jobs(radiusNamespace).Get(ctx, preUpgradeJobName, metav1.GetOptions{}) - if err != nil && strings.Contains(err.Error(), "not found") { + switch { + case apierrors.IsNotFound(err): t.Log("Preflight job correctly not created when disabled") - } else { - t.Errorf("Expected preflight job to not exist when disabled, but found it or got unexpected error: %v", err) + case err == nil: + t.Error("Expected preflight job to not exist when disabled, but it was found") + default: + t.Errorf("Unexpected error checking for preflight job: %v", err) } helmUninstall(t, ctx) @@ -248,14 +259,14 @@ func cleanupAndWait(t *testing.T, ctx context.Context) { clientcmd.NewDefaultClientConfigLoadingRules(), nil).ClientConfig() if err != nil { t.Logf("Warning: could not create k8s client for cleanup wait: %v", err) - time.Sleep(10 * time.Second) + time.Sleep(cleanupFallbackWait) return } k8sClient, err := kubernetes.NewForConfig(config) if err != nil { t.Logf("Warning: could not create k8s client for cleanup wait: %v", err) - time.Sleep(10 * time.Second) + time.Sleep(cleanupFallbackWait) return } @@ -274,8 +285,8 @@ func cleanupAndWait(t *testing.T, ctx context.Context) { return false }, cleanupTimeout, cleanupPollInterval, "Radius pods in %s did not terminate within timeout", radiusNamespace) - // Brief wait for Kubernetes aggregated API service deregistration - time.Sleep(3 * time.Second) + // Wait for Kubernetes aggregated API service deregistration + time.Sleep(apiServiceDeregistrationWait) } // findPreflightJob polls for the pre-upgrade job, returning it if found within the timeout.