From 21ee0e0164b7283db204aa16b4523b90c262a2f1 Mon Sep 17 00:00:00 2001 From: Daniel Peric Date: Wed, 21 Jan 2026 12:55:42 -0500 Subject: [PATCH] Fix SHC appframework bundle-push loop in FIPS mode - Treat FIPS banner-only status output as in-progress - Treat ConfDeploymentException 'already running' as in-progress - Add unit tests --- pkg/splunk/enterprise/afwscheduler.go | 91 +++++++++++++++++++--- pkg/splunk/enterprise/afwscheduler_test.go | 18 +++++ 2 files changed, 97 insertions(+), 12 deletions(-) diff --git a/pkg/splunk/enterprise/afwscheduler.go b/pkg/splunk/enterprise/afwscheduler.go index cc99eca0d..fafce6e71 100644 --- a/pkg/splunk/enterprise/afwscheduler.go +++ b/pkg/splunk/enterprise/afwscheduler.go @@ -1637,15 +1637,60 @@ func (shcPlaybookContext *SHCPlaybookContext) isBundlePushComplete(ctx context.C return false, err } + // NOTE: In some environments (notably Splunk in FIPS mode), the Splunk CLI emits a banner + // like: + // "FIPS provider enabled. ..." + // and sometimes a CLI TLS warning, before the actual bundle push status line is written. + // + // The operator runs `apply shcluster-bundle ... &> status_file &` asynchronously and then + // polls the status file. If we treat "banner-only" output as an error, we can prematurely + // reset state to Pending and re-trigger bundle push, causing collisions like: + // ConfDeploymentException: Can't start deployment job as one is already running! + // + // So we normalize banner-only / known benign warning lines away and only treat remaining + // output as an error if it isn't the known success string. + normalizedOut := normalizeSHCBundlePushStatusOutput(stdOut) + // Check if we did not get the desired output in the status file. There can be 2 scenarios - // 1. stdOut is empty, which means bundle push is still in progress // 2. stdOut has some other string other than the bundle push success message - if stdOut == "" { + if normalizedOut == "" { scopedLog.Info("SHC Bundle Push is still in progress") return false, nil - } else if !strings.Contains(stdOut, shcBundlePushCompleteStr) { + } + + // The completion constant includes a trailing newline, but we strip whitespace in normalizeSHCBundlePushStatusOutput, + // so check for the trimmed message. + completeStr := strings.TrimSpace(shcBundlePushCompleteStr) + + // Happy path: bundle push complete. + if strings.Contains(normalizedOut, completeStr) { + // now that bundle push is complete, remove the status file + err = shcPlaybookContext.removeSHCBundlePushStatusFile(ctx) + if err != nil { + scopedLog.Error(err, "removing SHC Bundle Push status file failed, will retry again.") + + // reset the state to BundlePushInProgress so that we can check the status of file again. + setBundlePushState(ctx, shcPlaybookContext.afwPipeline, enterpriseApi.BundlePushInProgress) + + // don't return error from here, so that we can retry cleaning the file in next run + return false, nil + } + + return true, nil + } + + // If Splunk reports that a deployment job is already running, treat this as "still in progress" + // (don't reset to Pending / retrigger bundle push, which can create a retry storm). + if strings.Contains(normalizedOut, "ConfDeploymentException: Can't start deployment job as one is already running!") { + scopedLog.Info("SHC Bundle Push appears to already be running; will recheck status", "statusFileOutput", normalizedOut) + return false, nil + } + + // Any other non-empty output that isn't the known success string is treated as an error. + if !strings.Contains(normalizedOut, completeStr) { // this means there was an error in bundle push command - err = fmt.Errorf("there was an error in applying SHC Bundle, err=\"%v\"", stdOut) + err = fmt.Errorf("there was an error in applying SHC Bundle, err=\"%v\"", normalizedOut) scopedLog.Error(err, "SHC Bundle push status file reported an error while applying bundle") // reset the bundle push state to Pending, so that we retry again. @@ -1659,19 +1704,41 @@ func (shcPlaybookContext *SHCPlaybookContext) isBundlePushComplete(ctx context.C return false, err } - // now that bundle push is complete, remove the status file - err = shcPlaybookContext.removeSHCBundlePushStatusFile(ctx) - if err != nil { - scopedLog.Error(err, "removing SHC Bundle Push status file failed, will retry again.") + // Defensive fallback: treat as still in progress. + return false, nil +} - // reset the state to BundlePushInProgress so that we can check the status of file again. - setBundlePushState(ctx, shcPlaybookContext.afwPipeline, enterpriseApi.BundlePushInProgress) +// normalizeSHCBundlePushStatusOutput strips known benign Splunk CLI banner/warning lines +// from the SHC bundle push status file output. This allows the operator to treat +// "banner-only" output as still-in-progress rather than as an error. +func normalizeSHCBundlePushStatusOutput(stdOut string) string { + out := strings.TrimSpace(stdOut) + if out == "" { + return "" + } - // don't return error from here, so that we can retry cleaning the file in next run - return false, nil + lines := strings.Split(out, "\n") + kept := make([]string, 0, len(lines)) + for _, raw := range lines { + line := strings.TrimSpace(raw) + if line == "" { + continue + } + + // FIPS banner emitted by Splunk CLI in FIPS-enabled environments + if strings.HasPrefix(line, "FIPS provider enabled.") { + continue + } + + // Common benign CLI warning (often appears right after the FIPS banner) + if strings.HasPrefix(line, "WARNING: Server Certificate Hostname Validation is disabled.") { + continue + } + + kept = append(kept, line) } - return true, nil + return strings.TrimSpace(strings.Join(kept, "\n")) } // triggerBundlePush triggers the bundle push operation for SHC diff --git a/pkg/splunk/enterprise/afwscheduler_test.go b/pkg/splunk/enterprise/afwscheduler_test.go index 87c5f2ba8..b64e364d6 100644 --- a/pkg/splunk/enterprise/afwscheduler_test.go +++ b/pkg/splunk/enterprise/afwscheduler_test.go @@ -2875,6 +2875,24 @@ func TestSHCRunPlaybook(t *testing.T) { t.Errorf("runPlaybook() should not have returned error or wrong bundle push state, err=%v, bundle push state=%s", err, bundlePushStateAsStr(ctx, getBundlePushState(afwPipeline))) } + // Test7b: Bundle push is still in progress when the status file has only benign Splunk CLI banner/warning + // (e.g. FIPS banner), and does not yet include the bundle push completion string. + appDeployContext.BundlePushStatus.BundlePushStage = enterpriseApi.BundlePushInProgress + mockPodExecReturnContexts[2].StdOut = "FIPS provider enabled. name: OpenSSL FIPS Provider, version: 3.0.9, buildinfo: 3.0.9, status: Success\nWARNING: Server Certificate Hostname Validation is disabled. Please see server.conf/[sslConfig]/cliVerifyServerName for details.\n" + err = playbookContext.runPlaybook(ctx) + if err != nil || getBundlePushState(afwPipeline) != enterpriseApi.BundlePushInProgress { + t.Errorf("runPlaybook() should not have returned error for banner-only output; err=%v, bundle push state=%s", err, bundlePushStateAsStr(ctx, getBundlePushState(afwPipeline))) + } + + // Test7c: Bundle push is still in progress when Splunk reports a deployment job is already running. + // This should NOT reset the state to Pending (which would retrigger and create a retry storm). + appDeployContext.BundlePushStatus.BundlePushStage = enterpriseApi.BundlePushInProgress + mockPodExecReturnContexts[2].StdOut = "FIPS provider enabled. name: OpenSSL FIPS Provider, version: 3.0.9, buildinfo: 3.0.9, status: Success\nConfDeploymentException: Can't start deployment job as one is already running!\n" + err = playbookContext.runPlaybook(ctx) + if err != nil || getBundlePushState(afwPipeline) != enterpriseApi.BundlePushInProgress { + t.Errorf("runPlaybook() should not have returned error for ConfDeploymentException already-running output; err=%v, bundle push state=%s", err, bundlePushStateAsStr(ctx, getBundlePushState(afwPipeline))) + } + // Test8: Bundle push is still in progress since stdOut != shcBundlePushCompleteStr mockPodExecReturnContexts[2].StdOut = "Error while deploying apps" err = playbookContext.runPlaybook(ctx)