From 752a67148bc0960b2b6a953410f0d359f3ad65fb Mon Sep 17 00:00:00 2001 From: Mika Ranta Date: Mon, 24 Nov 2025 09:09:43 +0200 Subject: [PATCH 1/2] fix(disks): Remove unmounting from longhorn cleanup. It is done elsewhere. --- pkg/steps.go | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/pkg/steps.go b/pkg/steps.go index 6a8ab80..4f3cbbc 100644 --- a/pkg/steps.go +++ b/pkg/steps.go @@ -225,7 +225,7 @@ var SetupAndCheckRocmStep = Step{ if strings.HasPrefix(trimmedLine, "WARNING:") { continue } - + parts := strings.Fields(line) if len(parts) > 0 { if _, err := strconv.Atoi(parts[0]); err != nil { @@ -238,7 +238,7 @@ var SetupAndCheckRocmStep = Step{ } } } - + if !validLineFound { LogMessage(Error, "rocm-smi did not return any valid GPU lines: "+string(output)) return StepResult{ @@ -1111,20 +1111,6 @@ var CleanLonghornMountsStep = Step{ return stepResult } - // Find /mnt/disk* mount points that contain longhorn-disk.cfg and unmount them - shellCmd := ` - for mount_point in /mnt/disk*; do - if [ -d "$mount_point" ] && find "$mount_point" -name "longhorn-disk.cfg" 2>/dev/null | grep -q .; then - echo "Found longhorn-disk.cfg in $mount_point, unmounting..." - sudo umount -lf "$mount_point" 2>/dev/null || true - fi - done - ` - stepResult = shellCmdHelper(shellCmd) - if stepResult.Error != nil { - return stepResult - } - // Find and unmount CSI volume mounts stepResult = shellCmdHelper("sudo umount -Af /var/lib/kubelet/pods/*/volumes/kubernetes.io~csi/pvc-* 2>/dev/null || true") if stepResult.Error != nil { From eeff1aad32292dbc29e75f5b2959d726ef1ff026 Mon Sep 17 00:00:00 2001 From: Mika Ranta Date: Mon, 24 Nov 2025 12:14:01 +0200 Subject: [PATCH 2/2] fix: validation for CLUETER_MOUNTED_DISKS --- .github/workflows/run-tests.yml | 2 +- pkg/args/args.go | 46 +++++ .../01-valid-first-node/bloom.yaml | 10 ++ .../02-valid-additional-node/bloom.yaml | 7 + .../03-error-both-disks-set/bloom.yaml | 9 + .../04-error-neither-disks-set/bloom.yaml | 8 + .../bloom.yaml | 8 + .../06-error-premounted-not-exists/bloom.yaml | 8 + .../bloom.yaml | 17 ++ .../bloom.yaml | 9 + .../09-error-invalid-ip/bloom.yaml | 7 + .../10-error-short-join-token/bloom.yaml | 7 + .../bloom.yaml | 15 ++ tests/integration/step_test.go | 161 ------------------ 14 files changed, 152 insertions(+), 162 deletions(-) create mode 100644 tests/integration/step/01_ValidateArgsStep/01-valid-first-node/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/02-valid-additional-node/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/03-error-both-disks-set/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/04-error-neither-disks-set/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/05-error-premounted-not-absolute/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/06-error-premounted-not-exists/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/07-error-premounted-bloom-tagged/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/08-error-both-disabled-and-enabled/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/09-error-invalid-ip/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/10-error-short-join-token/bloom.yaml create mode 100644 tests/integration/step/01_ValidateArgsStep/11-valid-premounted-clean-fstab/bloom.yaml delete mode 100644 tests/integration/step_test.go diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index d26709f..423e6d2 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -16,7 +16,7 @@ jobs: run: devbox run go test -v ./pkg/... - name: Run integration tests - run: devbox run go test -v ./tests/integration + run: dist/bloom test tests/integration/step/*/*/bloom.yaml - name: Run UI tests run: | diff --git a/pkg/args/args.go b/pkg/args/args.go index cf310f4..6d0b68a 100644 --- a/pkg/args/args.go +++ b/pkg/args/args.go @@ -9,6 +9,7 @@ import ( "regexp" "strings" + "github.com/silogen/cluster-bloom/pkg/mockablecmd" log "github.com/sirupsen/logrus" "github.com/spf13/viper" "gopkg.in/yaml.v3" @@ -202,6 +203,46 @@ func ValidateSkipDiskCheckConsistency(skipDiskCheckStr string) error { return nil } +// validatePremountedNotBloomManaged checks that CLUSTER_PREMOUNTED_DISKS paths are not bloom-managed in /etc/fstab +func validatePremountedNotBloomManaged(diskList []string) error { + const bloomFstabTag = "# managed by cluster-bloom" + + fstabContent, err := mockablecmd.ReadFile("ValidateArgs.ReadFstab", "/etc/fstab") + if err != nil { + // If we can't read fstab, warn but don't fail validation + log.Warnf("Could not read /etc/fstab to validate CLUSTER_PREMOUNTED_DISKS: %v", err) + return nil + } + + lines := strings.Split(string(fstabContent), "\n") + for lineNum, line := range lines { + trimmedLine := strings.TrimSpace(line) + + // Skip lines that aren't bloom-managed + if !strings.HasSuffix(trimmedLine, bloomFstabTag) { + continue + } + + // Extract mount point from bloom-managed entry + fields := strings.Fields(trimmedLine) + if len(fields) < 2 { + continue + } + + mountPoint := fields[1] + + // Check if this mount point is in CLUSTER_PREMOUNTED_DISKS + for _, disk := range diskList { + disk = strings.TrimSpace(disk) + if disk == mountPoint { + return fmt.Errorf("CLUSTER_PREMOUNTED_DISKS contains %s which is tagged '# managed by cluster-bloom' in /etc/fstab at line %d:\n %s\nPlease delete the tag from the fstab line first or use a different mount point", disk, lineNum+1, trimmedLine) + } + } + } + + return nil +} + // ValidateLonghornDisksArg validates CLUSTER_PREMOUNTED_DISKS configuration func ValidateLonghornDisksArg(disks string) error { selectedDisks := viper.GetString("CLUSTER_DISKS") @@ -222,6 +263,11 @@ func ValidateLonghornDisksArg(disks string) error { return fmt.Errorf("CLUSTER_PREMOUNTED_DISKS contains a path that does not exist: %s", disk) } } + + // Check that none of the CLUSTER_PREMOUNTED_DISKS are bloom-managed in /etc/fstab + if err := validatePremountedNotBloomManaged(diskList); err != nil { + return err + } } // Both cannot be set diff --git a/tests/integration/step/01_ValidateArgsStep/01-valid-first-node/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/01-valid-first-node/bloom.yaml new file mode 100644 index 0000000..ee38a94 --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/01-valid-first-node/bloom.yaml @@ -0,0 +1,10 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: true +GPU_NODE: true +DOMAIN: example.com +USE_CERT_MANAGER: false +CERT_OPTION: generate +CLUSTER_DISKS: /dev/nvme0n1 +RKE2_INSTALLATION_URL: https://get.rke2.io +ROCM_BASE_URL: https://repo.radeon.com/amdgpu-install/7.0.2/ubuntu/ +ROCM_DEB_PACKAGE: amdgpu-install_7.0.2.70002-1_all.deb diff --git a/tests/integration/step/01_ValidateArgsStep/02-valid-additional-node/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/02-valid-additional-node/bloom.yaml new file mode 100644 index 0000000..24c6ca9 --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/02-valid-additional-node/bloom.yaml @@ -0,0 +1,7 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: false +GPU_NODE: false +SERVER_IP: 192.168.1.100 +JOIN_TOKEN: K107c7e3e6e3e6e3e6e3e6e3e6e3e6e3e6e3e6::server:abc123def456 +CLUSTER_DISKS: /dev/nvme0n1 +RKE2_INSTALLATION_URL: https://get.rke2.io diff --git a/tests/integration/step/01_ValidateArgsStep/03-error-both-disks-set/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/03-error-both-disks-set/bloom.yaml new file mode 100644 index 0000000..d87daf5 --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/03-error-both-disks-set/bloom.yaml @@ -0,0 +1,9 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: true +DOMAIN: example.com +USE_CERT_MANAGER: false +CERT_OPTION: generate +CLUSTER_DISKS: /dev/nvme0n1 +CLUSTER_PREMOUNTED_DISKS: /tmp + +expected_error: "CLUSTER_PREMOUNTED_DISKS and CLUSTER_DISKS cannot both be set" diff --git a/tests/integration/step/01_ValidateArgsStep/04-error-neither-disks-set/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/04-error-neither-disks-set/bloom.yaml new file mode 100644 index 0000000..55e702c --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/04-error-neither-disks-set/bloom.yaml @@ -0,0 +1,8 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: true +DOMAIN: example.com +USE_CERT_MANAGER: false +CERT_OPTION: generate +NO_DISKS_FOR_CLUSTER: false + +expected_error: "either CLUSTER_PREMOUNTED_DISKS or CLUSTER_DISKS must be set" diff --git a/tests/integration/step/01_ValidateArgsStep/05-error-premounted-not-absolute/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/05-error-premounted-not-absolute/bloom.yaml new file mode 100644 index 0000000..c50341d --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/05-error-premounted-not-absolute/bloom.yaml @@ -0,0 +1,8 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: true +DOMAIN: example.com +USE_CERT_MANAGER: false +CERT_OPTION: generate +CLUSTER_PREMOUNTED_DISKS: mnt/disk0 + +expected_error: "CLUSTER_PREMOUNTED_DISKS contains a non-absolute path: mnt/disk0" diff --git a/tests/integration/step/01_ValidateArgsStep/06-error-premounted-not-exists/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/06-error-premounted-not-exists/bloom.yaml new file mode 100644 index 0000000..1454b58 --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/06-error-premounted-not-exists/bloom.yaml @@ -0,0 +1,8 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: true +DOMAIN: example.com +USE_CERT_MANAGER: false +CERT_OPTION: generate +CLUSTER_PREMOUNTED_DISKS: /mnt/nonexistent + +expected_error: "CLUSTER_PREMOUNTED_DISKS contains a path that does not exist: /mnt/nonexistent" diff --git a/tests/integration/step/01_ValidateArgsStep/07-error-premounted-bloom-tagged/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/07-error-premounted-bloom-tagged/bloom.yaml new file mode 100644 index 0000000..0bbb8d6 --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/07-error-premounted-bloom-tagged/bloom.yaml @@ -0,0 +1,17 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: true +DOMAIN: example.com +USE_CERT_MANAGER: false +CERT_OPTION: generate +CLUSTER_PREMOUNTED_DISKS: /tmp + +mocks: + # Mock /etc/fstab with bloom-tagged entry + ValidateArgs.ReadFstab: + output: | + # /etc/fstab: static file system information. + UUID=12345 / ext4 errors=remount-ro 0 1 + UUID=abc123 /tmp ext4 defaults,nofail 0 2 # managed by cluster-bloom + UUID=def456 /mnt/disk1 ext4 defaults,nofail 0 2 + +expected_error: "CLUSTER_PREMOUNTED_DISKS contains /tmp which is tagged '# managed by cluster-bloom' in /etc/fstab at line 3" diff --git a/tests/integration/step/01_ValidateArgsStep/08-error-both-disabled-and-enabled/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/08-error-both-disabled-and-enabled/bloom.yaml new file mode 100644 index 0000000..4f393b9 --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/08-error-both-disabled-and-enabled/bloom.yaml @@ -0,0 +1,9 @@ +ENABLED_STEPS: ValidateArgsStep +DISABLED_STEPS: SetupRKE2Step +FIRST_NODE: true +DOMAIN: example.com +USE_CERT_MANAGER: false +CERT_OPTION: generate +CLUSTER_DISKS: /dev/nvme0n1 + +expected_error: "DISABLED_STEPS and ENABLED_STEPS cannot both be set" diff --git a/tests/integration/step/01_ValidateArgsStep/09-error-invalid-ip/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/09-error-invalid-ip/bloom.yaml new file mode 100644 index 0000000..bb22755 --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/09-error-invalid-ip/bloom.yaml @@ -0,0 +1,7 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: false +SERVER_IP: not-an-ip +JOIN_TOKEN: K107c7e3e6e3e6e3e6e3e6e3e6e3e6e3e6e3e6::server:abc123def456 +CLUSTER_DISKS: /dev/nvme0n1 + +expected_error: "invalid IP address" diff --git a/tests/integration/step/01_ValidateArgsStep/10-error-short-join-token/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/10-error-short-join-token/bloom.yaml new file mode 100644 index 0000000..7fda072 --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/10-error-short-join-token/bloom.yaml @@ -0,0 +1,7 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: false +SERVER_IP: 192.168.1.100 +JOIN_TOKEN: tooshort +CLUSTER_DISKS: /dev/nvme0n1 + +expected_error: "JOIN_TOKEN is too short" diff --git a/tests/integration/step/01_ValidateArgsStep/11-valid-premounted-clean-fstab/bloom.yaml b/tests/integration/step/01_ValidateArgsStep/11-valid-premounted-clean-fstab/bloom.yaml new file mode 100644 index 0000000..0f300e3 --- /dev/null +++ b/tests/integration/step/01_ValidateArgsStep/11-valid-premounted-clean-fstab/bloom.yaml @@ -0,0 +1,15 @@ +ENABLED_STEPS: ValidateArgsStep +FIRST_NODE: true +DOMAIN: example.com +USE_CERT_MANAGER: false +CERT_OPTION: generate +CLUSTER_PREMOUNTED_DISKS: /tmp + +mocks: + # Mock /etc/fstab WITHOUT bloom-tagged entries for /tmp + ValidateArgs.ReadFstab: + output: | + # /etc/fstab: static file system information. + UUID=12345 / ext4 errors=remount-ro 0 1 + UUID=abc123 /tmp ext4 defaults,nofail 0 2 + UUID=def456 /mnt/disk1 ext4 defaults,nofail 0 2 # managed by cluster-bloom diff --git a/tests/integration/step_test.go b/tests/integration/step_test.go deleted file mode 100644 index 1d2b58d..0000000 --- a/tests/integration/step_test.go +++ /dev/null @@ -1,161 +0,0 @@ -/** - * Copyright 2025 Advanced Micro Devices, Inc. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -**/ - -package integration - -import ( - "io" - "log" - "os" - "path/filepath" - "strings" - "testing" - - "github.com/silogen/cluster-bloom/pkg" - "github.com/silogen/cluster-bloom/pkg/mockablecmd" - logrus "github.com/sirupsen/logrus" - "github.com/spf13/viper" - "gopkg.in/yaml.v2" -) - -// TestStepIntegration tests step execution with various configurations -func TestStepIntegration(t *testing.T) { - // Silence logs during tests unless LOG_LEVEL is set - if os.Getenv("LOG_LEVEL") == "" { - logrus.SetOutput(io.Discard) - log.SetOutput(io.Discard) - } - - // Find all test cases in step subdirectories - testFiles, err := filepath.Glob("step/**/*/bloom.yaml") - if err != nil { - t.Fatalf("Failed to find test files: %v", err) - } - - if len(testFiles) == 0 { - t.Skip("No integration test cases found") - } - - for _, testFile := range testFiles { - testName := filepath.Dir(testFile) - t.Run(testName, func(t *testing.T) { - runStepTest(t, testFile) - }) - } -} - -func runStepTest(t *testing.T, testCaseFile string) { - t.Logf("Running integration test: %s", testCaseFile) - - // Reset state - mockablecmd.ResetMocks() - viper.Reset() - - // Read test config - configData, err := os.ReadFile(testCaseFile) - if err != nil { - t.Fatalf("Failed to read test config: %v", err) - } - - // Parse config - var rawConfig map[string]interface{} - if err := yaml.Unmarshal(configData, &rawConfig); err != nil { - t.Fatalf("Failed to parse config: %v", err) - } - - // Load into viper - for k, v := range rawConfig { - viper.Set(k, v) - } - - // Load mocks - mockablecmd.LoadMocks() - - // Get enabled steps - enabledSteps := viper.GetString("ENABLED_STEPS") - if enabledSteps == "" { - t.Fatal("No ENABLED_STEPS specified in test config") - } - - t.Logf("Testing step: %s", enabledSteps) - - // Execute the step based on name - var stepErr error - switch enabledSteps { - case "PrepareLonghornDisksStep": - stepErr = runPrepareLonghornDisksStep(t) - default: - t.Fatalf("Unknown step: %s", enabledSteps) - } - - // Check for expected error - expectedError, hasExpectedError := rawConfig["expected_error"] - if hasExpectedError { - expectedErrStr := expectedError.(string) - if stepErr == nil { - t.Errorf("Expected error containing '%s' but step succeeded", expectedErrStr) - } else if !strings.Contains(stepErr.Error(), expectedErrStr) { - t.Errorf("Expected error containing '%s' but got '%s'", expectedErrStr, stepErr.Error()) - } else { - t.Logf("✅ Got expected error: %s", stepErr.Error()) - } - } else { - if stepErr != nil { - t.Errorf("Step failed: %v", stepErr) - } else { - t.Logf("✅ Step completed successfully") - } - } -} - -func runPrepareLonghornDisksStep(t *testing.T) error { - // Get cluster disks - disksStr := viper.GetString("CLUSTER_DISKS") - if disksStr == "" { - t.Log("No CLUSTER_DISKS specified") - return nil - } - - // Split and trim disks - diskParts := strings.Split(disksStr, ",") - var disks []string - for _, d := range diskParts { - d = strings.TrimSpace(d) - if d != "" { - disks = append(disks, d) - } - } - t.Logf("Processing %d disk(s): %v", len(disks), disks) - - // Mount drives - mountedMap, err := pkg.MountDrives(disks) - if err != nil { - return err - } - - t.Logf("Mounted %d disk(s)", len(mountedMap)) - - // Persist mounts - if err := pkg.PersistMountedDisks(mountedMap); err != nil { - return err - } - - // Skip GenerateNodeLabels as it requires actual RKE2 config file - // In integration tests, we're only testing the disk mounting logic - t.Logf("✅ Disks mounted and persisted successfully") - - return nil -}