diff --git a/.github/workflows/qemu-disk-test.sh b/.github/workflows/qemu-disk-test.sh
new file mode 100644
index 0000000..7f17007
--- /dev/null
+++ b/.github/workflows/qemu-disk-test.sh
@@ -0,0 +1,381 @@
+#!/bin/bash
+#
+# Boot a throwaway Ubuntu 24.04 QEMU/KVM VM with 8 NVMe drives, copy the bloom
+# binary and its config files into it, run "bloom test" inside the VM and save
+# the test output on the host.
+#
+# Usage:
+#   qemu-disk-test.sh <vm-name> <bloom-binary> <bloom-yaml> [additional-yaml-paths...]
+#
+# Output:
+#   <vm-name>-test-results.yaml - stdout of "bloom test" (also echoed to stdout)
+#
+# Side effects:
+#   - kills every qemu-system-x86_64 process running on the host
+#   - deletes and recreates the <vm-name> working directory
+#   - caches the Ubuntu cloud image under /home/ubuntu/ci
+set -euo pipefail
+
+# Check if required arguments are provided
+if [ $# -lt 3 ]; then
+  echo "ERROR: Insufficient arguments" >&2
+  echo "Usage: $0 <vm-name> <bloom-binary> <bloom-yaml> [additional-yaml-paths...]" >&2
+  echo "Example: $0 nvme-test-vm ./cluster-bloom ./test/bloom.yaml ./test/bloom2.yaml" >&2
+  exit 1
+fi
+
+VM_NAME="$1"
+BLOOM_BINARY="$2"
+shift 2
+BLOOM_CONFIGS=("$@")
+RESULTS_FILE="$VM_NAME-test-results.yaml"
+
+# VM_NAME is later passed to "rm -rf", so never accept an empty value.
+if [ -z "$VM_NAME" ]; then
+  echo "ERROR: VM name must not be empty" >&2
+  exit 1
+fi
+
+# Validate the inputs up front so that a typo fails fast instead of after a
+# VM has already been created and booted.
+if [ ! -f "$BLOOM_BINARY" ]; then
+  echo "ERROR: Bloom binary not found at $BLOOM_BINARY" >&2
+  exit 1
+fi
+
+for config in "${BLOOM_CONFIGS[@]}"; do
+  if [ ! -f "$config" ]; then
+    echo "ERROR: Config file not found at $config" >&2
+    exit 1
+  fi
+done
+
+echo "Setting up QEMU VM '$VM_NAME' with 8 NVMe drives (Linux KVM - Clean Setup)..."
+
+# Check dependencies
+if ! command -v qemu-system-x86_64 &> /dev/null; then
+  echo "ERROR: QEMU not found." >&2
+  exit 1
+fi
+
+# Either mkisofs or genisoimage can build the cloud-init seed image.
+if command -v mkisofs &> /dev/null; then
+  ISO_TOOL=mkisofs
+elif command -v genisoimage &> /dev/null; then
+  ISO_TOOL=genisoimage
+else
+  echo "ERROR: mkisofs not found." >&2
+  exit 1
+fi
+
+# Kill any existing QEMU processes
+echo "Cleaning up any existing QEMU processes..."
+if killall qemu-system-x86_64 2>/dev/null; then
+  echo "✓ Killed existing QEMU"
+fi
+sleep 2
+
+# Completely remove and recreate working directory
+echo "Creating fresh working directory..."
+rm -rf -- "$VM_NAME"
+mkdir -p -- "$VM_NAME"
+
+# Download or copy Ubuntu 24.04 AMD64 cloud image
+CI_IMAGE_CACHE="/home/ubuntu/ci/noble-server-cloudimg-amd64.img"
+if [ ! -f noble-server-cloudimg-amd64.img ]; then
+  if [ ! -f "$CI_IMAGE_CACHE" ]; then
+    echo "Downloading Ubuntu 24.04 AMD64 cloud image to cache (~700MB)..."
+    mkdir -p "$(dirname "$CI_IMAGE_CACHE")"
+    # -f makes curl fail on HTTP errors, and the download goes to a temporary
+    # name first, so a failed or interrupted transfer never ends up in the
+    # cache looking like a valid image.
+    curl -f -L -s -o "$CI_IMAGE_CACHE.part" \
+      https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img
+    mv "$CI_IMAGE_CACHE.part" "$CI_IMAGE_CACHE"
+  fi
+  echo "Copying Ubuntu cloud image from cache..."
+  cp "$CI_IMAGE_CACHE" noble-server-cloudimg-amd64.img
+fi
+
+# Copy OVMF VARS for writable UEFI variables
+echo "Setting up UEFI firmware..."
+cp /usr/share/OVMF/OVMF_VARS.fd "$VM_NAME/"
+
+# Create OS disk (100GB)
+echo "Creating OS disk..."
+qemu-img create -f qcow2 -F qcow2 -b "$(pwd)/noble-server-cloudimg-amd64.img" "$VM_NAME/os-disk.qcow2" 100G
+
+# Create 8 NVMe disk images (1MB each)
+echo "Creating 8 NVMe disk images..."
+for i in {0..7}; do
+  qemu-img create -f raw "$VM_NAME/nvme${i}.img" 1M
+done
+
+# Create cloud-init configuration with proper user setup
+echo "Creating cloud-init configuration..."
+mkdir -p "$VM_NAME/seed-content"
+
+# Generate SSH key if it doesn't exist
+if [ ! -f "$VM_NAME/qemu-login" ]; then
+  echo "Generating SSH key (qemu-login)..."
+  ssh-keygen -t rsa -b 4096 -f "$VM_NAME/qemu-login" -N ""
+fi
+
+# Unquoted heredoc: the public key is expanded here on the host, while \$(date)
+# is escaped so that it is evaluated inside the guest at boot time.
+cat > "$VM_NAME/seed-content/user-data" << EOF
+#cloud-config
+
+# Enable password authentication (as fallback)
+ssh_pwauth: True
+disable_root: false
+
+# Create ubuntu user with SSH key
+users:
+  - name: ubuntu
+    plain_text_passwd: ubuntu
+    lock_passwd: false
+    sudo: ALL=(ALL) NOPASSWD:ALL
+    shell: /bin/bash
+    groups: [users, admin, sudo]
+    ssh_authorized_keys:
+      - $(cat "$VM_NAME/qemu-login.pub")
+
+# Set password explicitly (fallback)
+chpasswd:
+  list: |
+    ubuntu:ubuntu
+  expire: False
+
+# Run commands after boot
+runcmd:
+  - sleep 10
+  - echo "ubuntu:ubuntu" | chpasswd
+  - echo "System ready at \$(date)" > /home/ubuntu/boot-complete.txt
+  - echo "" >> /home/ubuntu/boot-complete.txt
+  - echo "=== NVMe Devices ===" >> /home/ubuntu/boot-complete.txt
+  - lsblk >> /home/ubuntu/boot-complete.txt
+  - echo "" >> /home/ubuntu/boot-complete.txt
+  - echo "=== Device List ===" >> /home/ubuntu/boot-complete.txt
+  - ls -l /dev/nvme* >> /home/ubuntu/boot-complete.txt 2>&1 || echo "No /dev/nvme* found" >> /home/ubuntu/boot-complete.txt
+  - chown ubuntu:ubuntu /home/ubuntu/boot-complete.txt
+
+final_message: "Cloud-init complete! System is ready."
+EOF
+
+cat > "$VM_NAME/seed-content/meta-data" << EOF
+instance-id: $VM_NAME-001
+local-hostname: $VM_NAME
+EOF
+
+# Create ISO seed image
+echo "Creating cloud-init seed ISO..."
+# The tool's chatter goes to a log file instead of /dev/null so that a failure
+# can still be diagnosed.
+if ! "$ISO_TOOL" -output "$VM_NAME/seed.img" -volid cidata -joliet -rock \
+  "$VM_NAME/seed-content/user-data" "$VM_NAME/seed-content/meta-data" \
+  2> "$VM_NAME/seed-iso.log"; then
+  echo "ERROR: $ISO_TOOL failed to create the seed image:" >&2
+  cat "$VM_NAME/seed-iso.log" >&2
+  exit 1
+fi
+
+# Create startup script
+cat > "$VM_NAME/start-vm.sh" << STARTEOF
+#!/bin/bash
+SCRIPT_DIR="\$(cd "\$(dirname "\$0")" && pwd)"
+cd "\$SCRIPT_DIR"
+
+echo "Starting x86_64 VM with 8 NVMe devices in background..."
+echo "Output will be logged to \$SCRIPT_DIR/startup.log"
+echo "Wait ~90 seconds for cloud-init to complete."
+echo ""
+echo "To monitor boot progress:"
+echo "  tail -f \$SCRIPT_DIR/startup.log"
+echo ""
+echo "To connect via SSH:"
+echo "  \$SCRIPT_DIR/ssh-vm.sh"
+echo ""
+
+qemu-system-x86_64 \
+  -machine q35,accel=kvm \
+  -cpu host \
+  -smp 2 \
+  -m 10G \
+  -drive if=pflash,format=raw,readonly=on,file=/usr/share/OVMF/OVMF_CODE.fd \
+  -drive if=pflash,format=raw,file="\$SCRIPT_DIR/OVMF_VARS.fd" \
+  -drive file=os-disk.qcow2,if=virtio,format=qcow2 \
+  -drive file=seed.img,if=virtio,format=raw \
+  -drive file=nvme0.img,if=none,id=nvme0,format=raw \
+  -device nvme,serial=NVME000001,drive=nvme0 \
+  -drive file=nvme1.img,if=none,id=nvme1,format=raw \
+  -device nvme,serial=NVME000002,drive=nvme1 \
+  -drive file=nvme2.img,if=none,id=nvme2,format=raw \
+  -device nvme,serial=NVME000003,drive=nvme2 \
+  -drive file=nvme3.img,if=none,id=nvme3,format=raw \
+  -device nvme,serial=NVME000004,drive=nvme3 \
+  -drive file=nvme4.img,if=none,id=nvme4,format=raw \
+  -device nvme,serial=NVME000005,drive=nvme4 \
+  -drive file=nvme5.img,if=none,id=nvme5,format=raw \
+  -device nvme,serial=NVME000006,drive=nvme5 \
+  -drive file=nvme6.img,if=none,id=nvme6,format=raw \
+  -device nvme,serial=NVME000007,drive=nvme6 \
+  -drive file=nvme7.img,if=none,id=nvme7,format=raw \
+  -device nvme,serial=NVME000008,drive=nvme7 \
+  -netdev user,id=net0,hostfwd=tcp::2222-:22 \
+  -device virtio-net-pci,netdev=net0 \
+  -nographic > "\$SCRIPT_DIR/startup.log" 2>&1 &
+
+VM_PID=\$!
+echo "VM started with PID \$VM_PID"
+echo "Waiting for login prompt (timeout: 2 minutes)..."
+
+elapsed=0
+while [ \$elapsed -lt 120 ]; do
+  if grep -q "$VM_NAME login:" "\$SCRIPT_DIR/startup.log" 2>/dev/null; then
+    echo "✓ VM is ready! (login prompt found after \${elapsed}s)"
+    exit 0
+  fi
+  # Stop waiting as soon as QEMU itself has died (e.g. KVM unavailable).
+  if ! kill -0 "\$VM_PID" 2>/dev/null; then
+    echo "ERROR: QEMU exited unexpectedly. Last lines of startup.log:" >&2
+    tail -n 20 "\$SCRIPT_DIR/startup.log" >&2
+    exit 1
+  fi
+  sleep 2
+  elapsed=\$((elapsed + 2))
+done
+
+echo "✓ Timeout reached (2 minutes). VM may still be booting."
+echo "Check logs: tail -f \$SCRIPT_DIR/startup.log"
+STARTEOF
+
+chmod +x "$VM_NAME/start-vm.sh"
+
+# Create stop script
+cat > "$VM_NAME/stop-vm.sh" << 'EOF'
+#!/bin/bash
+killall qemu-system-x86_64
+EOF
+
+chmod +x "$VM_NAME/stop-vm.sh"
+
+# Create SSH helper script
+cat > "$VM_NAME/ssh-vm.sh" << EOF
+#!/bin/bash
+cd "\$(dirname "\$0")"
+ssh -i qemu-login -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -p 2222 ubuntu@localhost
+EOF
+
+chmod +x "$VM_NAME/ssh-vm.sh"
+
+echo ""
+echo "=========================================="
+echo "✓ Clean Setup Complete!"
+echo "=========================================="
+echo ""
+echo "Step 1: Start the VM"
+echo "  cd $VM_NAME && ./start-vm.sh"
+echo ""
+echo "Step 2: Wait ~90 seconds for boot + cloud-init"
+echo ""
+echo "Step 3: In a NEW TERMINAL, SSH to the VM:"
+echo "  cd $VM_NAME && ./ssh-vm.sh"
+echo "  (passwordless SSH with qemu-login key)"
+echo ""
+echo "Step 4: Inside VM, verify NVMe devices:"
+echo "  lsblk"
+echo "  ls -l /dev/nvme*"
+echo "  cat ~/boot-complete.txt"
+echo ""
+echo "To stop the VM:"
+echo "  cd $VM_NAME && ./stop-vm.sh"
+echo "  or press Ctrl+A then X in the console"
+echo ""
+
+# From here on a VM may be running: make sure it is stopped on every exit
+# path. The working directory is left in place on failure so startup.log can
+# be inspected.
+cleanup() {
+  killall qemu-system-x86_64 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# Start the VM automatically
+echo "Starting the VM..."
+bash "$VM_NAME/start-vm.sh"
+
+SSH_OPTS=(-i "$VM_NAME/qemu-login" -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no)
+SSH_TARGET="ubuntu@localhost"
+
+# A login prompt does not guarantee that sshd is up and cloud-init has
+# installed the key, so poll until a trivial SSH command succeeds.
+echo "Waiting for SSH to become available..."
+ssh_ready=false
+for _ in {1..60}; do
+  if ssh "${SSH_OPTS[@]}" -o BatchMode=yes -o ConnectTimeout=5 -p 2222 "$SSH_TARGET" true 2>/dev/null; then
+    ssh_ready=true
+    break
+  fi
+  sleep 5
+done
+
+if [ "$ssh_ready" != true ]; then
+  echo "ERROR: VM did not accept SSH connections in time" >&2
+  echo "Check logs: $VM_NAME/startup.log" >&2
+  exit 1
+fi
+
+echo ""
+echo "Copying bloom binary and config files to VM..."
+
+# Build file list for scp - bloom binary first, then all config files. An
+# array keeps paths containing spaces intact.
+# NOTE: everything is copied flat into the remote home directory, so configs
+# from different directories that share a file name overwrite each other.
+FILES_TO_COPY=("$BLOOM_BINARY" "${BLOOM_CONFIGS[@]}")
+
+# Copy all files at once
+scp "${SSH_OPTS[@]}" -P 2222 "${FILES_TO_COPY[@]}" "${SSH_TARGET}:~/"
+
+echo "Files copied successfully"
+echo "Making bloom executable and running test..."
+
+# Build the bloom test command with all config files
+BLOOM_BINARY_NAME=$(basename "$BLOOM_BINARY")
+CONFIG_NAMES=()
+for config in "${BLOOM_CONFIGS[@]}"; do
+  CONFIG_NAMES+=("$(basename "$config")")
+done
+
+# ssh joins its arguments and hands them to the remote shell, so quote every
+# word with %q to keep unusual file names intact.
+REMOTE_CHMOD_CMD=$(printf '%q ' chmod +x "./$BLOOM_BINARY_NAME")
+REMOTE_TEST_CMD=$(printf '%q ' sudo "./$BLOOM_BINARY_NAME" test "${CONFIG_NAMES[@]}")
+
+# Make bloom executable and run the test
+ssh "${SSH_OPTS[@]}" -p 2222 "$SSH_TARGET" "$REMOTE_CHMOD_CMD"
+echo "Running: sudo ./$BLOOM_BINARY_NAME test ${CONFIG_NAMES[*]}"
+# Pass/fail is decided from the results file by the caller, so a non-zero exit
+# status of the test is reported but does not abort the cleanup below.
+test_status=0
+ssh "${SSH_OPTS[@]}" -p 2222 "$SSH_TARGET" "$REMOTE_TEST_CMD" | tee "$RESULTS_FILE" || test_status=$?
+if [ "$test_status" -ne 0 ]; then
+  echo "WARNING: test command exited with status $test_status" >&2
+fi
+
+echo ""
+echo "Test execution completed"
+echo "Results saved to: $RESULTS_FILE"
+
+# Clean up VM
+echo ""
+echo "Cleaning up VM..."
+bash "$VM_NAME/stop-vm.sh" || killall qemu-system-x86_64 2>/dev/null || true
+sleep 2
+
+echo "Removing $VM_NAME directory..."
+rm -rf "$VM_NAME" + +echo "" +echo "✓ VM deleted and cleaned up" diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 7705640..d088655 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -12,8 +12,22 @@ jobs: - name: Build with devbox run: devbox run build + - name: Run unit tests + run: devbox run go test -v ./pkg + - name: Run step integration tests - run: dist/bloom test integration_tests/step/*/bloom.yaml + run: dist/bloom test integration_tests/step/*/*/bloom.yaml + + - name: QEMU Disk Test + run: | + bash .github/workflows/qemu-disk-test.sh nvme-test-vm dist/bloom /home/ubuntu/ci/bloom.yaml /home/ubuntu/ci/bloom.yaml - - name: Deploy - run: sudo ./dist/bloom test /home/ubuntu/ci/bloom.yaml + - name: Check Test Results + run: | + SUCCESS=$(yq '.overall_summary.success' nvme-test-vm-test-results.yaml) + if [ "$SUCCESS" != "true" ]; then + echo "❌ Tests failed - check nvme-test-vm-test-results.yaml for details" + cat nvme-test-vm-test-results.yaml + exit 1 + fi + echo "✅ All tests passed" diff --git a/cmd/root.go b/cmd/root.go index d61ffac..e45142b 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -204,8 +204,8 @@ func rootSteps() []pkg.Step { pkg.SetupMetallbStep, pkg.CreateMetalLBConfigStep, pkg.CreateDomainConfigStep, - pkg.CreateBloomConfigMapStepFunc(Version), pkg.WaitForClusterReady, + pkg.CreateBloomConfigMapStepFunc(Version), pkg.SetupClusterForgeStep, } diff --git a/cmd/test.go b/cmd/test.go index 2045e4e..2a01c1c 100644 --- a/cmd/test.go +++ b/cmd/test.go @@ -49,6 +49,8 @@ func testSteps(configFiles []string) { os.Exit(1) } + overallStartTime := time.Now() + fmt.Println("---") fmt.Printf("total_configs: %d\n", len(configFiles)) fmt.Println("test_runs:") @@ -69,11 +71,15 @@ func testSteps(configFiles []string) { } } + overallDuration := time.Since(overallStartTime) + // Print overall summary fmt.Println("overall_summary:") fmt.Printf(" total: %d\n", len(configFiles)) fmt.Printf(" passed: 
%d\n", passedCount) fmt.Printf(" failed: %d\n", failedCount) + fmt.Printf(" duration_ms: %d\n", overallDuration.Milliseconds()) + fmt.Printf(" duration: %v\n", overallDuration.Round(time.Millisecond)) if len(failedConfigs) > 0 { fmt.Println(" failed_configs:") for _, config := range failedConfigs { @@ -126,6 +132,7 @@ func runTestConfig(configFile string, configIdx int) bool { var finalErr error var completedSteps []string var failedStep string + configStartTime := time.Now() for i, step := range enabledSteps { fmt.Printf(" - id: %s\n", step.Id) @@ -156,7 +163,7 @@ func runTestConfig(configFile string, configIdx int) bool { pkg.LogMessage(pkg.Error, fmt.Sprintf("Execution failed: %v", result.Error)) break } else if !skipped { - completedSteps = append(completedSteps, step.Name) + completedSteps = append(completedSteps, step.Id) fmt.Println(" status: completed") if result.Message != "" { fmt.Printf(" message: \"%s\"\n", result.Message) @@ -169,10 +176,14 @@ func runTestConfig(configFile string, configIdx int) bool { time.Sleep(500 * time.Millisecond) } + configDuration := time.Since(configStartTime) + // Print summary for this config fmt.Println(" summary:") fmt.Printf(" total: %d\n", len(enabledSteps)) fmt.Printf(" completed: %d\n", len(completedSteps)) + fmt.Printf(" duration_ms: %d\n", configDuration.Milliseconds()) + fmt.Printf(" duration: %v\n", configDuration.Round(time.Millisecond)) // Determine success based on expected error success := false @@ -200,8 +211,8 @@ func runTestConfig(configFile string, configIdx int) bool { if len(completedSteps) > 0 { fmt.Println(" completed_steps:") - for _, stepName := range completedSteps { - fmt.Printf(" - %s\n", stepName) + for _, stepId := range completedSteps { + fmt.Printf(" - %s\n", stepId) } } diff --git a/pkg/clusterforge.tar.gz b/pkg/clusterforge.tar.gz new file mode 100644 index 0000000..e69de29 diff --git a/pkg/disks_test.go b/pkg/disks_test.go index 0992621..65bbc62 100644 --- a/pkg/disks_test.go +++ 
b/pkg/disks_test.go @@ -45,23 +45,6 @@ func TestGenerateNodeLabels(t *testing.T) { } viper.Set("CLUSTER_PREMOUNTED_DISKS", "") }) - - t.Run("with NO_DISKS_FOR_CLUSTER", func(t *testing.T) { - viper.Set("NO_DISKS_FOR_CLUSTER", true) - err := GenerateNodeLabels(map[string]string{}) - if err != nil { - t.Errorf("Expected no error with NO_DISKS_FOR_CLUSTER, got: %v", err) - } - viper.Set("NO_DISKS_FOR_CLUSTER", false) - }) - - t.Run("with no cluster disks", func(t *testing.T) { - viper.Set("CLUSTER_DISKS", []string{}) - err := GenerateNodeLabels(map[string]string{}) - if err != nil { - t.Errorf("Expected no error with empty disk list, got: %v", err) - } - }) } func TestIsVirtualDisk(t *testing.T) { @@ -179,4 +162,3 @@ func TestAppendToFile(t *testing.T) { t.Errorf("Expected %s, got %s", content, string(data)) } } - diff --git a/pkg/rke2_test.go b/pkg/rke2_test.go index 6f0983e..6989d5d 100644 --- a/pkg/rke2_test.go +++ b/pkg/rke2_test.go @@ -57,8 +57,8 @@ func TestPrepareRKE2(t *testing.T) { } tests := []struct { - name string - oidcURL string + name string + oidcURL string }{ {"without OIDC", ""}, {"with invalid OIDC", "invalid-url"}, @@ -110,7 +110,7 @@ func TestSetupRKE2Additional(t *testing.T) { viper.Set("SERVER_IP", tt.serverIP) viper.Set("JOIN_TOKEN", tt.joinToken) viper.Set("RKE2_INSTALLATION_URL", "https://get.rke2.io") - + err := SetupRKE2Additional() if tt.expectError && err == nil { t.Errorf("Expected error but got none") @@ -165,12 +165,12 @@ func TestRKE2ConfigContent(t *testing.T) { func TestOIDCConfigTemplate(t *testing.T) { config := oidcConfigTemplate - + if !strings.Contains(config, "--oidc-issuer-url=%s") { t.Errorf("Expected oidcConfigTemplate to contain '--oidc-issuer-url=%%s'") } - if !strings.Contains(config, "--oidc-client-id=rke-clusters") { - t.Errorf("Expected oidcConfigTemplate to contain '--oidc-client-id=rke-clusters'") + if !strings.Contains(config, "--oidc-client-id=k8s") { + t.Errorf("Expected oidcConfigTemplate to contain 
'--oidc-client-id=k8s'") } if !strings.Contains(config, "--oidc-username-claim=preferred_username") { t.Errorf("Expected oidcConfigTemplate to contain '--oidc-username-claim=preferred_username'") @@ -178,4 +178,4 @@ func TestOIDCConfigTemplate(t *testing.T) { if !strings.Contains(config, "--oidc-groups-claim=groups") { t.Errorf("Expected oidcConfigTemplate to contain '--oidc-groups-claim=groups'") } -} \ No newline at end of file +} diff --git a/pkg/steps.go b/pkg/steps.go index 99c9c2d..c9cf9a1 100644 --- a/pkg/steps.go +++ b/pkg/steps.go @@ -286,6 +286,13 @@ var UpdateModprobeStep = Step{ Id: "UpdateModprobeStep", Name: "Update Modprobe", Description: "Update Modprobe to unblacklist amdgpu", + Skip: func() bool { + if !viper.GetBool("GPU_NODE") { + LogMessage(Info, "Skipping modprobe update for non-GPU node") + return true + } + return false + }, Action: func() StepResult { err := updateModprobe() if err != nil { @@ -714,7 +721,6 @@ func CreateBloomConfigMapStepFunc(version string) Step { if viper.GetBool("FIRST_NODE") { LogMessage(Info, "Waiting for cluster to be ready...") - time.Sleep(10 * time.Second) err := CreateConfigMap(version) if err != nil { LogMessage(Error, fmt.Sprintf("Failed to create bloom ConfigMap: %v", err)) diff --git a/pkg/steps_test.go b/pkg/steps_test.go index 841ca35..373abbc 100644 --- a/pkg/steps_test.go +++ b/pkg/steps_test.go @@ -103,6 +103,9 @@ func TestStepExecution(t *testing.T) { }) t.Run("InotifyInstancesStep", func(t *testing.T) { + if os.Getuid() != 0 { + t.Skip("Skipping test that requires root privileges") + } result := InotifyInstancesStep.Action() if result.Error == nil && result.Message == "" { // Result structure is valid @@ -112,21 +115,20 @@ func TestStepExecution(t *testing.T) { func TestSetupAndCheckRocmStep(t *testing.T) { viper.Set("GPU_NODE", false) - result := SetupAndCheckRocmStep.Action() - if result.Error != nil { - t.Errorf("Expected no error for non-GPU node, got: %v", result.Error) + if 
!SetupAndCheckRocmStep.Skip() { + result := SetupAndCheckRocmStep.Action() + if result.Error != nil { + t.Errorf("Expected no error for non-GPU node, got: %v", result.Error) + } } - - viper.Set("GPU_NODE", true) - result = SetupAndCheckRocmStep.Action() - // Result depends on system state } func TestPrepareLonghornDisksStep(t *testing.T) { - viper.Set("CLUSTER_DISKS", []string{}) + viper.Reset() + viper.Set("NO_DISKS_FOR_CLUSTER", true) result := PrepareLonghornDisksStep.Action() if result.Error != nil { - t.Errorf("Expected no error with empty disk list, got: %v", result.Error) + t.Errorf("Expected no error with NO_DISKS_FOR_CLUSTER=true, got: %v", result.Error) } } @@ -148,17 +150,21 @@ func TestSetupLonghornStep(t *testing.T) { func TestCreateMetalLBConfigStep(t *testing.T) { viper.Set("FIRST_NODE", false) - result := CreateMetalLBConfigStep.Action() - if result.Error != nil { - t.Errorf("Expected no error for non-first node, got: %v", result.Error) + if !CreateMetalLBConfigStep.Skip() { + result := CreateMetalLBConfigStep.Action() + if result.Error != nil { + t.Errorf("Expected no error for non-first node, got: %v", result.Error) + } } } func TestSetupKubeConfig(t *testing.T) { viper.Set("FIRST_NODE", false) - result := SetupKubeConfig.Action() - if result.Error != nil { - t.Errorf("Expected no error for non-first node, got: %v", result.Error) + if !SetupKubeConfig.Skip() { + result := SetupKubeConfig.Action() + if result.Error != nil { + t.Errorf("Expected no error for non-first node, got: %v", result.Error) + } } }