From 8dda7320ff04d2c2179e019db03ccfbc5aaf5627 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 15 Jan 2026 22:20:34 +0200 Subject: [PATCH 1/3] docs: Longhorn setup and recovery docs --- docs/00-manual-steps-quick-reference.md | 739 +++++++++++++++++++ docs/11-longhorn-drive-setup-and-recovery.md | 480 ++++++++++++ 2 files changed, 1219 insertions(+) create mode 100644 docs/00-manual-steps-quick-reference.md create mode 100644 docs/11-longhorn-drive-setup-and-recovery.md diff --git a/docs/00-manual-steps-quick-reference.md b/docs/00-manual-steps-quick-reference.md new file mode 100644 index 0000000..8bf2c6e --- /dev/null +++ b/docs/00-manual-steps-quick-reference.md @@ -0,0 +1,739 @@ +# Manual Steps Quick Reference + +## Overview + +This document collates all manual installation steps from the detailed documentation into a single sequential reference. Use this for quick lookup when performing manual installations or troubleshooting. + +For complete context and explanations, refer to the detailed documentation: +- [01-rke2-deployment.md](./01-rke2-deployment.md) +- [02-rocm-support.md](./02-rocm-support.md) +- [03-storage-management.md](./03-storage-management.md) +- [04-network-configuration.md](./04-network-configuration.md) +- [05-certificate-management.md](./05-certificate-management.md) +- [08-installation-guide.md](./08-installation-guide.md) + +--- + +## Prerequisites Verification + +### System Requirements +```bash +# Verify Ubuntu version (must be 20.04, 22.04, or 24.04) +lsb_release -a + +# Check disk space (root: 20GB+, /var: 5GB+, /var/lib/rancher: 500GB+ recommended) +df -h / /var + +# Verify memory (4GB+ minimum, 8GB+ recommended) and CPU (2+ cores, 4+ recommended) +free -h +nproc + +# Check kernel modules +lsmod | grep overlay +lsmod | grep br_netfilter +lsmod | grep amdgpu # For GPU nodes only +``` + +--- + +## First Node Installation + +### 1. System Preparation + +**Update System and Install Dependencies** +```bash +sudo apt update +sudo apt install -y jq nfs-common open-iscsi chrony curl wget +``` + +**Configure Firewall Ports** +```bash +# RKE2 required ports +sudo ufw allow 6443/tcp # Kubernetes API +sudo ufw allow 9345/tcp # RKE2 supervisor +sudo ufw allow 10250/tcp # kubelet +sudo ufw allow 2379:2380/tcp # etcd +sudo ufw allow 30000:32767/tcp # NodePort services +sudo ufw allow 8472/udp # Cilium VXLAN +sudo ufw allow 4240/tcp # Cilium health checks +``` + +**Configure inotify Limits** +```bash +echo "fs.inotify.max_user_instances = 8192" | sudo tee -a /etc/sysctl.conf +echo "fs.inotify.max_user_watches = 524288" | sudo tee -a /etc/sysctl.conf +sudo sysctl -p +``` + +**Install Kubernetes Tools** +```bash +# Install kubectl +curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" +sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + +# Install k9s +wget https://github.com/derailed/k9s/releases/latest/download/k9s_Linux_amd64.tar.gz +tar -xzf k9s_Linux_amd64.tar.gz +sudo mv k9s /usr/local/bin/ +``` + +### 2. Storage Configuration + +**Configure Multipath** +```bash +sudo apt install -y multipath-tools + +cat <> ~/.bashrc +export PATH=$PATH:/var/lib/rancher/rke2/bin + +# Verify cluster is running +kubectl get nodes +``` + +**Get Join Information for Additional Nodes** +```bash +# Get join token +sudo cat /var/lib/rancher/rke2/server/node-token + +# Get server IP +hostname -I | awk '{print $1}' +``` + +### 5. 
Storage and Networking Setup + +**Deploy Longhorn Storage** +```bash +# Create Longhorn namespace +kubectl create namespace longhorn-system + +# Apply Longhorn manifests +kubectl apply -f https://raw.githubusercontent.com/longhorn/longhorn/v1.8.0/deploy/longhorn.yaml + +# Wait for Longhorn pods to be ready +kubectl wait --for=condition=ready pod -l app=longhorn-manager -n longhorn-system --timeout=600s + +# Create default storage class +cat <:9345 +token: +node-label: + - "node.longhorn.io/create-default-disk=config" + - "node.longhorn.io/instance-manager=true" +EOF + +# Add GPU labels for GPU nodes: +# node-label: +# - "gpu=true" +# - "amd.com/gpu=true" + +# Add Longhorn disk labels (for each mounted disk): +# node-label: +# - "silogen.ai/longhorndisks=disk0xxxdisk1xxx..." + +# Start agent service +sudo systemctl enable rke2-agent.service +sudo systemctl start rke2-agent.service +``` + +**3. Configure Chrony to Sync with First Node** +```bash +cat < iburst +EOF + +sudo systemctl restart chrony +``` + +### Additional Control Plane Node Setup + +**1. Perform System Preparation (same as First Node)** + +**2. Install RKE2 Server** +```bash +# Install RKE2 server +curl -sfL https://get.rke2.io | sudo sh - + +# Configure RKE2 server +sudo mkdir -p /etc/rancher/rke2 +cat <:9345 +token: +write-kubeconfig-mode: "0644" +tls-san: + - $(hostname -I | awk '{print $1}') +node-label: + - "node.longhorn.io/create-default-disk=config" + - "node.longhorn.io/instance-manager=true" +EOF + +# Start server service +sudo systemctl enable rke2-server.service +sudo systemctl start rke2-server.service +``` + +**3. Configure Chrony** +```bash +cat < iburst +EOF + +sudo systemctl restart chrony +``` + +--- + +## Post-Installation Verification + +**Verify All Pods Running** +```bash +kubectl get pods -A +``` + +**Check Node Status** +```bash +kubectl get nodes -o wide +``` + +**Verify Longhorn** +```bash +kubectl get pods -n longhorn-system +kubectl get storageclass +``` + +**Verify MetalLB** +```bash +kubectl get pods -n metallb-system +kubectl get ipaddresspool -n metallb-system +``` + +**Test PVC Creation (Longhorn Validation)** +```bash +cat < +``` + +--- + +## Troubleshooting Quick Reference + +**RKE2 Service Not Starting** +```bash +# Check logs +sudo journalctl -u rke2-server -n 100 --no-pager + +# Verify configuration +sudo cat /etc/rancher/rke2/config.yaml + +# Check disk space +df -h /var/lib/rancher +``` + +**Node Not Joining** +```bash +# Verify token and server IP +sudo cat /etc/rancher/rke2/config.yaml + +# Check firewall rules +sudo ufw status + +# Test connectivity to first node +nc -zv 9345 + +# Check agent logs +sudo journalctl -u rke2-agent -n 100 --no-pager +``` + +**Longhorn Pods Not Starting** +```bash +# Check disk mounts +df -h | grep /mnt/disk + +# Verify node labels +kubectl get nodes --show-labels | grep longhorn + +# Check Longhorn logs +kubectl logs -n longhorn-system -l app=longhorn-manager --tail=100 +``` + +**GPU Not Detected** +```bash +# Verify amdgpu module loaded +lsmod | grep amdgpu + +# Check ROCm installation +rocm-smi + +# Verify device plugin +kubectl get pods -n kube-system | grep amd-gpu +``` + +**Time Sync Issues** +```bash +# Check chrony status +chronyc tracking + +# Restart chrony +sudo systemctl restart chrony + +# Verify NTP sources +chronyc sources -v +``` + +--- + +## Environment Variables for ClusterBloom + +If using ClusterBloom automation instead of manual steps, configure via environment variables or `bloom.yaml`: + +```bash +# Node configuration +export 
FIRST_NODE=true                # false for additional nodes
export JOIN_TOKEN=""           # Required for additional nodes
export SERVER_IP=""            # Required for additional nodes

# Storage configuration
export NO_DISKS_FOR_CLUSTER=false                      # Set true to skip disk setup
export CLUSTER_DISKS="/dev/nvme0n1,/dev/nvme1n1"       # Pre-selected disks
export CLUSTER_PREMOUNTED_DISKS="/mnt/disk0,/mnt/disk1"  # Pre-mounted paths

# GPU configuration
export GPU_NODE=true           # Enable GPU support

# Certificate configuration
export USE_CERT_MANAGER=true   # Use cert-manager for automatic certificates
export DOMAIN="cluster.example.com"  # Cluster domain
export CERT_OPTION="generate"  # "existing" or "generate" if not using cert-manager
export TLS_CERT="/path/to/cert.pem"  # Path to certificate (if existing)
export TLS_KEY="/path/to/key.pem"    # Path to key (if existing)

# Network configuration
export METALLB_IP_RANGE="192.168.1.100-192.168.1.110"  # Optional custom IP range
```

---

## Related Documentation

- **[PRD.md](./PRD.md)** - Product overview and features
- **[01-rke2-deployment.md](./01-rke2-deployment.md)** - Detailed RKE2 deployment documentation
- **[02-rocm-support.md](./02-rocm-support.md)** - AMD GPU and ROCm configuration
- **[03-storage-management.md](./03-storage-management.md)** - Longhorn storage setup
- **[04-network-configuration.md](./04-network-configuration.md)** - Network and load balancing
- **[05-certificate-management.md](./05-certificate-management.md)** - TLS certificate management
- **[08-installation-guide.md](./08-installation-guide.md)** - Complete manual installation guide
- **[10-configuration-reference.md](./10-configuration-reference.md)** - Configuration variable reference

---

## Key Differences: Manual vs Automated

ClusterBloom automates all of the above steps and provides:

1. **Interactive UI** - TUI and Web UI for configuration and monitoring
2. **Validation** - Pre-flight checks before any system modifications
3. **Error Recovery** - Automatic retry and reconfiguration on failures
4. **State Management** - Tracks progress and resumes on interruption
5. **Configuration Management** - YAML-based configuration with validation
6. **Disk Auto-detection** - Intelligent disk selection and formatting
7. **Integration** - Seamless ClusterForge and 1Password Connect integration
8. **Monitoring** - Real-time progress tracking and detailed logging
9. **Multi-node Coordination** - Automatic generation of join commands
10. **Best Practices** - Built-in configurations following Kubernetes best practices
diff --git a/docs/11-longhorn-drive-setup-and-recovery.md b/docs/11-longhorn-drive-setup-and-recovery.md
new file mode 100644
index 0000000..b3d593d
--- /dev/null
+++ b/docs/11-longhorn-drive-setup-and-recovery.md
@@ -0,0 +1,480 @@
# Longhorn Drive Setup and Recovery Documentation

This documentation provides comprehensive instructions for setting up, recovering, and managing Longhorn drives on cluster-bloom nodes. It includes both manual step-by-step procedures and a sample (not officially supported) script that serves as an automation example.

## Table of Contents

1. [Overview](#overview)
2. [Prerequisites](#prerequisites)
3. [Disk Space Requirements](#disk-space-requirements)
4. [RAID Considerations](#raid-considerations)
5. [Reboot Checklist](#reboot-checklist)
6. [Manual Disk Setup Procedure](#manual-disk-setup-procedure)
7. [Automation Script](#automation-script)
8. [Longhorn UI Configuration](#longhorn-ui-configuration)
9. 
[Troubleshooting](#troubleshooting) +10. [Reference](#reference) + +## Overview + +Longhorn is a distributed block storage system for Kubernetes that requires proper disk configuration to ensure data persistence across node reboots. This documentation covers: + +- **Drive Priority**: NVMe drives (preferred) → SSD drives → HDD drives (sda, sdb, etc.) +- **RAID Restriction**: Longhorn explicitly does NOT support RAID configurations +- **Special Requirements**: `/var/lib/rancher` needs dedicated mountpoint only if root partition is space-constrained +- **Mount Pattern**: Disks mounted at `/mnt/diskX` where X starts from 0 and increments +- **Filesystem**: ext4 with UUID-based mounting for reliability + +## Prerequisites + +- Root access to the cluster node +- Understanding of Linux disk management +- Backup of important data (formatting operations are destructive) +- Basic knowledge of Longhorn concepts + +### Required Packages + +Ensure these utilities are available: +```bash +sudo apt update +sudo apt install -y util-linux e2fsprogs mdadm +``` + +## Disk Space Requirements + +Based on cluster-bloom requirements, ensure adequate disk space: + +#### Disk Space Requirements +- **Root partition**: Minimum 10GB required, 20GB recommended +- **Available space**: Minimum 10GB required +- **/var partition**: 5GB recommended for container images +- **/var/lib/rancher**: dedicated partition in the case that the root partition is constrained (and no separate /var or /var/lib mounts exist) + +#### Space Validation +```bash +# Check current disk usage +df -h / +df -h /var +df -h /var/lib/rancher 2>/dev/null || echo "/var/lib/rancher not separately mounted" + +# Check available space +df -h | awk '$6=="/" {print "Root partition: " $4 " available"}' +``` + +## RAID Considerations + +**⚠️ CRITICAL**: Longhorn documentation explicitly states that **RAID configurations are NOT supported**. Longhorn provides its own replication and high availability mechanisms. + +### Detecting RAID Configuration + +Check if your system has software RAID that needs to be removed: + +```bash +# Check for software RAID arrays +cat /proc/mdstat + +# List RAID arrays +sudo mdadm --detail --scan + +# Example of RAID configuration that must be removed: +``` + +**Example `lsblk` output showing problematic RAID setup:** +``` +NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS UUID +nvme0n1 259:0 0 3.5T 0 disk +└─nvme0n1p1 259:1 0 3.5T 0 part + └─md0 9:0 0 14T 0 raid0 / +nvme1n1 259:2 0 894G 0 disk +├─nvme1n1p1 259:3 0 512M 0 part /boot/efi +└─nvme1n1p2 259:4 0 893G 0 part [SWAP] +nvme2n1 259:5 0 3.5T 0 disk +└─nvme2n1p1 259:6 0 3.5T 0 part + └─md0 9:0 0 14T 0 raid0 / +nvme3n1 259:7 0 3.5T 0 disk +└─nvme3n1p1 259:8 0 3.5T 0 part + └─md0 9:0 0 14T 0 raid0 / +nvme4n1 259:9 0 3.5T 0 disk +└─nvme4n1p1 259:10 0 3.5T 0 part + └─md0 9:0 0 14T 0 raid0 / +nvme5n1 259:11 0 894G 0 disk +``` + +In the above example, **md0** shows a RAID0 array using multiple NVMe drives - this must be broken apart for Longhorn use. 
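Before breaking an array apart, it is worth confirming exactly which physical disks belong to it. A minimal check, assuming an array named `md0` as in the example output above:

```bash
# Show the member devices of the md0 array (name taken from the example above)
sudo mdadm --detail /dev/md0

# Cross-check with lsblk: every disk whose partition rolls up into md0
# will need its RAID superblock cleared once the array is stopped
lsblk -o NAME,TYPE,SIZE,MOUNTPOINTS | grep -B1 raid0
```

Each member disk reported here becomes an individual disk available for Longhorn once the array is removed.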
+ +### RAID Removal Process + +The automation script can safely backup, remove, and optionally restore RAID configurations: + +```bash +# Check if RAID is present +cat /proc/mdstat + +# Backup and remove RAID (interactive) +sudo bash longhorn-disk-setup.sh --remove-raid + +# Force RAID removal without confirmation +sudo bash longhorn-disk-setup.sh --force-raid-removal +``` + +#### RAID Backup and Restore + +The script automatically backs up RAID configurations before removal: + +**Backup Location**: `/root/longhorn-raid-backup/` +**Backup Contents**: +- `mdadm.conf.backup` - RAID configuration +- `mdstat.backup` - RAID status at backup time +- `md*_detail.backup` - Individual array details + +**Manual RAID Restoration** (if needed): +```bash +# List backups +ls -la /root/longhorn-raid-backup/ + +# View original configuration +cat /root/longhorn-raid-backup/mdadm.conf.backup + +# Restore RAID (DESTRUCTIVE - will recreate arrays) +sudo mdadm --assemble --scan --config=/root/longhorn-raid-backup/mdadm.conf.backup +``` + +**⚠️ Important**: RAID restoration will destroy any data written to individual disks after RAID removal. + +## Reboot Checklist + +After any node reboot, verify that all Longhorn storage disks are properly mounted: + +### Quick Validation Commands + +```bash +# 1. Check current fstab entries for Longhorn disks +sudo cat /etc/fstab | grep -E "/mnt/disk[0-9]+" + +# 2. Check currently mounted disks +df -h | grep -E "/mnt/disk[0-9]+" + +# 3. List all disks with UUIDs +lsblk -o +UUID + +# 4. Verify all fstab entries mount correctly +sudo mount -a && echo "All mounts successful" || echo "Mount errors detected" +``` + +### Expected fstab Format + +Your `/etc/fstab` should contain entries like: + +```bash +UUID=f9134cf2-0205-4012-8e8b-ac44757a0d15 /mnt/disk0 ext4 defaults,nofail 0 2 +UUID=9111f9b3-e4e5-4a50-a9cc-3258d40786f3 /mnt/disk1 ext4 defaults,nofail 0 2 +UUID=e27fc7cd-356a-40de-89ae-ea1f0af59d24 /mnt/disk2 ext4 defaults,nofail 0 2 +UUID=489f3576-cf3b-4319-ba9d-a07427225f81 /mnt/disk3 ext4 defaults,nofail 0 2 +UUID=3206db8b-109e-4b9f-8320-7db4cca5210d /mnt/disk4 ext4 defaults,nofail 0 2 +``` + +**Note**: The `nofail` option ensures the system boots even if a disk is unavailable. + +## Manual Disk Setup Procedure + +### Step 1: Identify Candidate Disks + +First, examine your system's storage layout: + +```bash +# List all block devices with UUIDs +lsblk -o +UUID + +# Example output with 5 NVMe drives: +NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS UUID +nvme0n1 259:0 0 3.5T 0 disk +├─nvme0n1p1 259:1 0 3.5T 0 part / a1b2c3d4-e5f6-7890-abcd-ef1234567890 +nvme1n1 259:2 0 894G 0 disk +├─nvme1n1p1 259:3 0 512M 0 part /boot/efi 1234-5678 +└─nvme1n1p2 259:4 0 893G 0 part [SWAP] +nvme2n1 259:5 0 3.5T 0 disk +nvme3n1 259:6 0 3.5T 0 disk +nvme4n1 259:7 0 3.5T 0 disk +nvme5n1 259:8 0 894G 0 disk +sdb 8:16 0 256G 0 disk +``` + +**Disk Priority for Longhorn Storage**: +1. **NVMe drives** (nvme0n1, nvme2n1, nvme3n1, nvme4n1, nvme5n1) - Highest priority +2. **SSD drives** (typically sdb, sdc, etc.) - Medium priority +3. 
**HDD drives** (sda usually excluded as boot drive) - Lowest priority + +### Step 2: Check Current Mount Status + +```bash +# Check what's currently mounted +mount | grep -E "/mnt/disk|/var/lib/rancher" + +# Compare with fstab entries +sudo cat /etc/fstab +``` + +### Step 3: Identify Unmounted Candidate Disks + +Look for disks that: +- Are not currently mounted +- Don't have a UUID (indicating they need formatting) +- Are suitable for Longhorn storage + +Example identification process: + +```bash +# Check if disk has UUID (formatted) +sudo blkid /dev/nvme2n1 +# If no output, disk needs formatting + +# Check if disk is mounted +mount | grep /dev/nvme2n1 +# If no output, disk is not mounted +``` + +### Step 4: Format Unmounted Disks + +**⚠️ WARNING**: This will destroy all data on the disk! + +For each unformatted disk: + +```bash +# Format with ext4 filesystem +sudo mkfs.ext4 /dev/nvme2n1 + +# Verify UUID was assigned +sudo blkid /dev/nvme2n1 +# Output: /dev/nvme2n1: UUID="e27fc7cd-356a-40de-89ae-ea1f0af59d24" TYPE="ext4" +``` + +### Step 5: Create Mount Points + +```bash +# Create mount directories +sudo mkdir -p /mnt/disk0 +sudo mkdir -p /mnt/disk1 +sudo mkdir -p /mnt/disk2 +sudo mkdir -p /mnt/disk3 +sudo mkdir -p /mnt/disk4 +# Continue for additional disks +``` + +### Step 6: Add Disks to fstab + +For each formatted disk, add an entry to `/etc/fstab`: + +```bash +# Get the UUID for the disk +UUID=$(sudo blkid -s UUID -o value /dev/nvme2n1) + +# Add entry to fstab (replace with actual UUID) +echo "UUID=$UUID /mnt/disk2 ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab +``` + +### Step 7: Mount All Disks + +```bash +# Mount all entries in fstab +sudo mount -a + +# Verify successful mounting +df -h | grep "/mnt/disk" +``` + +Expected output: +``` +/dev/nvme2n1 3.4T 89M 3.2T 1% /mnt/disk0 +/dev/nvme3n1 3.4T 89M 3.2T 1% /mnt/disk1 +/dev/nvme4n1 3.4T 89M 3.2T 1% /mnt/disk2 +/dev/nvme5n1 894G 77M 848G 1% /mnt/disk3 +/dev/sdb 251G 65M 238G 1% /mnt/disk4 +``` + +## Automation Script + +The automation script provides comprehensive disk management including RAID detection and removal: + +### Script Usage + +```bash +# Dry run to see recommendations without making changes +sudo bash longhorn-disk-setup.sh --dry-run + +# Full interactive setup (with RAID handling if needed) +sudo bash longhorn-disk-setup.sh + +# Force RAID backup and removal (if detected) +sudo bash longhorn-disk-setup.sh --remove-raid +``` + +### Script Capabilities + +The script will: +1. **Check disk space requirements** and recommend `/var/lib/rancher` setup if needed +2. **Detect and handle software RAID** configurations safely +3. **Discover candidate disks** (prioritized by type: NVMe → SSD → HDD) +4. **Identify unformatted disks** and prompt for formatting +5. **Create mount points** with proper permissions +6. **Add fstab entries** with UUID-based mounting +7. **Validate mounts** and test reboot safety +8. **Provide summary** and next steps + +### RAID Handling Features + +- **RAID Detection**: Automatically detects software RAID arrays +- **Configuration Backup**: Saves RAID configuration for potential restoration +- **Safe Removal**: Properly stops and removes RAID arrays +- **Restoration Capability**: Can restore original RAID if needed + +## Longhorn UI Configuration + +After disks are mounted and persistent, configure them in Longhorn: + +### Step 1: Access Longhorn UI + +```bash +# Access the Longhorn dashboard +https://longhorn.cluster-name +``` + +### Step 2: Add Disks to Nodes + +1. 
**Navigate to Nodes**: Click on the "Node" tab in the Longhorn UI +2. **Select Node**: Choose the node you want to configure +3. **Edit Disks**: In the "Operations" column (far right), click the dropdown menu and select "Edit node and disks" +4. **Add Disk**: Scroll to the bottom of the form and click "Add disk" +5. **Configure Disk**: + - **Name**: Descriptive name (e.g., "nvme-disk-0") + - **Disk Type**: "filesystem" + - **Path**: Mount path (e.g., "/mnt/disk0") + - **Storage Reserved**: Amount to reserve (bytes) - optional +6. **Enable Scheduling**: Click the "Enable" button under "Scheduling" +7. **Save**: Click "Save" to apply changes + +### Step 3: Verify Disk Addition + +- Check that the disk appears in the node's disk list +- Verify "Schedulable" status is "True" +- Monitor disk space and usage + +## Special Requirements + +### /var/lib/rancher Partition + +Based on cluster-bloom requirements and available space: + +- **Conditional Requirement**: `/var/lib/rancher` should have its own dedicated mount point **only if** root partition is space-constrained +- **Size Guidelines**: Refer to disk space requirements above +- **Configuration**: Can be specified via `CLUSTER_DISKS` or `CLUSTER_PREMOUNTED_DISKS` in bloom.yaml + +#### When to Create Separate /var/lib/rancher: +```bash +# Check if root partition needs dedicated /var/lib/rancher +ROOT_AVAILABLE=$(df --output=avail / | tail -1) +if [ "$ROOT_AVAILABLE" -lt 20971520 ]; then # Less than 20GB in KB + echo "Root partition space-constrained, recommend separate /var/lib/rancher" +else + echo "Root partition has sufficient space" +fi +``` + +Example setup if needed: +```bash +# If using a dedicated disk for /var/lib/rancher +UUID=12345678-90ab-cdef-1234-567890abcdef /var/lib/rancher ext4 defaults,nofail 0 2 +``` + +## Troubleshooting + +### Common Issues + +**1. Disk not mounting after reboot** +```bash +# Check fstab entry syntax +sudo cat /etc/fstab | grep UUID + +# Test mount manually +sudo mount UUID=your-uuid-here /mnt/disk0 + +# Check filesystem health +sudo fsck -f /dev/nvme2n1 +``` + +**2. UUID not found** +```bash +# Regenerate UUID if filesystem is corrupted +sudo tune2fs -U random /dev/nvme2n1 + +# Update fstab with new UUID +sudo blkid /dev/nvme2n1 +``` + +**3. Mount point permission issues** +```bash +# Fix ownership and permissions +sudo chown root:root /mnt/disk0 +sudo chmod 755 /mnt/disk0 +``` + +**4. Longhorn not detecting disks** +- Ensure disk path matches exactly in Longhorn UI +- Verify disk has sufficient space +- Check Longhorn logs for errors +```bash +kubectl logs -n longhorn-system deployment/longhorn-manager +``` + +### Validation Commands + +```bash +# Comprehensive disk check +echo "=== Disk Status ===" +lsblk -o +UUID + +echo -e "\n=== RAID Check ===" +if [[ -f /proc/mdstat ]]; then + cat /proc/mdstat + if grep -q "^md" /proc/mdstat; then + echo "⚠️ RAID arrays detected - Longhorn does not support RAID!" 
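        # If any arrays are listed above, back them up and remove them first
        # (see the "RAID Removal Process" section) before giving these disks to Longhorn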
+ else + echo "✓ No RAID arrays found" + fi +else + echo "✓ No RAID support" +fi + +echo -e "\n=== Mount Status ===" +df -h | grep "/mnt/disk" + +echo -e "\n=== fstab Entries ===" +grep "/mnt/disk" /etc/fstab + +echo -e "\n=== Mount Test ===" +sudo mount -a && echo "✓ All mounts successful" || echo "✗ Mount errors" + +echo -e "\n=== Disk Space Check ===" +df -h / | tail -1 | awk '{print "Root: " $4 " available (" $5 " used)"}' +``` + +## Reference + +### Longhorn Documentation +- [Multiple Disk Support](https://longhorn.io/docs/1.8.0/nodes-and-volumes/nodes/multidisk/) +- [Node Space Usage](https://longhorn.io/docs/1.8.0/nodes-and-volumes/nodes/node-space-usage/) + +### Cluster-Bloom Configuration +- Storage options: `NO_DISKS_FOR_CLUSTER`, `CLUSTER_DISKS`, `CLUSTER_PREMOUNTED_DISKS` +- Device path format: `/dev/nvme0n1,/dev/nvme1n1` (comma-separated) +- Premounted disk format: `/mnt/disk1,/mnt/disk2` (comma-separated) + +### Setup Script + - cluster-bloom/experimental/longhorn-disk-setup.sh + +### Best Practices +1. **Always backup data** before disk operations +2. **Use UUID-based mounting** for reliability +3. **Test mount operations** before rebooting +4. **Monitor disk space** regularly +5. **Keep fstab entries simple** and well-documented +6. **Use `nofail` option** to prevent boot issues \ No newline at end of file From 2b1cb73afb2c84c9ccaf80dc5c4a7a9fcc50e985 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 15 Jan 2026 22:20:59 +0200 Subject: [PATCH 2/3] feat: experimental helper script to accompany Longhorn docs --- docs/11-manual-steps-quick-reference.md | 739 ------------------------ experimental/longhorn-disk-setup.sh | 597 +++++++++++++++++++ 2 files changed, 597 insertions(+), 739 deletions(-) delete mode 100644 docs/11-manual-steps-quick-reference.md create mode 100755 experimental/longhorn-disk-setup.sh diff --git a/docs/11-manual-steps-quick-reference.md b/docs/11-manual-steps-quick-reference.md deleted file mode 100644 index 8bf2c6e..0000000 --- a/docs/11-manual-steps-quick-reference.md +++ /dev/null @@ -1,739 +0,0 @@ -# Manual Steps Quick Reference - -## Overview - -This document collates all manual installation steps from the detailed documentation into a single sequential reference. Use this for quick lookup when performing manual installations or troubleshooting. - -For complete context and explanations, refer to the detailed documentation: -- [01-rke2-deployment.md](./01-rke2-deployment.md) -- [02-rocm-support.md](./02-rocm-support.md) -- [03-storage-management.md](./03-storage-management.md) -- [04-network-configuration.md](./04-network-configuration.md) -- [05-certificate-management.md](./05-certificate-management.md) -- [08-installation-guide.md](./08-installation-guide.md) - ---- - -## Prerequisites Verification - -### System Requirements -```bash -# Verify Ubuntu version (must be 20.04, 22.04, or 24.04) -lsb_release -a - -# Check disk space (root: 20GB+, /var: 5GB+, /var/lib/rancher: 500GB+ recommended) -df -h / /var - -# Verify memory (4GB+ minimum, 8GB+ recommended) and CPU (2+ cores, 4+ recommended) -free -h -nproc - -# Check kernel modules -lsmod | grep overlay -lsmod | grep br_netfilter -lsmod | grep amdgpu # For GPU nodes only -``` - ---- - -## First Node Installation - -### 1. 
System Preparation - -**Update System and Install Dependencies** -```bash -sudo apt update -sudo apt install -y jq nfs-common open-iscsi chrony curl wget -``` - -**Configure Firewall Ports** -```bash -# RKE2 required ports -sudo ufw allow 6443/tcp # Kubernetes API -sudo ufw allow 9345/tcp # RKE2 supervisor -sudo ufw allow 10250/tcp # kubelet -sudo ufw allow 2379:2380/tcp # etcd -sudo ufw allow 30000:32767/tcp # NodePort services -sudo ufw allow 8472/udp # Cilium VXLAN -sudo ufw allow 4240/tcp # Cilium health checks -``` - -**Configure inotify Limits** -```bash -echo "fs.inotify.max_user_instances = 8192" | sudo tee -a /etc/sysctl.conf -echo "fs.inotify.max_user_watches = 524288" | sudo tee -a /etc/sysctl.conf -sudo sysctl -p -``` - -**Install Kubernetes Tools** -```bash -# Install kubectl -curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" -sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl - -# Install k9s -wget https://github.com/derailed/k9s/releases/latest/download/k9s_Linux_amd64.tar.gz -tar -xzf k9s_Linux_amd64.tar.gz -sudo mv k9s /usr/local/bin/ -``` - -### 2. Storage Configuration - -**Configure Multipath** -```bash -sudo apt install -y multipath-tools - -cat <> ~/.bashrc -export PATH=$PATH:/var/lib/rancher/rke2/bin - -# Verify cluster is running -kubectl get nodes -``` - -**Get Join Information for Additional Nodes** -```bash -# Get join token -sudo cat /var/lib/rancher/rke2/server/node-token - -# Get server IP -hostname -I | awk '{print $1}' -``` - -### 5. Storage and Networking Setup - -**Deploy Longhorn Storage** -```bash -# Create Longhorn namespace -kubectl create namespace longhorn-system - -# Apply Longhorn manifests -kubectl apply -f https://raw.githubusercontent.com/longhorn/longhorn/v1.8.0/deploy/longhorn.yaml - -# Wait for Longhorn pods to be ready -kubectl wait --for=condition=ready pod -l app=longhorn-manager -n longhorn-system --timeout=600s - -# Create default storage class -cat <:9345 -token: -node-label: - - "node.longhorn.io/create-default-disk=config" - - "node.longhorn.io/instance-manager=true" -EOF - -# Add GPU labels for GPU nodes: -# node-label: -# - "gpu=true" -# - "amd.com/gpu=true" - -# Add Longhorn disk labels (for each mounted disk): -# node-label: -# - "silogen.ai/longhorndisks=disk0xxxdisk1xxx..." - -# Start agent service -sudo systemctl enable rke2-agent.service -sudo systemctl start rke2-agent.service -``` - -**3. Configure Chrony to Sync with First Node** -```bash -cat < iburst -EOF - -sudo systemctl restart chrony -``` - -### Additional Control Plane Node Setup - -**1. Perform System Preparation (same as First Node)** - -**2. Install RKE2 Server** -```bash -# Install RKE2 server -curl -sfL https://get.rke2.io | sudo sh - - -# Configure RKE2 server -sudo mkdir -p /etc/rancher/rke2 -cat <:9345 -token: -write-kubeconfig-mode: "0644" -tls-san: - - $(hostname -I | awk '{print $1}') -node-label: - - "node.longhorn.io/create-default-disk=config" - - "node.longhorn.io/instance-manager=true" -EOF - -# Start server service -sudo systemctl enable rke2-server.service -sudo systemctl start rke2-server.service -``` - -**3. 
Configure Chrony** -```bash -cat < iburst -EOF - -sudo systemctl restart chrony -``` - ---- - -## Post-Installation Verification - -**Verify All Pods Running** -```bash -kubectl get pods -A -``` - -**Check Node Status** -```bash -kubectl get nodes -o wide -``` - -**Verify Longhorn** -```bash -kubectl get pods -n longhorn-system -kubectl get storageclass -``` - -**Verify MetalLB** -```bash -kubectl get pods -n metallb-system -kubectl get ipaddresspool -n metallb-system -``` - -**Test PVC Creation (Longhorn Validation)** -```bash -cat < -``` - ---- - -## Troubleshooting Quick Reference - -**RKE2 Service Not Starting** -```bash -# Check logs -sudo journalctl -u rke2-server -n 100 --no-pager - -# Verify configuration -sudo cat /etc/rancher/rke2/config.yaml - -# Check disk space -df -h /var/lib/rancher -``` - -**Node Not Joining** -```bash -# Verify token and server IP -sudo cat /etc/rancher/rke2/config.yaml - -# Check firewall rules -sudo ufw status - -# Test connectivity to first node -nc -zv 9345 - -# Check agent logs -sudo journalctl -u rke2-agent -n 100 --no-pager -``` - -**Longhorn Pods Not Starting** -```bash -# Check disk mounts -df -h | grep /mnt/disk - -# Verify node labels -kubectl get nodes --show-labels | grep longhorn - -# Check Longhorn logs -kubectl logs -n longhorn-system -l app=longhorn-manager --tail=100 -``` - -**GPU Not Detected** -```bash -# Verify amdgpu module loaded -lsmod | grep amdgpu - -# Check ROCm installation -rocm-smi - -# Verify device plugin -kubectl get pods -n kube-system | grep amd-gpu -``` - -**Time Sync Issues** -```bash -# Check chrony status -chronyc tracking - -# Restart chrony -sudo systemctl restart chrony - -# Verify NTP sources -chronyc sources -v -``` - ---- - -## Environment Variables for ClusterBloom - -If using ClusterBloom automation instead of manual steps, configure via environment variables or `bloom.yaml`: - -```bash -# Node configuration -export FIRST_NODE=true # false for additional nodes -export JOIN_TOKEN="" # Required for additional nodes -export SERVER_IP="" # Required for additional nodes - -# Storage configuration -export NO_DISKS_FOR_CLUSTER=false # Set true to skip disk setup -export CLUSTER_DISKS="/dev/nvme0n1,/dev/nvme1n1" # Pre-selected disks -export CLUSTER_PREMOUNTED_DISKS="/mnt/disk0,/mnt/disk1" # Pre-mounted paths - -# GPU configuration -export GPU_NODE=true # Enable GPU support - -# Certificate configuration -export USE_CERT_MANAGER=true # Use cert-manager for automatic certificates -export DOMAIN="cluster.example.com" # Cluster domain -export CERT_OPTION="generate" # "existing" or "generate" if not using cert-manager -export TLS_CERT="/path/to/cert.pem" # Path to certificate (if existing) -export TLS_KEY="/path/to/key.pem" # Path to key (if existing) - -# Network configuration -export METALLB_IP_RANGE="192.168.1.100-192.168.1.110" # Optional custom IP range -``` - ---- - -## Related Documentation - -- **[PRD.md](./PRD.md)** - Product overview and features -- **[01-rke2-deployment.md](./01-rke2-deployment.md)** - Detailed RKE2 deployment documentation -- **[02-rocm-support.md](./02-rocm-support.md)** - AMD GPU and ROCm configuration -- **[03-storage-management.md](./03-storage-management.md)** - Longhorn storage setup -- **[04-network-configuration.md](./04-network-configuration.md)** - Network and load balancing -- **[05-certificate-management.md](./05-certificate-management.md)** - TLS certificate management -- **[08-installation-guide.md](./08-installation-guide.md)** - Complete manual installation guide -- 
**[10-configuration-reference.md](./10-configuration-reference.md)** - Configuration variable reference - ---- - -## Key Differences: Manual vs Automated - -ClusterBloom automates all of the above steps and provides: - -1. **Interactive UI** - TUI and Web UI for configuration and monitoring -2. **Validation** - Pre-flight checks before any system modifications -3. **Error Recovery** - Automatic retry and reconfiguration on failures -4. **State Management** - Tracks progress and resumes on interruption -5. **Configuration Management** - YAML-based configuration with validation -6. **Disk Auto-detection** - Intelligent disk selection and formatting -7. **Integration** - Seamless ClusterForge and 1Password Connect integration -8. **Monitoring** - Real-time progress tracking and detailed logging -9. **Multi-node Coordination** - Automatic generation of join commands -10. **Best Practices** - Built-in configurations following Kubernetes best practices diff --git a/experimental/longhorn-disk-setup.sh b/experimental/longhorn-disk-setup.sh new file mode 100755 index 0000000..df7f7f5 --- /dev/null +++ b/experimental/longhorn-disk-setup.sh @@ -0,0 +1,597 @@ +#!/bin/bash + +# Longhorn Disk Setup Automation Script +# This script automates the disk formatting, mounting, and fstab configuration +# for Longhorn storage on cluster-bloom nodes +# +# Features: +# - RAID detection and removal (with backup/restore capability) +# - Disk space analysis and recommendations +# - Dry-run mode for planning +# - Safe disk formatting and mounting + +set -euo pipefail + +# Configuration +MOUNT_BASE="/mnt/disk" +FILESYSTEM_TYPE="ext4" +FSTAB_OPTIONS="defaults,nofail 0 2" +BACKUP_DIR="/root/longhorn-raid-backup" + +# Runtime flags +DRY_RUN=false +REMOVE_RAID=false +FORCE_RAID_REMOVAL=false + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Logging functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Parse command line arguments +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + --dry-run) + DRY_RUN=true + log_info "Running in dry-run mode - no changes will be made" + shift + ;; + --remove-raid) + REMOVE_RAID=true + shift + ;; + --force-raid-removal) + FORCE_RAID_REMOVAL=true + REMOVE_RAID=true + shift + ;; + -h|--help) + show_help + exit 0 + ;; + *) + log_error "Unknown option: $1" + show_help + exit 1 + ;; + esac + done +} + +# Show help +show_help() { + cat << EOF +Longhorn Disk Setup Automation Script + +Usage: $0 [OPTIONS] + +OPTIONS: + --dry-run Show what would be done without making changes + --remove-raid Remove detected RAID configurations + --force-raid-removal Force RAID removal without confirmation + -h, --help Show this help message + +Examples: + $0 --dry-run # See recommendations without changes + $0 # Interactive setup + $0 --remove-raid # Handle RAID removal if needed + +This script will: +1. Check disk space requirements +2. Detect and optionally remove software RAID +3. Set up Longhorn-compatible disk configuration +4. Create proper fstab entries for persistent mounting + +EOF +} + +# Check if running as root +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "This script must be run as root (use sudo)" + exit 1 + fi +} + +# Check disk space requirements +check_disk_space() { + log_info "Checking disk space requirements..." 
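    # Note: df --output=avail reports 1 KiB blocks, so the thresholds below
    # correspond to 10485760 KiB (10 GiB) and 20971520 KiB (20 GiB)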
+ + local root_available_kb root_available_gb var_available_kb var_available_gb + + # Get available space in KB + root_available_kb=$(df --output=avail / | tail -1) + root_available_gb=$((root_available_kb / 1024 / 1024)) + + log_info "Root partition available space: ${root_available_gb}GB" + + # Check minimum requirements + if [[ $root_available_kb -lt 10485760 ]]; then # Less than 10GB + log_error "Root partition has less than 10GB available (minimum requirement)" + return 1 + elif [[ $root_available_kb -lt 20971520 ]]; then # Less than 20GB + log_warning "Root partition has less than 20GB available (recommended)" + log_warning "Consider creating dedicated /var/lib/rancher partition" + echo "RECOMMEND_VAR_LIB_RANCHER=true" + else + log_success "Root partition has sufficient space (${root_available_gb}GB available)" + echo "RECOMMEND_VAR_LIB_RANCHER=false" + fi + + # Check /var if separately mounted + if mountpoint -q /var; then + var_available_kb=$(df --output=avail /var | tail -1) + var_available_gb=$((var_available_kb / 1024 / 1024)) + log_info "/var partition available space: ${var_available_gb}GB" + + if [[ $var_available_kb -lt 5242880 ]]; then # Less than 5GB + log_warning "/var partition has less than 5GB available (recommended for container images)" + fi + fi +} + +# Detect RAID arrays +detect_raid() { + log_info "Checking for software RAID configurations..." + + if [[ ! -f /proc/mdstat ]]; then + log_info "No software RAID support detected" + return 1 + fi + + local md_arrays + md_arrays=$(awk '/^md/ {print $1}' /proc/mdstat) + + if [[ -z "$md_arrays" ]]; then + log_info "No active RAID arrays found" + return 1 + fi + + log_warning "Found active RAID arrays:" + cat /proc/mdstat + echo "" + + log_warning "⚠️ Longhorn does NOT support RAID configurations!" + log_warning "RAID arrays must be removed before configuring Longhorn storage" + + return 0 +} + +# Backup RAID configuration +backup_raid_config() { + log_info "Backing up RAID configuration..." + + if [[ $DRY_RUN == true ]]; then + log_info "[DRY-RUN] Would backup RAID config to $BACKUP_DIR" + return 0 + fi + + # Create backup directory + mkdir -p "$BACKUP_DIR" + + # Backup mdadm config + if command -v mdadm >/dev/null 2>&1; then + mdadm --detail --scan > "$BACKUP_DIR/mdadm.conf.backup" + cp /proc/mdstat "$BACKUP_DIR/mdstat.backup" + + # Backup individual array details + for md_device in /dev/md*; do + if [[ -b "$md_device" ]]; then + local md_name + md_name=$(basename "$md_device") + mdadm --detail "$md_device" > "$BACKUP_DIR/${md_name}_detail.backup" 2>/dev/null || true + fi + done + + log_success "RAID configuration backed up to $BACKUP_DIR" + log_info "Backup includes: mdadm.conf, mdstat, and individual array details" + else + log_error "mdadm not found - cannot backup RAID configuration" + return 1 + fi +} + +# Remove RAID arrays +remove_raid_arrays() { + log_warning "Removing RAID arrays..." + + if [[ $DRY_RUN == true ]]; then + log_info "[DRY-RUN] Would remove the following RAID arrays:" + awk '/^md/ {print " /dev/" $1}' /proc/mdstat + return 0 + fi + + # Get list of RAID arrays + local md_arrays + md_arrays=$(awk '/^md/ {print "/dev/" $1}' /proc/mdstat) + + if [[ -z "$md_arrays" ]]; then + log_info "No RAID arrays to remove" + return 0 + fi + + # Confirm removal unless forced + if [[ $FORCE_RAID_REMOVAL != true ]]; then + echo "" + log_warning "This will DESTROY the following RAID arrays:" + printf '%s\n' $md_arrays + echo "" + read -p "Are you sure you want to proceed? 
(yes/no): " -r + + if [[ $REPLY != "yes" ]]; then + log_info "RAID removal cancelled by user" + return 1 + fi + fi + + # Stop and remove arrays + for md_array in $md_arrays; do + log_info "Stopping RAID array: $md_array" + + # Unmount if mounted + if mount | grep -q "$md_array"; then + log_info "Unmounting $md_array" + umount "$md_array" || log_warning "Failed to unmount $md_array" + fi + + # Stop the array + mdadm --stop "$md_array" || log_warning "Failed to stop $md_array" + + # Remove the array + mdadm --remove "$md_array" 2>/dev/null || true + done + + # Zero superblocks on member disks + log_info "Clearing RAID superblocks on member disks..." + for disk in /dev/sd* /dev/nvme*n1; do + if [[ -b "$disk" ]]; then + mdadm --zero-superblock "$disk" 2>/dev/null || true + fi + done + + log_success "RAID arrays removed successfully" + log_info "Individual disks are now available for Longhorn use" +} + +# Restore RAID configuration (if needed) +restore_raid_config() { + local backup_file="$BACKUP_DIR/mdadm.conf.backup" + + if [[ ! -f "$backup_file" ]]; then + log_error "No RAID backup found at $backup_file" + return 1 + fi + + log_warning "Restoring RAID configuration from backup..." + log_warning "This will recreate the original RAID arrays" + + read -p "Are you sure you want to restore RAID? (yes/no): " -r + if [[ $REPLY != "yes" ]]; then + log_info "RAID restoration cancelled" + return 1 + fi + + # Restore using backed up configuration + while IFS= read -r line; do + if [[ $line == ARRAY* ]]; then + log_info "Restoring: $line" + eval "mdadm --assemble $line" + fi + done < "$backup_file" + + log_success "RAID configuration restored" + log_info "Check /proc/mdstat to verify arrays" +} + +# Discover candidate disks (prioritized: NVMe > SSD > HDD) +discover_disks() { + local disks=() + + # Priority 1: NVMe drives (excluding those in RAID) + for disk in /dev/nvme*n1; do + if [[ -b "$disk" ]] && ! is_disk_in_raid "$disk"; then + disks+=("$disk") + fi + done + + # Priority 2: SATA/SCSI drives (excluding sda and those in RAID) + for disk in /dev/sd[b-z]; do + if [[ -b "$disk" ]] && ! is_disk_in_raid "$disk"; then + disks+=("$disk") + fi + done + + printf '%s\n' "${disks[@]}" | sort +} + +# Check if disk is part of a RAID array +is_disk_in_raid() { + local disk="$1" + + # Check if disk or its partitions are part of any md array + if [[ -f /proc/mdstat ]]; then + local disk_base + disk_base=$(basename "$disk") + grep -q "$disk_base" /proc/mdstat 2>/dev/null + else + return 1 + fi +} + +# Check if disk is formatted +is_disk_formatted() { + local disk="$1" + blkid -s UUID -o value "$disk" >/dev/null 2>&1 +} + +# Get disk UUID +get_disk_uuid() { + local disk="$1" + blkid -s UUID -o value "$disk" +} + +# Check if disk is mounted +is_disk_mounted() { + local disk="$1" + mount | grep -q "^$disk" +} + +# Get next available mount point +get_next_mount_point() { + local counter=0 + while [[ -d "${MOUNT_BASE}${counter}" ]]; do + if mount | grep -q "${MOUNT_BASE}${counter}"; then + ((counter++)) + else + break + fi + done + echo "${MOUNT_BASE}${counter}" +} + +# Format disk with ext4 +format_disk() { + local disk="$1" + + if [[ $DRY_RUN == true ]]; then + log_info "[DRY-RUN] Would format $disk with $FILESYSTEM_TYPE" + return 0 + fi + + log_warning "Formatting $disk with $FILESYSTEM_TYPE (THIS WILL DESTROY ALL DATA)" + read -p "Are you sure you want to format $disk? 
(yes/no): " -r + + if [[ $REPLY == "yes" ]]; then + mkfs.ext4 -F "$disk" + log_success "Formatted $disk successfully" + else + log_info "Skipping formatting of $disk" + return 1 + fi +} + +# Create mount point +create_mount_point() { + local mount_point="$1" + + if [[ $DRY_RUN == true ]]; then + if [[ ! -d "$mount_point" ]]; then + log_info "[DRY-RUN] Would create mount point: $mount_point" + else + log_info "[DRY-RUN] Mount point already exists: $mount_point" + fi + return 0 + fi + + if [[ ! -d "$mount_point" ]]; then + mkdir -p "$mount_point" + log_success "Created mount point: $mount_point" + else + log_info "Mount point already exists: $mount_point" + fi +} + +# Add entry to fstab +add_to_fstab() { + local uuid="$1" + local mount_point="$2" + + local fstab_entry="UUID=$uuid $mount_point $FILESYSTEM_TYPE $FSTAB_OPTIONS" + + if [[ $DRY_RUN == true ]]; then + log_info "[DRY-RUN] Would add to /etc/fstab: $fstab_entry" + return 0 + fi + + # Check if entry already exists + if grep -q "$uuid" /etc/fstab; then + log_warning "Entry for UUID $uuid already exists in /etc/fstab" + return 0 + fi + + # Add entry to fstab + echo "$fstab_entry" >> /etc/fstab + log_success "Added to /etc/fstab: $fstab_entry" +} + +# Validate mounts +validate_mounts() { + log_info "Validating mounts..." + + if [[ $DRY_RUN == true ]]; then + log_info "[DRY-RUN] Would validate all fstab entries can mount" + log_info "[DRY-RUN] Current Longhorn mounts:" + df -h | grep "${MOUNT_BASE}" || log_info "[DRY-RUN] No Longhorn disks currently mounted" + return 0 + fi + + # Test mount all + if mount -a; then + log_success "All fstab entries mounted successfully" + else + log_error "Failed to mount some entries from fstab" + return 1 + fi + + # Show mounted disks + echo "" + log_info "Currently mounted Longhorn storage disks:" + df -h | grep "${MOUNT_BASE}" || log_warning "No Longhorn disks currently mounted" +} + +# Display summary +display_summary() { + echo "" + echo "==========================================" + log_info "Longhorn Disk Setup Summary" + echo "==========================================" + + echo "" + log_info "Mounted disks:" + df -h | grep "${MOUNT_BASE}" || echo "None" + + echo "" + log_info "fstab entries for Longhorn disks:" + grep "${MOUNT_BASE}" /etc/fstab || echo "None" + + echo "" + log_info "Next steps:" + echo "1. Access Longhorn UI at: https://longhorn.cluster-name" + echo "2. Navigate to Node tab" + echo "3. For each node, select 'Edit node and disks'" + echo "4. Add each mounted disk path (e.g., /mnt/disk0, /mnt/disk1)" + echo "5. Enable scheduling for each disk" +} + +# Main function +main() { + # Parse command line arguments + parse_args "$@" + + log_info "Starting Longhorn disk setup automation..." + if [[ $DRY_RUN == true ]]; then + log_info "=== DRY RUN MODE - No changes will be made ===" + fi + + # Check prerequisites + check_root + + # Check disk space requirements + check_disk_space + echo "" + + # Check for RAID and handle if necessary + if detect_raid; then + echo "" + if [[ $REMOVE_RAID == true ]] || [[ $DRY_RUN == true ]]; then + backup_raid_config + remove_raid_arrays + else + log_warning "RAID detected but not removing. Use --remove-raid to handle this." + log_warning "Longhorn requires individual disks, not RAID arrays." + echo "" + log_info "Options:" + echo " 1. Run with --remove-raid to safely remove RAID" + echo " 2. Run with --dry-run to see what would be done" + echo " 3. 
Manually remove RAID configuration first"
            exit 1
        fi
        echo ""
    fi

    # Discover available disks
    log_info "Discovering candidate disks..."
    mapfile -t candidate_disks < <(discover_disks)

    if [[ ${#candidate_disks[@]} -eq 0 ]]; then
        log_warning "No candidate disks found"
        if [[ -f /proc/mdstat ]] && grep -q "^md" /proc/mdstat; then
            log_info "Note: Disks may be in RAID arrays. Use --remove-raid to make them available."
        fi
        exit 0
    fi

    log_info "Found ${#candidate_disks[@]} candidate disk(s):"
    printf '  %s\n' "${candidate_disks[@]}"

    echo ""

    # Process each disk
    for disk in "${candidate_disks[@]}"; do
        log_info "Processing disk: $disk"

        # Skip if already mounted
        if is_disk_mounted "$disk"; then
            log_info "Disk $disk is already mounted, skipping"
            continue
        fi

        # Check if formatted
        if ! is_disk_formatted "$disk"; then
            log_warning "Disk $disk appears to be unformatted"
            if ! format_disk "$disk"; then
                continue
            fi
        else
            log_info "Disk $disk is already formatted"
        fi

        # Get UUID (skip in dry run if not formatted)
        if [[ $DRY_RUN == true ]] && ! is_disk_formatted "$disk"; then
            log_info "[DRY-RUN] Would assign UUID after formatting"
            uuid=""
        else
            uuid=$(get_disk_uuid "$disk")
            log_info "Disk UUID: $uuid"
        fi

        # Get mount point
        mount_point=$(get_next_mount_point)
        log_info "Mount point: $mount_point"

        # Create mount point
        create_mount_point "$mount_point"

        # Add to fstab
        add_to_fstab "$uuid" "$mount_point"

        echo ""
    done

    # Validate mounts
    validate_mounts

    # Display summary
    display_summary

    if [[ $DRY_RUN == true ]]; then
        log_info "=== DRY RUN COMPLETE - No changes were made ==="
        log_info "Run without --dry-run to apply these changes"
    else
        log_success "Longhorn disk setup completed successfully!"
        log_info "Remember to register each mounted disk in the Longhorn UI (see summary above)."
    fi
}

# Run main function
main "$@"
\ No newline at end of file

From 42dd1d5221e2d00731aa816ed0b952f398807025 Mon Sep 17 00:00:00 2001
From: Daniel Vaskivaara
Date: Fri, 16 Jan 2026 09:37:44 +0200
Subject: [PATCH 3/3] docs: add non-robust disclaimer for longhorn-disk-setup.sh; refactor: restructure with entrypoint README.md to eliminate the doc file numbering prefix (was not scalable or convenient to manage)

---
 docs/PRD.md                                   | 57 +++++++------
 docs/README.md                                | 79 +++++++++++++++++++
 ...anagement.md => certificate-management.md} |  0
 ...ompatibility.md => cloud-compatibility.md} |  0
 ...ns.md => cluster-sizing-configurations.md} |  0
 ...eference.md => configuration-reference.md} |  0
 ...llation-guide.md => installation-guide.md} |  0
 ...d => longhorn-drive-setup-and-recovery.md} | 25 +++---
 ...nce.md => manual-steps-quick-reference.md} |  0
 ...figuration.md => network-configuration.md} |  0
 ...thentication.md => oidc-authentication.md} |  0
 ...-rke2-deployment.md => rke2-deployment.md} |  0
 docs/{02-rocm-support.md => rocm-support.md}  |  0
 ...ge-management.md => storage-management.md} |  0
 ...hitecture.md => technical-architecture.md} |  0
 docs/{06-terminal-ui.md => terminal-ui.md}    |  0
 16 files changed, 125 insertions(+), 36 deletions(-)
 create mode 100644 docs/README.md
 rename docs/{05-certificate-management.md => certificate-management.md} (100%)
 rename docs/{09-cloud-compatibility.md => cloud-compatibility.md} (100%)
 rename docs/{00-cluster-sizing-configurations.md => cluster-sizing-configurations.md} (100%)
 rename docs/{10-configuration-reference.md => configuration-reference.md} (100%)
 rename docs/{08-installation-guide.md => installation-guide.md} (100%)
 rename docs/{11-longhorn-drive-setup-and-recovery.md => longhorn-drive-setup-and-recovery.md} (93%)
 rename docs/{00-manual-steps-quick-reference.md => manual-steps-quick-reference.md} (100%)
 rename docs/{04-network-configuration.md => network-configuration.md} (100%)
 rename docs/{12-oidc-authentication.md => oidc-authentication.md} (100%)
 rename docs/{01-rke2-deployment.md => rke2-deployment.md} (100%)
 rename docs/{02-rocm-support.md => rocm-support.md} (100%)
 rename docs/{03-storage-management.md => storage-management.md} (100%)
 rename docs/{07-technical-architecture.md => technical-architecture.md} (100%)
 rename docs/{06-terminal-ui.md => terminal-ui.md} (100%)

diff --git a/docs/PRD.md b/docs/PRD.md
index ecd686c..64f4030 100644
--- a/docs/PRD.md
+++ b/docs/PRD.md
@@ -39,42 +39,47 @@ Designed for growth to 100+ nodes.

 ## Core Features

-### 1. Automated RKE2 Kubernetes Deployment
+### Automated RKE2 Kubernetes Deployment

 Automated deployment of production-ready RKE2 clusters with first node initialization, additional node joining, Cilium CNI integration, and compliance-ready audit logging.

-**[📄 Detailed Documentation](./01-rke2-deployment.md)**
+**[📄 Detailed Documentation](./rke2-deployment.md)**

-### 2. AMD GPU Support with ROCm
+### AMD GPU Support with ROCm

 Automated AMD GPU driver installation, device detection, permission configuration, and Kubernetes GPU resource integration for AI/ML workloads.

-**[📄 Detailed Documentation](./02-rocm-support.md)**
+**[📄 Detailed Documentation](./rocm-support.md)**

-### 3. Storage Management with Longhorn
+### Storage Management with Longhorn

 Distributed block storage with automatic disk detection, interactive selection, persistent mounting, and Longhorn CSI integration for reliable persistent volumes. 
-**[📄 Detailed Documentation](./03-storage-management.md)** +**[📄 Detailed Documentation](./storage-management.md)** -### 4. Network Configuration +### Longhorn Drive Setup and Recovery +Comprehensive drive recovery procedures including RAID detection and removal, disk space analysis, automated formatting and mounting, and troubleshooting for storage issues after node reboots. + +**[📄 Detailed Documentation](./longhorn-drive-setup-and-recovery.md)** + +### Network Configuration Comprehensive networking with MetalLB load balancing, firewall configuration, multipath storage networking, and time synchronization across cluster nodes. -**[📄 Detailed Documentation](./04-network-configuration.md)** +**[📄 Detailed Documentation](./network-configuration.md)** -### 5. Interactive Terminal UI +### Interactive Terminal UI Rich terminal interface with real-time progress tracking, live log streaming, interactive configuration wizards, and comprehensive error handling and recovery options. -**[📄 Detailed Documentation](./06-terminal-ui.md)** +**[📄 Detailed Documentation](./terminal-ui.md)** -### 6. Configuration Management +### Configuration Management Flexible configuration system supporting YAML files, environment variables, and CLI flags with comprehensive validation and an interactive wizard for guided setup. -**[📄 Configuration Reference](./10-configuration-reference.md)** +**[📄 Configuration Reference](./configuration-reference.md)** -### 7. Node Validation and Testing +### Node Validation and Testing Comprehensive pre-deployment validation ensures node readiness, connectivity, GPU availability, and proper firewall configuration before any system modifications. -**[📄 Installation Guide](./08-installation-guide.md)** +**[📄 Installation Guide](./installation-guide.md)** -### 8. TLS Certificate Management +### TLS Certificate Management Flexible certificate management with three deployment options: @@ -95,25 +100,25 @@ Flexible certificate management with three deployment options: All certificates are stored as Kubernetes secrets in the `kgateway-system` namespace and integrated with the cluster's ingress controller for HTTPS traffic. -**[📄 Certificate Management Details](./05-certificate-management.md)** +**[📄 Certificate Management Details](./certificate-management.md)** -### 9. Web UI and Monitoring Interface +### Web UI and Monitoring Interface Browser-based configuration wizard with real-time monitoring dashboard, error recovery interface, and responsive design for remote cluster management from any device. -**[📄 Technical Architecture](./07-technical-architecture.md)** +**[📄 Technical Architecture](./technical-architecture.md)** -### 10. Comprehensive Configuration Validation +### Comprehensive Configuration Validation Pre-flight validation system checks all configuration, resources, and system requirements before making any changes, providing clear error messages with actionable fixes. -**[📄 Configuration Reference](./10-configuration-reference.md)** +**[📄 Configuration Reference](./configuration-reference.md)** ## Technical Architecture ClusterBloom uses a modular architecture with command-based interfaces, sequential installation pipelines, and multiple interaction modes (CLI, TUI, Web UI). The system executes in three phases: pre-Kubernetes system preparation, Kubernetes cluster setup, and post-Kubernetes add-on deployment. 
-**[📄 Technical Architecture Documentation](./07-technical-architecture.md)** +**[📄 Technical Architecture Documentation](./technical-architecture.md)** -**[📄 Configuration Reference](./10-configuration-reference.md)** +**[📄 Configuration Reference](./configuration-reference.md)** ## User Experience @@ -270,16 +275,16 @@ Browser-based testing with chromedp and comprehensive mock system: ### For Developers and Operators -**[📄 Manual Installation Guide](./08-installation-guide.md)** +**[📄 Manual Installation Guide](./installation-guide.md)** Complete manual installation procedures for understanding automation or performing custom installations. -**[📄 Cloud Platform Compatibility](./09-cloud-compatibility.md)** +**[📄 Cloud Platform Compatibility](./cloud-compatibility.md)** Infrastructure dependencies, migration strategies, and configuration for multi-platform deployments (EKS, AKS, GKE). -**[📄 Configuration Reference](./10-configuration-reference.md)** +**[📄 Configuration Reference](./configuration-reference.md)** Comprehensive configuration variable reference with examples and validation rules. -**[📄 Technical Architecture](./07-technical-architecture.md)** +**[📄 Technical Architecture](./technical-architecture.md)** Detailed technical architecture, component organization, and implementation patterns. ## Conclusion diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..045cb6c --- /dev/null +++ b/docs/README.md @@ -0,0 +1,79 @@ +# Cluster-Bloom Documentation + +Welcome to the comprehensive documentation for Cluster-Bloom, an enterprise-ready AI/ML cluster deployment platform built on RKE2 and Kubernetes. + +## Documentation Overview + +This documentation provides complete guidance for deploying, configuring, and managing Cluster-Bloom environments. Each document covers specific aspects of the platform, from initial sizing to advanced configuration. 
+ +## Documentation Index + +### Getting Started +- [**Cluster Sizing and Configurations**](cluster-sizing-configurations.md) - Hardware requirements, sizing guidelines, and deployment planning +- [**Manual Steps Quick Reference**](manual-steps-quick-reference.md) - Essential commands and procedures for cluster management + +### Core Deployment +- [**RKE2 Deployment**](rke2-deployment.md) - Kubernetes cluster foundation setup and configuration +- [**ROCm Support**](rocm-support.md) - AMD GPU support and ROCm integration for AI workloads +- [**Storage Management**](storage-management.md) - Longhorn distributed storage configuration and management +- [**Longhorn Drive Setup and Recovery**](longhorn-drive-setup-and-recovery.md) - Detailed drive recovery, RAID handling, and storage troubleshooting + +### Infrastructure Configuration +- [**Network Configuration**](network-configuration.md) - Networking setup, load balancing, and connectivity +- [**Certificate Management**](certificate-management.md) - TLS/SSL certificate handling and automation +- [**Terminal UI**](terminal-ui.md) - Interactive command-line interface and user experience +- [**Technical Architecture**](technical-architecture.md) - System design, component interactions, and architectural decisions + +### Operations and Maintenance +- [**Installation Guide**](installation-guide.md) - Complete step-by-step installation procedures +- [**Cloud Compatibility**](cloud-compatibility.md) - Multi-cloud deployment strategies and platform-specific considerations +- [**Configuration Reference**](configuration-reference.md) - Comprehensive configuration options and parameters +- [**OIDC Authentication**](oidc-authentication.md) - Single sign-on integration and identity management + +## Quick Navigation + +### For New Users +1. Start with [Cluster Sizing and Configurations](cluster-sizing-configurations.md) to plan your deployment +2. Follow the [Installation Guide](installation-guide.md) for step-by-step setup +3. Reference [Manual Steps Quick Reference](manual-steps-quick-reference.md) for common operations + +### For System Administrators +- [Technical Architecture](technical-architecture.md) - Understand system design +- [Storage Management](storage-management.md) + [Longhorn Drive Setup and Recovery](longhorn-drive-setup-and-recovery.md) - Complete storage configuration +- [Configuration Reference](configuration-reference.md) - Detailed parameter documentation + +### For DevOps Engineers +- [RKE2 Deployment](rke2-deployment.md) - Kubernetes foundation +- [Network Configuration](network-configuration.md) - Infrastructure networking +- [Certificate Management](certificate-management.md) - Security configuration + +### Troubleshooting and Recovery +- [Longhorn Drive Setup and Recovery](longhorn-drive-setup-and-recovery.md) - Storage troubleshooting and RAID handling +- [Manual Steps Quick Reference](manual-steps-quick-reference.md) - Emergency procedures and common fixes + +## Documentation Standards + +- **Comprehensive Coverage**: Each document provides complete information for its topic area +- **Practical Examples**: Real-world configurations and command examples +- **Cross-References**: Links between related topics for easy navigation +- **Version Compatibility**: All procedures tested with current platform versions + +## Contributing + +This documentation is maintained as part of the Cluster-Bloom project. For updates, corrections, or additions: + +1. Follow the established documentation patterns +2. 
Include practical examples and command snippets
+3. Test all procedures before documenting them
+4. Maintain cross-references between related topics
+
+## Support
+
+For questions about the documentation or Cluster-Bloom platform:
+- Reference the [Configuration Reference](configuration-reference.md) for parameter details
+- Check [Technical Architecture](technical-architecture.md) for design questions
+- Use [Manual Steps Quick Reference](manual-steps-quick-reference.md) for operational procedures
+
+---
+
+*This is the way to build enterprise-grade AI infrastructure that eliminates impurities.*
\ No newline at end of file
diff --git a/docs/05-certificate-management.md b/docs/certificate-management.md
similarity index 100%
rename from docs/05-certificate-management.md
rename to docs/certificate-management.md
diff --git a/docs/09-cloud-compatibility.md b/docs/cloud-compatibility.md
similarity index 100%
rename from docs/09-cloud-compatibility.md
rename to docs/cloud-compatibility.md
diff --git a/docs/00-cluster-sizing-configurations.md b/docs/cluster-sizing-configurations.md
similarity index 100%
rename from docs/00-cluster-sizing-configurations.md
rename to docs/cluster-sizing-configurations.md
diff --git a/docs/10-configuration-reference.md b/docs/configuration-reference.md
similarity index 100%
rename from docs/10-configuration-reference.md
rename to docs/configuration-reference.md
diff --git a/docs/08-installation-guide.md b/docs/installation-guide.md
similarity index 100%
rename from docs/08-installation-guide.md
rename to docs/installation-guide.md
diff --git a/docs/11-longhorn-drive-setup-and-recovery.md b/docs/longhorn-drive-setup-and-recovery.md
similarity index 93%
rename from docs/11-longhorn-drive-setup-and-recovery.md
rename to docs/longhorn-drive-setup-and-recovery.md
index b3d593d..a5c50eb 100644
--- a/docs/11-longhorn-drive-setup-and-recovery.md
+++ b/docs/longhorn-drive-setup-and-recovery.md
@@ -19,10 +19,10 @@ This documentation provides comprehensive instructions for setting up, recovering
 Longhorn is a distributed block storage system for Kubernetes that requires proper disk configuration to ensure data persistence across node reboots. This documentation covers:
 
-- **Drive Priority**: NVMe drives (preferred) → SSD drives → HDD drives (sda, sdb, etc.)
+- **Drive Priority**: NVMe drives (preferred) → SSD drives → HDD drives
 - **RAID Restriction**: Longhorn explicitly does NOT support RAID configurations
 - **Special Requirements**: `/var/lib/rancher` needs dedicated mountpoint only if root partition is space-constrained
-- **Mount Pattern**: Disks mounted at `/mnt/diskX` where X starts from 0 and increments
+- **Mount Pattern**: Disks mounted at `/mnt/diskX` where X starts from 0 and increments by one for each additional disk
 - **Filesystem**: ext4 with UUID-based mounting for reliability
 
 ## Prerequisites
@@ -104,17 +104,19 @@ In the above example, **md0** shows a RAID0 array using multiple NVMe drives - t
 
 ### RAID Removal Process
 
-The automation script can safely backup, remove, and optionally restore RAID configurations:
+**⚠️ WARNING:** The automation [script](../experimental/longhorn-disk-setup.sh) is not robustly tested; treat it as a starting point for your particular use case.
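+
+Before running it, you can capture the current array layout by hand so the array can be recreated if removal goes wrong. A minimal sketch, assuming `mdadm` is installed and the array is `md0` as in the example above (the backup file path is illustrative):
+
+```bash
+# Record the array layout and its member devices before any destructive step
+sudo mdadm --detail /dev/md0
+
+# Save a re-creatable description of all arrays (backup path is illustrative)
+sudo mdadm --detail --scan | sudo tee /root/mdadm-layout.backup
+```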
+
+The automation script (`experimental/longhorn-disk-setup.sh`) can back up, remove, and optionally restore RAID configurations:
 
 ```bash
 # Check if RAID is present
 cat /proc/mdstat
 
 # Backup and remove RAID (interactive)
-sudo bash longhorn-disk-setup.sh --remove-raid
+sudo bash experimental/longhorn-disk-setup.sh --remove-raid
 
 # Force RAID removal without confirmation
-sudo bash longhorn-disk-setup.sh --force-raid-removal
+sudo bash experimental/longhorn-disk-setup.sh --force-raid-removal
 ```
 
 #### RAID Backup and Restore
@@ -293,19 +295,22 @@ Expected output:
 
 ## Automation Script
 
-The automation script provides comprehensive disk management including RAID detection and removal:
+The automation [script](../experimental/longhorn-disk-setup.sh), located at `experimental/longhorn-disk-setup.sh` in the repository, provides comprehensive disk management capabilities, including RAID handling, disk discovery, formatting, mounting, and fstab configuration.
 
 ### Script Usage
 
 ```bash
+# The automation script is available in the experimental folder
+# Script location: experimental/longhorn-disk-setup.sh
+
 # Dry run to see recommendations without making changes
-sudo bash longhorn-disk-setup.sh --dry-run
+sudo bash experimental/longhorn-disk-setup.sh --dry-run
 
 # Full interactive setup (with RAID handling if needed)
-sudo bash longhorn-disk-setup.sh
+sudo bash experimental/longhorn-disk-setup.sh
 
 # Force RAID backup and removal (if detected)
-sudo bash longhorn-disk-setup.sh --remove-raid
+sudo bash experimental/longhorn-disk-setup.sh --remove-raid
 ```
 
 ### Script Capabilities
@@ -477,4 +482,4 @@ df -h / | tail -1 | awk '{print "Root: " $4 " available (" $5 " used)"}'
 3. **Test mount operations** before rebooting
 4. **Monitor disk space** regularly
 5. **Keep fstab entries simple** and well-documented
-6. **Use `nofail` option** to prevent boot issues
\ No newline at end of file
+6. **Use `nofail` option** to prevent boot issues
diff --git a/docs/00-manual-steps-quick-reference.md b/docs/manual-steps-quick-reference.md
similarity index 100%
rename from docs/00-manual-steps-quick-reference.md
rename to docs/manual-steps-quick-reference.md
diff --git a/docs/04-network-configuration.md b/docs/network-configuration.md
similarity index 100%
rename from docs/04-network-configuration.md
rename to docs/network-configuration.md
diff --git a/docs/12-oidc-authentication.md b/docs/oidc-authentication.md
similarity index 100%
rename from docs/12-oidc-authentication.md
rename to docs/oidc-authentication.md
diff --git a/docs/01-rke2-deployment.md b/docs/rke2-deployment.md
similarity index 100%
rename from docs/01-rke2-deployment.md
rename to docs/rke2-deployment.md
diff --git a/docs/02-rocm-support.md b/docs/rocm-support.md
similarity index 100%
rename from docs/02-rocm-support.md
rename to docs/rocm-support.md
diff --git a/docs/03-storage-management.md b/docs/storage-management.md
similarity index 100%
rename from docs/03-storage-management.md
rename to docs/storage-management.md
diff --git a/docs/07-technical-architecture.md b/docs/technical-architecture.md
similarity index 100%
rename from docs/07-technical-architecture.md
rename to docs/technical-architecture.md
diff --git a/docs/06-terminal-ui.md b/docs/terminal-ui.md
similarity index 100%
rename from docs/06-terminal-ui.md
rename to docs/terminal-ui.md