From 4516d756782513179a4293e16e473f002d1d2abb Mon Sep 17 00:00:00 2001 From: Mika Ranta Date: Thu, 15 Jan 2026 17:13:00 +0200 Subject: [PATCH 1/3] feat(cluster): extend kubectl setup to all control plane nodes and fix file paths --- pkg/ansible/runtime/playbook.go | 8 ++++++-- pkg/ansible/runtime/playbooks/cluster-bloom.yaml | 12 +++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pkg/ansible/runtime/playbook.go b/pkg/ansible/runtime/playbook.go index becb366..14b08d7 100644 --- a/pkg/ansible/runtime/playbook.go +++ b/pkg/ansible/runtime/playbook.go @@ -80,8 +80,12 @@ func RunPlaybook(config map[string]any, playbookName string, dryRun bool, tags s extraArgs := configToAnsibleVars(config) - // Add BLOOM_DIR to Ansible variables - extraArgs = append(extraArgs, "-e", fmt.Sprintf(`{"BLOOM_DIR": "%s"}`, workDir)) + // Add BLOOM_DIR to Ansible variables (current working directory, not .bloom subdir) + cwd, err := os.Getwd() + if err != nil { + return 1, fmt.Errorf("get current directory: %w", err) + } + extraArgs = append(extraArgs, "-e", fmt.Sprintf(`{"BLOOM_DIR": "%s"}`, cwd)) exitCode := RunContainer(rootfs, playbookDir, playbookName, extraArgs, dryRun, tags) return exitCode, nil diff --git a/pkg/ansible/runtime/playbooks/cluster-bloom.yaml b/pkg/ansible/runtime/playbooks/cluster-bloom.yaml index aceaf8e..71318b7 100644 --- a/pkg/ansible/runtime/playbooks/cluster-bloom.yaml +++ b/pkg/ansible/runtime/playbooks/cluster-bloom.yaml @@ -903,9 +903,15 @@ state: started tags: [rke2, deploy_cluster] - - name: Setup KubeConfig (First Node) - when: FIRST_NODE + - name: Setup KubeConfig (Control Plane Nodes) + when: FIRST_NODE or CONTROL_PLANE block: + - name: Wait for RKE2 kubeconfig to be available + wait_for: + path: /etc/rancher/rke2/rke2.yaml + state: present + timeout: 300 + - name: Update RKE2 kubeconfig with actual IP replace: path: /etc/rancher/rke2/rke2.yaml @@ -1277,7 +1283,7 @@ copy: content: | echo -e 'FIRST_NODE: false\nJOIN_TOKEN: {{ JOIN_TOKEN_content.content | b64decode | trim }}\nSERVER_IP: {{ node_ip }}' > bloom.yaml && sudo ./bloom --config bloom.yaml - dest: "/tmp/additional_node_command.txt" + dest: "{{ BLOOM_DIR }}/additional_node_command.txt" mode: "0644" become: no From 7a76fe202b3554f0fd11fc3a2b5bdf7f838d05b1 Mon Sep 17 00:00:00 2001 From: Mika Ranta Date: Fri, 16 Jan 2026 10:01:57 +0200 Subject: [PATCH 2/3] fix: bug in fstab handling --- .../runtime/playbooks/cluster-bloom.yaml | 17 +- tmp/ANSIBLE_ARCHITECTURE.md | 394 ---------- tmp/BLOOM_V2_ARCHITECTURE.md | 393 ---------- tmp/BLOOM_V2_PLAN.md | 702 ------------------ tmp/BLOOM_V2_PRD.md | 576 -------------- tmp/BLOOM_YAML_SPEC_V1.md | 112 --- 6 files changed, 7 insertions(+), 2187 deletions(-) delete mode 100644 tmp/ANSIBLE_ARCHITECTURE.md delete mode 100644 tmp/BLOOM_V2_ARCHITECTURE.md delete mode 100644 tmp/BLOOM_V2_PLAN.md delete mode 100644 tmp/BLOOM_V2_PRD.md delete mode 100644 tmp/BLOOM_YAML_SPEC_V1.md diff --git a/pkg/ansible/runtime/playbooks/cluster-bloom.yaml b/pkg/ansible/runtime/playbooks/cluster-bloom.yaml index 71318b7..1d56ba7 100644 --- a/pkg/ansible/runtime/playbooks/cluster-bloom.yaml +++ b/pkg/ansible/runtime/playbooks/cluster-bloom.yaml @@ -446,16 +446,6 @@ loop: "{{ cluster_disks_list | default([]) }}" when: cluster_disks_list | length > 0 - - name: Mount cluster disks - mount: - path: "/mnt/disk{{ item.0 }}" - src: "{{ item.1 }}" - fstype: ext4 - opts: defaults,nofail - state: mounted - loop: "{{ range(cluster_disks_list | length) | list | zip(cluster_disks_list) | list }}" - when: cluster_disks_list | length > 0 - - name: Get UUIDs for cluster disks shell: blkid -s UUID -o value {{ item.1 }} loop: "{{ range(cluster_disks_list | length) | list | zip(cluster_disks_list) | list }}" @@ -470,6 +460,13 @@ state: present loop: "{{ disk_uuids.results }}" when: cluster_disks_list | length > 0 and not item.skipped | default(false) + + - name: Mount cluster disks using fstab entries (verifies fstab) + shell: mount /mnt/disk{{ item.item.0 }} + loop: "{{ disk_uuids.results }}" + when: cluster_disks_list | length > 0 and not item.skipped | default(false) + register: mount_results + failed_when: mount_results.rc != 0 tags: [storage, prep_node] - name: Prepare RKE2 diff --git a/tmp/ANSIBLE_ARCHITECTURE.md b/tmp/ANSIBLE_ARCHITECTURE.md deleted file mode 100644 index 62bddc9..0000000 --- a/tmp/ANSIBLE_ARCHITECTURE.md +++ /dev/null @@ -1,394 +0,0 @@ -# Bloom Ansible Command - Technical Architecture - -**Date:** 2025-12-10 -**Status:** Design Complete - Ready for Implementation -**Issue:** #609 - Bloom V2 - -## Overview - -The `bloom ansible` command executes Kubernetes cluster deployment using embedded Ansible playbooks. It reads a `bloom.yaml` configuration file, validates it, and runs the deployment in a containerized Ansible environment. - -## Design Decisions - -### 1. Command Structure -**Decision:** Subcommand in cmd/bloom/main.go -**Rationale:** Consistent with `bloom webui`, maintains single binary distribution - -```bash -bloom ansible bloom.yaml -``` - -### 2. Configuration Reading -**Decision:** Reuse existing internal/config package -**Rationale:** Single source of truth, DRY principle, already validated - -- Parse bloom.yaml → internal/config.Config (map[string]any) -- Validate using internal/config.Validate() -- Pass directly to Ansible as extra vars - -### 3. Variable Mapping -**Decision:** No conversion - use UPPERCASE in playbook -**Rationale:** Simplest solution, no mapping code needed - -- bloom.yaml: `FIRST_NODE: true` -- Playbook: `{{ FIRST_NODE }}` -- Ansible command: `-e FIRST_NODE=true` - -### 4. Playbook Embedding -**Decision:** Embed entire playbooks/ directory -**Rationale:** Future-proof, supports modular playbooks, minimal overhead - -```go -//go:embed playbooks/* -var embeddedPlaybooks embed.FS -``` - -### 5. Runtime Architecture -**Decision:** Extract into pkg/ansible/runtime package -**Rationale:** Clean separation, reusable, testable - -``` -pkg/ansible/ -├── runtime/ -│ ├── container.go # Image pull/cache with go-containerregistry -│ ├── executor.go # Linux namespace creation & execution -│ └── playbook.go # Playbook running logic -└── playbooks/ # Embedded via go:embed - └── cluster-bloom.yaml -``` - -### 6. Step Filtering -**Decision:** Defer to v2.1 -**Rationale:** Get core working first, add features incrementally - -DISABLED_STEPS and ENABLED_STEPS will be implemented later using Ansible `--skip-tags` and `--tags`. - -## Component Architecture - -### File Structure - -``` -cluster-bloom/ -├── cmd/bloom/ -│ └── main.go # Add ansibleCmd cobra.Command -│ -├── pkg/ansible/ -│ ├── runtime/ -│ │ ├── container.go # pullImage(), extractLayers(), cacheImage() -│ │ ├── executor.go # createNamespaces(), mountHost(), runAnsible() -│ │ └── playbook.go # RunPlaybook(config, playbook) -│ └── playbooks/ # go:embed directory -│ ├── cluster-bloom.yaml # Main deployment playbook (UPPERCASE vars) -│ └── hello.yml # Test playbook -│ -├── internal/config/ # Existing - reuse as-is -│ ├── schema.go # Schema with all arguments -│ ├── validator.go # Validation logic -│ ├── generator.go # YAML generation -│ └── types.go # Config map type -│ -└── go.mod # Add go-containerregistry dependency -``` - -### Data Flow - -``` -User runs: bloom ansible bloom.yaml - ↓ -cmd/bloom/main.go (ansibleCmd) - ↓ -internal/config.LoadConfig(bloom.yaml) → Config map - ↓ -internal/config.Validate(config) → []string errors - ↓ -pkg/ansible/runtime.RunPlaybook(config, "cluster-bloom.yaml") - ↓ - ├─ Pull/cache Ansible image (willhallonline/ansible:latest) - ├─ Extract playbook from embed.FS - ├─ Convert Config map → Ansible extra vars (-e KEY=value) - ├─ Create Linux namespaces (UTS, PID, Mount) - ├─ Mount host filesystem at /host - └─ Execute: ansible-playbook -e ... /playbooks/cluster-bloom.yaml -``` - -## Implementation Pattern (from bloomv2 experiment) - -### Container Runtime - -**Image Pull & Cache:** -```go -// Uses go-containerregistry/pkg/crane -img, err := crane.Pull("willhallonline/ansible:latest") -layers, err := img.Layers() -for layer := range layers { - extractLayer(layer, "/var/lib/bloom/rootfs") -} -``` - -**Caching:** -- Location: `/var/lib/bloom/rootfs` -- First run: ~500MB download -- Subsequent runs: Reuse cached rootfs -- Check: `stat /var/lib/bloom/rootfs/usr` - -### Namespace Creation - -**Linux Namespaces:** -```go -cmd := exec.Command("/proc/self/exe", "__child__", ...) -cmd.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: syscall.CLONE_NEWUTS | syscall.CLONE_NEWPID | syscall.CLONE_NEWNS, -} -``` - -**Host Mount:** -```go -// Inside namespace -syscall.Mount("/", "/mnt/host", "", syscall.MS_BIND|syscall.MS_REC, "") -``` - -### Ansible Execution - -**Command Structure:** -```bash -ansible-playbook \ - -i localhost, \ - -c local \ - -e FIRST_NODE=true \ - -e GPU_NODE=false \ - -e DOMAIN=example.com \ - /playbooks/cluster-bloom.yaml -``` - -**Variable Passing:** -```go -args := []string{"ansible-playbook", "-i", "localhost,", "-c", "local"} -for key, value := range config { - args = append(args, "-e", fmt.Sprintf("%s=%v", key, value)) -} -args = append(args, "/playbooks/cluster-bloom.yaml") -``` - -## Playbook Updates Required - -### Variable Name Changes - -Update `/workspace/platform/experiments/bloomv2/playbooks/cluster-bloom.yaml`: - -```yaml -# FROM (lowercase): -vars: - first_node: true - gpu_node: true - domain: "" - -# TO (UPPERCASE): -vars: - FIRST_NODE: true - GPU_NODE: true - DOMAIN: "" -``` - -All variable references in tasks must also change: -```yaml -# FROM: -when: first_node - -# TO: -when: FIRST_NODE -``` - -### Host Filesystem Access - -Playbook already uses `host_root: /host` pattern: -```yaml -vars: - host_root: /host - -tasks: - - name: Example task - copy: - src: /local/file - dest: "{{ host_root }}/etc/config" -``` - -This works because the runtime mounts host at `/host`. - -## User Experience - -### Installation Workflow - -```bash -# Step 1: Generate configuration -bloom webui -# Fill form, save bloom.yaml to /workspace/cluster - -# Step 2: Deploy cluster -cd /workspace/cluster -sudo bloom ansible bloom.yaml - -# Output: -# Checking for Ansible image... -# Downloading Ansible image (500MB)... [first run only] -# Image ready. -# Running playbook: cluster-bloom.yaml -# [Ansible output follows...] -# Cluster deployment complete! -``` - -### Subsequent Runs - -```bash -sudo bloom ansible bloom.yaml - -# Output: -# Using cached Ansible image. -# Running playbook: cluster-bloom.yaml -# [Ansible output follows...] -``` - -### Error Handling - -**Invalid Config:** -```bash -bloom ansible invalid.yaml - -# Output: -# Error validating configuration: -# - DOMAIN: must match pattern ^[a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?(\.[a-z0-9]... -# - CLUSTER_DISKS: must be valid block device paths -``` - -**Missing Config:** -```bash -bloom ansible missing.yaml - -# Output: -# Error: configuration file not found: missing.yaml -``` - -**Ansible Failure:** -```bash -sudo bloom ansible bloom.yaml - -# Output: -# [Ansible task output...] -# TASK [Install RKE2 server] ***** -# fatal: [localhost]: FAILED! => {"msg": "Unable to download..."} -# -# Deployment failed. Check logs at: /var/log/bloom/run-20251210-143022.log -``` - -## Dependencies - -### New Dependency -``` -github.com/google/go-containerregistry -``` - -### Existing Dependencies (reuse) -``` -github.com/spf13/cobra # CLI framework -gopkg.in/yaml.v3 # YAML parsing -``` - -## Testing Strategy - -### Unit Tests -- `pkg/ansible/runtime`: Mock container operations -- Config to Ansible vars conversion -- Playbook extraction from embed.FS - -### Integration Tests -- Pull real Ansible image -- Run hello.yml playbook -- Verify marker file created - -### Robot Framework Tests (Phase 4) -- Single node deployment -- Multi-node cluster -- GPU node configuration -- Idempotency (run twice) - -## Security Considerations - -### Root Requirement -The command requires root because: -- Linux namespace creation needs CAP_SYS_ADMIN -- Cluster deployment modifies system configuration -- Alternative: Use sudo in docs, check in code - -### Image Trust -- Uses official willhallonline/ansible:latest -- Cached at /var/lib/bloom/rootfs -- Future: Add image signature verification - -### Host Filesystem Access -- Container has full host access via /host mount -- Necessary for system configuration -- Same security model as V1 - -## Performance Considerations - -### First Run -- Image download: ~500MB, 2-5 minutes depending on bandwidth -- Layer extraction: ~1 minute -- Total first run overhead: 3-6 minutes - -### Subsequent Runs -- Image check: <1 second -- No download needed -- Deployment time: Same as V1 (~10-15 minutes) - -### Disk Usage -- Ansible image: ~500MB at /var/lib/bloom/rootfs -- Logs: Rotated at /var/log/bloom/ - -## Future Enhancements (Post-v2.0) - -### v2.1 - Step Filtering -```bash -bloom ansible bloom.yaml --skip-tags gpu,longhorn -bloom ansible bloom.yaml --tags rke2 -``` - -Map DISABLED_STEPS/ENABLED_STEPS from schema to Ansible flags. - -### v2.2 - Custom Playbooks -```bash -bloom ansible bloom.yaml --playbook custom.yml -bloom ansible bloom.yaml --playbook https://example.com/playbook.yml -``` - -Already supported by bloomv2 experiment pattern. - -### v2.3 - Dry Run -```bash -bloom ansible bloom.yaml --check -``` - -Pass `--check` to Ansible for dry-run mode. - -## Success Criteria - -- [ ] `bloom ansible bloom.yaml` runs without errors -- [ ] Validates config before execution -- [ ] Downloads and caches Ansible image -- [ ] Executes cluster-bloom.yaml playbook -- [ ] Logs saved to /var/log/bloom/ -- [ ] Deploys identical cluster to V1 -- [ ] Binary size < 50MB -- [ ] No external dependencies (Docker, Python, Ansible) - -## References - -- **Experiment Code:** `/workspace/platform/experiments/bloomv2/` -- **Existing Playbook:** `experiments/bloomv2/playbooks/cluster-bloom.yaml` -- **V1 Steps:** `pkg/steps.go` (26 steps → Ansible tasks) -- **Config Schema:** `internal/config/schema.go` - ---- - -**Status:** Design approved. Ready for implementation. -**Next:** Begin implementation with go.mod dependency addition. -**ETA:** 3-5 days for core implementation, 1-2 days for testing. diff --git a/tmp/BLOOM_V2_ARCHITECTURE.md b/tmp/BLOOM_V2_ARCHITECTURE.md deleted file mode 100644 index 284e1be..0000000 --- a/tmp/BLOOM_V2_ARCHITECTURE.md +++ /dev/null @@ -1,393 +0,0 @@ -# Bloom V2 Architecture - -**Issue:** #609 - Bloom V2 -**Branch:** bloom-v2 -**Date:** 2025-12-08 -**Status:** Design Complete - -## Design Decisions - -### 1. Execution Engine: Embedded Ansible - -**Decision:** Use embedded Ansible container runtime (like PoC) - -**Implementation:** -- Embed Ansible container image extraction logic -- Pull `willhallonline/ansible:latest` (~500MB, one-time) -- Cache at `/var/lib/bloom/rootfs` -- Run playbooks in Linux namespaces -- Mount host filesystem at `/host` - -**Rationale:** -- Leverage mature Ansible ecosystem -- Idempotency built-in -- Less code to write than pure Go -- Battle-tested modules - -### 2. Config Schema: YAML Schema as Single Source of Truth - -**Decision:** Use schema/bloom.yaml.schema.yaml as the single source of truth - -**Implementation:** -- YAML schema defines all field definitions, patterns, and examples -- Schema loaded at runtime by Go backend (schema_loader.go) -- Frontend tests extract examples directly from schema -- Parse same field names (FIRST_NODE, DOMAIN, etc.) -- Validation rules defined in schema types -- Defaults specified in schema -- Dependencies mapped from schema conditions -- Pass as Ansible extra vars - -**Schema Location:** `schema/bloom.yaml.schema.yaml` - -**Schema Structure:** -- Type definitions with patterns and examples (domain, ipv4, url, etc.) -- Field mappings with type, default, description, section -- Conditional visibility via `applicable` and `required` fields -- Constraint definitions (mutually_exclusive, one_of) - -**Validation Architecture:** -- **Frontend (HTML5)**: Real-time pattern validation using schema patterns via HTML5 `pattern` attribute -- **Frontend (JS)**: Pre-submit validation for required fields and enum values in `validateForm()` -- **Backend (Go)**: Authoritative validation at `/api/generate` and `/api/save` endpoints - - Pattern validation: Loads from schema types at runtime (validator.go) - - Constraint validation: Validates mutually_exclusive and one_of rules (constraints.go) - - Type preservation: Custom types (domain, ipv4) preserved for accurate validation - -**Rationale:** -- Single source of truth eliminates duplication -- Schema drives both validation and testing -- Easy to add new fields or patterns -- Tests automatically stay in sync with schema -- Frontend and backend use identical patterns -- All validation rules centralized in schema YAML - -**Reference:** See `schema/bloom.yaml.schema.yaml` - -### 3. Task Orchestration: Linear, Fail-Fast - -**Decision:** Sequential playbook execution, stop on first error - -**Implementation:** -- Run playbooks in order: - 1. Validation (ROCm check if GPU_NODE=true) - 2. System prep (packages, firewall) - 3. Disk setup - 4. RKE2 installation - 5. Longhorn deployment - 6. MetalLB setup - 7. ClusterForge (if enabled) -- Exit code != 0 stops execution -- No state tracking -- User re-runs from beginning (Ansible handles idempotency) - -**Rationale:** -- Simpler implementation -- Clear failure points -- Ansible makes re-runs safe - -### 4. Code Organization: Single Binary, Subcommands - -**Decision:** One binary with multiple subcommands - -**Binary:** `bloom` - -**Subcommands:** -```bash -bloom deploy [config.yaml] # Deploy cluster (default) -bloom webui [--port 8080] # Start web UI server -bloom config # CLI wizard -bloom validate config.yaml # Validate config -bloom version # Show version -``` - -**Project Structure:** -``` -cluster-bloom/ (bloom-v2 branch) -├── cmd/ -│ ├── main.go # Entry point, subcommand routing -│ └── web/ -│ └── static/ -│ ├── index.html # Main page -│ ├── js/ -│ │ ├── app.js # Application logic -│ │ ├── form.js # Form generation from schema -│ │ ├── constraints.js # Constraint validation -│ │ └── validator.js # Frontend validation -│ └── css/ -│ └── styles.css # Styling -├── pkg/ -│ ├── ansible/ -│ │ ├── runtime.go # Container runtime (namespaces) -│ │ ├── image.go # Image pull & cache -│ │ └── executor.go # Playbook execution -│ ├── config/ -│ │ ├── config.go # Config type definition -│ │ ├── validator.go # Schema-driven validation -│ │ ├── validate_test.go # Validation tests -│ │ ├── validate_integration_test.go # Schema-driven integration tests -│ │ ├── schema.go # Argument struct definition -│ │ ├── schema_loader.go # Load schema from YAML at runtime -│ │ ├── schema_loader_test.go # Schema loader tests -│ │ ├── constraints.go # Constraint validation logic -│ │ ├── constraints_test.go # Constraint tests -│ │ └── generator.go # Generate bloom.yaml -│ └── webui/ -│ ├── server.go # HTTP server -│ ├── handlers.go # API endpoints (/api/schema, /api/generate, /api/save) -│ └── embed.go # Embedded web assets -├── playbooks/ -│ ├── validate.yml # Pre-flight checks -│ ├── system.yml # System preparation -│ ├── disks.yml # Disk configuration -│ ├── rke2.yml # RKE2 installation -│ ├── longhorn.yml # Longhorn deployment -│ ├── metallb.yml # MetalLB setup -│ └── clusterforge.yml # ClusterForge integration -├── schema/ -│ └── bloom.yaml.schema.yaml # Schema definition (single source of truth) -├── tests/ -│ └── robot/ -│ ├── api.robot # API endpoint tests -│ ├── ui.robot # UI loading tests -│ ├── config_generation.robot # Config generation tests -│ ├── constraint_validation_dynamic.robot # Constraint tests -│ ├── schema_validation.robot # Schema-driven validation tests -│ ├── yaml_loader.py # Schema example extraction -│ └── run_tests_docker.sh # Test runner -├── tmp/ # Planning docs (gitignored) -└── Makefile # Build automation -``` - -**Rationale:** -- Single binary simplifies distribution -- Subcommands provide clear UX -- Internal packages prevent API exposure - -### 5. Web UI Scope: Generator Only - -**Decision:** Web UI generates bloom.yaml, does NOT deploy - -**Features:** -- Form-based bloom.yaml editor -- Field validation -- Conditional field display (dependencies) -- YAML preview -- Download button -- No deployment capability -- No live progress - -**User Flow:** -1. Run `bloom webui` → Opens browser to localhost:8080 -2. Fill out form (domain, disks, certificates, etc.) -3. Validate config -4. Preview YAML -5. Click "Download bloom.yaml" -6. Exit web UI -7. Run `bloom deploy bloom.yaml` separately - -**API Endpoints:** -``` -GET / # Serve web UI -POST /api/generate # Generate YAML from JSON (includes validation) -POST /api/save # Save YAML to file (includes validation) -GET /api/schema # Get field definitions, constraints & dependencies -``` - -**Rationale:** -- Simpler implementation (no websockets, no streaming) -- Clear separation: config generation vs deployment -- Web UI is just a nice config editor -- Deployment stays in CLI (where logging/errors work well) - -## Architecture Diagram - -``` -┌─────────────────────────────────────────────────┐ -│ bloom (binary) │ -├─────────────────────────────────────────────────┤ -│ │ -│ Subcommands: │ -│ ┌──────────────┐ ┌──────────────┐ │ -│ │ bloom deploy │ │ bloom webui │ │ -│ └──────┬───────┘ └──────┬───────┘ │ -│ │ │ │ -│ v v │ -│ ┌──────────────┐ ┌──────────────┐ │ -│ │ Deploy │ │ Web Server │ │ -│ │ Orchestrator │ │ (Generator) │ │ -│ └──────┬───────┘ └──────────────┘ │ -│ │ │ -│ v │ -│ ┌──────────────┐ │ -│ │ Ansible │ │ -│ │ Runtime │ │ -│ └──────┬───────┘ │ -│ │ │ -│ v │ -│ ┌──────────────┐ │ -│ │ Embedded │ │ -│ │ Playbooks │ │ -│ └──────────────┘ │ -│ │ -└─────────────────────────────────────────────────┘ - │ - v -┌─────────────────────────────────────────────────┐ -│ Host System (Ubuntu) │ -│ - /var/lib/bloom/rootfs (Ansible cache) │ -│ - /var/log/bloom/*.log │ -│ - RKE2, Longhorn, etc. installed │ -└─────────────────────────────────────────────────┘ -``` - -## Data Flow - -### Deploy Command -``` -bloom.yaml - ↓ -[Config Parser] → Validate - ↓ -[Deploy Orchestrator] - ↓ -[Ansible Runtime] → Pull/Cache Image (if needed) - ↓ -[Run Playbooks] → validate.yml - ↓ system.yml - ↓ disks.yml - ↓ rke2.yml - ↓ longhorn.yml - ↓ metallb.yml - ↓ clusterforge.yml - ↓ -Success/Failure Exit Code -``` - -### Web UI Flow -``` -User Browser - ↓ -[Web UI Form] → Fill fields - ↓ -[HTML5 Pattern Validation] → Real-time feedback - ↓ -[Client-side Validation] → Required fields + enum checks - ↓ -[POST /api/generate] → Full validation + generate YAML - ↓ -[Preview YAML] - ↓ -[POST /api/save] → Save to server filesystem - ↓ -Download bloom.yaml - ↓ -User runs: bloom deploy bloom.yaml -``` - -## Technology Stack - -### Backend -- **Language:** Go 1.21+ -- **Config:** `gopkg.in/yaml.v3` for YAML parsing -- **Validation:** Custom validators + `github.com/go-playground/validator` -- **HTTP:** Standard `net/http` -- **CLI:** `github.com/spf13/cobra` for subcommands -- **Ansible Image:** `go-containerregistry` for pulling -- **Namespaces:** `golang.org/x/sys/unix` for Linux syscalls - -### Frontend (Web UI) -- **Framework:** Vanilla JavaScript (or Alpine.js if needed) -- **Build:** None required (simple HTML/CSS/JS) -- **Embedding:** `go:embed` for static files - -### Ansible -- **Image:** `willhallonline/ansible:latest` -- **Playbook Format:** Standard Ansible YAML -- **Variables:** Passed via `-e key=value` - -## Build Process - -```bash -# Build web UI (if using build step) -cd web && npm run build - -# Build Go binary -CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \ - go build -ldflags="-s -w" \ - -o dist/bloom \ - ./cmd/bloom - -# Result: Single static binary at dist/bloom -``` - -## Deployment Example - -```bash -# Generate config using Web UI -bloom webui -# Browser opens, fill form, download bloom.yaml - -# OR generate using CLI wizard -bloom config > bloom.yaml - -# Validate (optional) -bloom validate bloom.yaml - -# Deploy -sudo bloom deploy bloom.yaml - -# Logs written to: -# /var/log/bloom/deploy-20251208-120000.log -``` - -## Success Criteria - -1. **Functionality:** - - Web UI generates valid bloom.yaml ✅ - - CLI wizard generates valid bloom.yaml ⬜ (deferred) - - bloom deploy successfully deploys cluster ⬜ (in progress) - - Same success rate as V1 ⬜ (pending deployment) - -2. **Compatibility:** - - V1 bloom.yaml files work in V2 ✅ - - All V1 config options supported ✅ - -3. **User Experience:** - - Single binary distribution ✅ - - Clear subcommands ✅ - - Web UI is intuitive ✅ - - Good error messages ✅ - -4. **Code Quality:** - - Clean architecture ✅ - - Testable components ✅ - - Robot Framework tests ✅ (18/18 passing) - - Schema-driven validation ✅ - - Comprehensive test coverage (all patterns) ✅ - - Go unit tests ✅ (100% passing) - -## Out of Scope (V2.0) - -- Resume capability -- Parallel playbook execution -- Web-based deployment monitoring -- State tracking -- Rollback functionality -- Multi-cluster orchestration -- Config versioning/migration - -## Future Considerations (V2.x) - -- Resume from failure -- Progress bars in CLI -- Web UI with deployment capability -- Diff between configs -- Dry-run mode -- Plugin system - ---- - -**Status:** Architecture finalized, ready for implementation -**Next:** Begin Phase 1 implementation -**Date:** 2025-12-08 diff --git a/tmp/BLOOM_V2_PLAN.md b/tmp/BLOOM_V2_PLAN.md deleted file mode 100644 index 0767549..0000000 --- a/tmp/BLOOM_V2_PLAN.md +++ /dev/null @@ -1,702 +0,0 @@ -# Bloom V2 Implementation Plan - -**Issue:** #609 - Bloom V2 -**Priority:** P0 (Critical) -**Repository:** cluster-bloom -**Branch:** bloom-v2 -**Date:** 2025-12-08 - -## Context - -Issue #609 requires implementing Bloom V2 with focus on: -- **HIGH PRIORITY:** Web UI for generating bloom.yaml -- **HIGH PRIORITY:** CLI tool for generating bloom.yaml -- **HIGH PRIORITY:** Blooming only with bloom.yaml (simplified workflow) -- **LOW PRIORITY:** Monitoring deployment -- **TESTING:** Robot Framework tests - -## Reference Code (For Ideas Only) - -### PoC in platform/experiments/bloomv2/ (bloomV2 branch) -Ideas to consider: -- Self-contained Go binary approach -- Linux namespace container runtime concept -- Image caching strategy -- Playbook execution patterns - -### Existing Bloom v1 in cluster-bloom -Ideas to consider: -- Deployment step flow -- Configuration validation patterns -- Disk management logic -- Test structure - -**IMPORTANT:** Both are reference only - NOT to be ported. This is a clean reimplementation. - -## Bloom V2 Goals - -### Architecture Shift - -**From:** Imperative Go code with embedded shell scripts -**To:** Declarative Ansible playbooks with config generation tools - -**Benefits:** -- More maintainable (Ansible best practices) -- Easier to extend (add playbooks vs modify Go code) -- Better idempotency (Ansible modules) -- Separation of concerns (config generation vs execution) - -### User Experience Flow - -#### Current (v1): -``` -User manually writes bloom.yaml → Run bloom binary → Deploy -``` - -#### Target (v2): -``` -Option A: User uses Web UI → Generate bloom.yaml → Run bloom → Deploy -Option B: User uses CLI wizard → Generate bloom.yaml → Run bloom → Deploy -Option C: User manually writes bloom.yaml → Run bloom → Deploy -``` - -## Implementation Phases - -### Phase 1: Core Architecture (Week 1-2) - -**Goal:** Design and implement clean V2 architecture from scratch - -**Design Decisions:** -1. **Execution model:** How to run deployment tasks? - - Option A: Embedded Ansible (like PoC) - - Option B: Pure Go implementation - - Option C: Hybrid (Go + external tools) - -2. **Configuration:** How to handle bloom.yaml? - - Schema definition - - Validation rules - - Variable substitution - -3. **Deployment flow:** Step orchestration - - Sequential vs parallel execution - - Error handling and rollback - - Progress reporting - -**Tasks:** -1. Define V2 architecture - - Clean module structure - - Clear separation of concerns - - Extensibility for future features - -2. Implement config parser - - YAML schema for bloom.yaml - - Validation library - - Type-safe config structures - -3. Create minimal deployment engine - - Basic task execution framework - - Logging infrastructure - - Error handling patterns - -4. Implement one end-to-end deployment - - Pick simplest use case (e.g., RKE2 install) - - Prove architecture works - - Establish patterns for other components - -**Deliverable:** Working minimal bloom v2 that can deploy one component with proper config handling - -### Phase 2: CLI Generator (Week 2-3) - -**Goal:** CLI tool for interactive bloom.yaml generation - -**Tasks:** -1. Create `cmd/bloom-config/` subcommand - - Interactive prompts for common configs - - Validation of user inputs - - Smart defaults based on system detection - -2. Question flow: - - Node role (first node vs additional node) - - Domain name - - Certificate options (Let's Encrypt, self-signed, existing) - - GPU configuration (detect AMD GPUs, ROCm version) - - Disk selection (list available disks, let user choose) - - Network settings (if needed) - -3. Output generation: - - Write bloom.yaml to current directory or specified path - - Show preview before writing - - Validate generated config - -**Example Usage:** -```bash -bloom config init # Interactive wizard -bloom config init --quick # Quick mode with defaults -bloom config validate bloom.yaml # Validate existing config -bloom config show-defaults # Show all default values -``` - -**Deliverable:** CLI wizard that generates valid bloom.yaml files - -### Phase 3: Web UI (Week 3-5) - HIGH PRIORITY - -**Goal:** Web-based bloom.yaml generator - -**Architecture:** -- Embedded web server in bloom binary -- Single-page application (SPA) for config generation -- Static files embedded in Go binary - -**Tech Stack:** -- Backend: Go net/http (embedded in bloom) -- Frontend: Vanilla JS or lightweight framework (Preact/Alpine.js) -- No external dependencies at runtime - -**Features:** -1. Form-based configuration - - Step-by-step wizard interface - - Auto-detection of system capabilities - - Real-time validation - - Preview generated YAML - -2. Templates - - Pre-configured scenarios (single node, multi-node, GPU cluster) - - Load/save/export configurations - - Import existing bloom.yaml for editing - -3. Documentation - - Inline help for each field - - Link to full docs - - Example values - -**UI Flow:** -``` -1. Node Type → 2. Network → 3. Storage → 4. GPU → 5. Review → 6. Download/Deploy -``` - -**Example Usage:** -```bash -bloom webui # Start web UI on http://localhost:8080 -bloom webui --port 9090 # Custom port -bloom webui --no-browser # Don't auto-open browser -``` - -**Deliverable:** Web UI accessible at localhost that generates bloom.yaml files - -### Phase 4: Testing (Week 4-5) ✅ COMPLETED - -**Goal:** Robot Framework test suite - -**Test Coverage:** -1. ✅ Schema validation tests - - Schema-driven validation (all pattern types) - - Tests ALL valid/invalid examples from schema - - Automatic field visibility handling - -2. ✅ Web UI tests - - UI loads correctly - - Form generation from schema - - Pattern validation through UI - - Required fields validation - -3. ✅ API tests - - Schema endpoint returns valid JSON - - Config validation endpoint - -4. ✅ Config generation tests - - Generate valid first node config - - Generate valid additional node config - - Generate config with TLS certificates - - Generate config with advanced options - - API generate endpoint - - Invalid config rejection - - Default values in generated config - - Field visibility affects generated config - -**Implemented Structure:** -``` -tests/robot/ -├── api.robot # API endpoint tests -├── ui.robot # UI loading tests -├── validation.robot # Form validation tests -├── schema_validation.robot # Schema-driven validation (comprehensive) -├── config_generation.robot # Config generation tests (NEW) -├── yaml_loader.py # Helper to extract examples from schema -└── run_tests_docker.sh # Docker-based test runner -``` - -**Key Achievement:** Schema-driven testing approach -- Tests automatically stay in sync with schema changes -- All 9 pattern types validated with complete example coverage -- Eliminated test duplication (removed 245 lines of redundant tests) - -**Deliverable:** ✅ Automated Robot Framework test suite (COMPLETE) - -### Phase 4.5: Schema Refactoring ✅ COMPLETED - -**Goal:** Consolidate schema definition to single source of truth - -**Implementation:** -1. ✅ Created schema_loader.go - - Loads schema from schema/bloom.yaml.schema.yaml at runtime - - Converts YAML schema to Argument structs for API - - Maps types, dependencies, patterns, and validation messages - -2. ✅ Removed hardcoded schema - - Eliminated 270 lines of hardcoded Go schema definitions - - Schema.go now only contains Argument struct definition - - All field definitions driven by YAML schema - -3. ✅ Enhanced YAML schema - - Added comprehensive examples for all 9 pattern types - - Added error messages for better UX - - Added section groupings for UI organization - - Removed empty strings from valid examples (browser validation issue) - -4. ✅ Updated frontend - - Fixed bug where fields with "URL" in name got wrong input type - - Frontend now uses pattern attribute for validation when available - - Respects schema-driven field visibility - -5. ✅ Schema-driven testing - - Created yaml_loader.py to extract examples from schema - - Tests automatically use all schema examples - - Added visibility step handling (GPU_NODE, FIRST_NODE, CERT_OPTION) - - Fixed double-click issue for checkbox validation - -**Benefits:** -- Schema is the single source of truth (no duplication) -- Adding new fields only requires updating YAML schema -- Tests automatically stay in sync with schema -- Frontend validation driven by schema patterns -- Consistent validation across backend and frontend - -**Files:** -- `schema/bloom.yaml.schema.yaml` (enhanced with examples) -- `internal/config/schema_loader.go` (NEW - 192 lines) -- `internal/config/schema_loader_test.go` (NEW - 170 lines) -- `internal/config/schema.go` (reduced from 285 to 16 lines) -- `tests/robot/schema_validation.robot` (NEW - 112 lines) -- `tests/robot/yaml_loader.py` (NEW - 123 lines) -- `tests/robot/validation.robot` (reduced from 266 to 21 lines) - -**Deliverable:** ✅ Schema-driven architecture (COMPLETE) - -### Phase 5: Refactoring & UI Cleanup (In Progress) - -**Goal:** Improve code organization, schema handling, and UI polish - -**Schema Handling Tasks:** -1. ⬜ Consolidate validation logic - - Move all pattern validation to use schema types - - Eliminate any remaining hardcoded validation - - Ensure frontend and backend use same patterns - -2. ⬜ Schema versioning - - Add schema version field - - Plan for future schema migrations - - Document schema extension process - -3. ⬜ Schema documentation - - Auto-generate field reference from schema - - Add inline examples to all fields - - Document pattern syntax and requirements - -4. ⬜ Error message improvements - - Ensure all patterns have clear error messages - - Add contextual help for common validation failures - - Improve error display in UI - -**UI Cleanup Tasks:** -1. ⬜ Form organization - - Review section groupings - - Improve conditional field visibility logic - - Add field dependency visualization - -2. ⬜ Validation feedback - - Real-time validation indicators - - Clear error/success states - - Help text for complex fields - -3. ⬜ YAML preview improvements - - Syntax highlighting - - Copy to clipboard button - - Download with timestamp - -4. ⬜ Accessibility - - Keyboard navigation - - Screen reader support - - Focus management - -5. ⬜ Responsive design - - Mobile-friendly layout - - Touch-friendly controls - - Adaptive form sections - -**Code Organization Tasks:** -1. ⬜ Extract reusable components - - Form field generators - - Validation utilities - - Schema parsers - -2. ⬜ Consistent error handling - - Standardize error types - - Improve error propagation - - Add error context - -3. ⬜ Code documentation - - Add godoc comments - - Document public APIs - - Add usage examples - -4. ⬜ Test coverage improvements - - Add unit tests for schema loader - - Test edge cases in validation - - Add integration tests - -**Deliverable:** Polished, maintainable codebase with excellent UX - -### Phase 6: Documentation & Polish (Week 5-6) - -**Tasks:** -1. Update README with v2 usage -2. Migration guide from v1 to v2 -3. API documentation for playbook variables -4. Video/GIF demos of Web UI -5. Performance optimization -6. Error message improvements - -**Deliverable:** Production-ready Bloom V2 - -## File Structure (Target) - -``` -cluster-bloom/ -├── cmd/ -│ ├── bloom/ -│ │ └── main.go # Main bloom binary -│ └── bloom-config/ -│ └── main.go # Config generator CLI -├── pkg/ -│ ├── ansible/ -│ │ ├── container.go # Ansible container runtime -│ │ ├── runner.go # Playbook execution -│ │ └── cache.go # Image caching -│ ├── config/ -│ │ ├── parser.go # YAML parsing -│ │ ├── validator.go # Config validation -│ │ └── generator.go # Config generation logic -│ └── webui/ -│ ├── server.go # Web server -│ ├── handlers.go # API handlers -│ └── static/ # Embedded web assets -│ ├── index.html -│ ├── app.js -│ └── styles.css -├── playbooks/ -│ ├── main.yml # Orchestration playbook -│ ├── rocm.yml -│ ├── disks.yml -│ ├── rke2.yml -│ ├── longhorn.yml -│ ├── metallb.yml -│ └── cluster-forge.yml -├── tests/ -│ ├── robot/ # Robot Framework tests -│ └── e2e/ # Existing e2e tests -├── tmp/ # Planning docs (gitignored) -└── dist/ - └── bloom # Compiled binary -``` - -## Key Decisions - -### 1. Backward Compatibility - -**Question:** Should v2 support v1 deployment logic? - -**Decision:** NO - clean break -- V2 is a complete reimplementation -- V1 remains available on main/release branches for existing users -- V2 starts fresh with new design -- Migration guide will help users transition - -### 2. Config Format - -**Question:** Should we extend/change bloom.yaml format? - -**Decision:** Design optimal format for v2 -- Learn from v1 config structure -- Design clean, intuitive schema -- Use schema versioning for future changes -- Provide conversion tool for v1 → v2 configs (optional, low priority) - -### 3. Web UI Distribution - -**Question:** Separate binary or embedded? - -**Decision:** Embedded in main bloom binary -- Single binary distribution -- `bloom webui` command launches server -- No separate deployment needed - -### 4. Ansible Image - -**Question:** Which Ansible container image to use? - -**Decision:** Use `willhallonline/ansible:latest` (PoC proven) -- ~500MB download (one-time) -- Cached at `/var/lib/bloom/rootfs` -- Contains full Ansible with common modules - -### 5. State Management - -**Question:** Track deployment state? - -**Decision:** Phase 1 - No state tracking (rely on Ansible idempotency) -- Future: Consider state file for resume capability -- Let Ansible modules handle "already done" checks - -## Success Metrics - -1. **Functionality:** - - CLI wizard generates valid configs (100% success rate) - - Web UI generates valid configs (100% success rate) - - Bloom deploys clusters successfully (same success rate as v1) - -2. **User Experience:** - - Config generation time: < 5 minutes (CLI/Web UI) - - First-time deployment: similar to v1 (~10-15 min) - - Subsequent runs: < 5 minutes (idempotent) - -3. **Code Quality:** - - Robot Framework tests: > 80% coverage - - All tests passing in CI/CD - - Documentation complete - -4. **Adoption:** - - Internal team uses v2 for new deployments - - Migration guide helps v1 → v2 transition - -## Risks & Mitigations - -### Risk 1: Ansible Learning Curve -**Impact:** Medium -**Mitigation:** Reference PoC, use well-documented modules, start simple - -### Risk 2: Binary Size Increase -**Impact:** Low -**Mitigation:** Web UI assets are small, overall binary still < 30MB - -### Risk 3: Breaking Changes from V1 -**Impact:** Medium (V2 is separate implementation) -**Mitigation:** -- V1 remains on main branch for existing users -- Clear documentation that V2 is new implementation -- Migration guide for transitioning users -- V2 developed on bloom-v2 branch until ready - -### Risk 4: Web UI Complexity -**Impact:** Medium -**Mitigation:** Keep UI simple, use lightweight framework, progressive enhancement - -## Timeline - -| Phase | Duration | Deliverable | -|-------|----------|-------------| -| Phase 1: Foundation | 2 weeks | Working bloom binary with Ansible | -| Phase 2: CLI Generator | 1 week | Interactive config wizard | -| Phase 3: Web UI | 2 weeks | Web-based config generator | -| Phase 4: Testing | 1 week | Robot Framework test suite | -| Phase 5: Documentation | 1 week | Complete docs and polish | -| **Total** | **7 weeks** | **Production-ready Bloom V2** | - -## Design Questions to Answer (Phase 1) - -Before starting implementation, need to decide: - -1. **Execution Engine:** - - Pure Go vs embedded Ansible vs hybrid? - - Tradeoffs: complexity, maintainability, capabilities - -2. **Config Schema:** - - Minimal required fields vs comprehensive? - - How to handle optional components? - - Validation strategy? - -3. **Task Orchestration:** - - Linear steps vs DAG? - - Parallel execution? - - Retry/rollback mechanisms? - -4. **Code Organization:** - - Monorepo vs separate CLI/server? - - Package structure? - - Plugin architecture? - -## Implementation Status - -### ✅ Completed - -**Phase 3: Web UI (HIGH PRIORITY)** - COMPLETE -- ✅ Web-based bloom.yaml generator (`bloom webui` command) -- ✅ Schema-driven dynamic form generation from Go backend -- ✅ HTML5 real-time validation with V1 pattern compatibility -- ✅ Conditional field visibility based on dependencies -- ✅ File save to server's cwd with custom filename -- ✅ Minimal YAML output (only non-default values) -- ✅ FIRST_NODE and GPU_NODE always included -- ✅ Port management (auto-discovery from 62078, explicit with --port) -- ✅ Robot Framework tests (10 essential tests, 100% passing) -- ✅ V1/V2 schema parity - all V1 arguments present - -**Commits:** -- `eb4d523` feat(webui): add file save with custom filename and minimal YAML output -- `8f5d384` feat(webui): implement schema-driven validation with V1 pattern compatibility -- `3a7b079` feat(webui): add HTML5 field validation with real-time feedback -- `9a3895a` feat(webui): implement smart port management with auto-discovery - -**Phase 1: Core Architecture** - COMPLETE ✅ -- ✅ Config parser (pkg/config/schema.go - single source of truth) -- ✅ Config validator (pkg/config/validator.go - schema-driven validation) -- ✅ Config generator (Web UI + YAML generation) -- ✅ **Deployment engine** - IMPLEMENTED - -### ✅ Completed - -**Phase 1b: Ansible Deployment Engine** - COMPLETE ✅ - -**Design Complete** - All architectural decisions finalized (2025-12-10) - -**Architecture:** -- Command: `bloom ansible ` subcommand -- Runtime: pkg/ansible/runtime package (extracted from bloomv2 experiment) -- Playbooks: Embedded in pkg/ansible/runtime/playbooks/ -- Config: Reuses pkg/config package (no conversion needed) -- Filtering: DISABLED_STEPS/ENABLED_STEPS deferred to v2.1 - -**Implementation Tasks:** -1. ✅ Add go-containerregistry dependency to go.mod -2. ✅ Create pkg/ansible/runtime package (container execution) - - container.go: Image pulling and caching - - executor_linux.go: Linux namespace container execution - - executor_other.go: Stub for non-Linux platforms - - playbook.go: Playbook execution orchestration -3. ✅ Copy and embed playbooks/ from experiments/bloomv2 - - cluster-bloom.yaml (main deployment playbook) - - hello.yml (test playbook) -4. ✅ Playbooks already use UPPERCASE variable names -5. ✅ Add ansible subcommand to cmd/main.go -6. ✅ Wire up bloom.yaml reading with pkg/config - - Config loading with LoadConfig() - - Validation with Validate() - - Pass config as Ansible extra vars -7. ⬜ Test basic deployment workflow (NEXT STEP) - -### 📋 Not Started - -**Phase 2: CLI Generator** - LOW PRIORITY -- Interactive CLI wizard for bloom.yaml generation -- Deprioritized since Web UI is complete and working - -**Phase 4: Testing (Deployment)** - BLOCKED -- Waiting for ansible command implementation -- Web UI tests already complete - -**Phase 5: Documentation & Polish** - FUTURE -- Update README with v2 usage -- Migration guide from v1 to v2 -- Performance optimization - -## Next Actions - -1. ✅ Create bloom-v2 branch in cluster-bloom -2. ✅ Write planning document -3. ✅ Design and implement Web UI -4. ✅ Implement schema-driven validation -5. ✅ Add Robot Framework tests for Web UI -6. ✅ Implement `bloom ansible` command - - ✅ Copy pattern from /workspace/platform/experiments/bloomv2 - - ✅ Adapt to use bloom.yaml as input - - ✅ Embed cluster-bloom.yaml playbook - - ✅ Add to cluster-bloom repository -7. **→ Test ansible command with generated bloom.yaml files (CURRENT)** - - Test hello.yml playbook execution - - Verify config loading and validation - - Test cluster-bloom.yaml with sample config -8. Complete deployment tests -9. Documentation and polish - -## Outstanding Design Questions - -### 1. ✅ Execution Engine - ANSWERED -**Decision:** Embedded Ansible (from platform/experiments/bloomv2) -- Proven pattern in PoC -- Self-contained binary -- No external dependencies -- Uses willhallonline/ansible:latest image - -### 2. ✅ Config Schema - ANSWERED -**Decision:** Comprehensive schema matching V1 -- 26 arguments across 6 sections -- Pattern validation for all fields -- Conditional field visibility -- Type-safe with proper defaults - -### 3. ✅ Web UI Distribution - ANSWERED -**Decision:** Embedded in main bloom binary -- `bloom webui` command -- Static assets embedded via go:embed -- No separate deployment - -### 4. ✅ Task Orchestration - ANSWERED -**Decision:** Use existing cluster-bloom.yaml playbook -- Embed entire playbooks/ directory from experiments/bloomv2 -- Update playbook to use UPPERCASE variable names (no conversion needed) -- Step filtering (DISABLED_STEPS/ENABLED_STEPS) deferred to v2.1 - -### 5. ✅ Command Structure - ANSWERED -**Decision:** Subcommand architecture -- `bloom ansible ` as subcommand in cmd/bloom/main.go -- Consistent with `bloom webui` pattern -- Single binary distribution - -### 6. ✅ Config Reading - ANSWERED -**Decision:** Reuse existing internal/config package -- Parse bloom.yaml using internal/config -- Validate with existing validators -- Pass Config map directly as Ansible extra vars (-e KEY=value) -- No conversion logic needed - -### 7. ✅ Runtime Architecture - ANSWERED -**Decision:** Extract into pkg/ansible/runtime package -- Clean separation from command logic -- Reusable container runtime: image pulling, layer extraction, namespace creation -- Better testing isolation -- More maintainable structure - -## Current Blockers - -**None** - Path forward is clear: -1. Implement `bloom ansible` command using bloomv2 experiment pattern -2. Existing playbooks in platform/experiments/bloomv2/playbooks/cluster-bloom.yaml -3. Integration with bloom.yaml schema already defined - -## Updated Timeline - -| Phase | Status | Actual Duration | Notes | -|-------|--------|-----------------|-------| -| Phase 1: Foundation | 🔄 Partial | 2 weeks | Config done, need deployment | -| Phase 2: CLI Generator | ⏸️ Skipped | - | Web UI supersedes this | -| Phase 3: Web UI | ✅ Complete | 3 weeks | Done with full validation | -| **Phase 1b: Ansible Command** | 📋 Next | ~1 week | Copy pattern from bloomv2 | -| Phase 4: Testing | 📋 Pending | ~1 week | After ansible command | -| Phase 5: Documentation | 📋 Pending | ~3 days | Final polish | -| **Remaining** | - | **~2-3 weeks** | **To production** | - ---- - -**Status:** Phase 3 (Web UI) complete. Next: Implement ansible deployment engine. -**Last Updated:** 2025-12-10 -**Branch:** bloom-v2 -**Issue:** #609 (Open, In Progress) diff --git a/tmp/BLOOM_V2_PRD.md b/tmp/BLOOM_V2_PRD.md deleted file mode 100644 index 197aba9..0000000 --- a/tmp/BLOOM_V2_PRD.md +++ /dev/null @@ -1,576 +0,0 @@ -# Product Requirements Document: ClusterBloom V2 - -**Version:** 2.0 -**Status:** In Development -**Issue:** #609 - Bloom V2 -**Branch:** bloom-v2 -**Last Updated:** 2025-12-10 - -## Executive Summary - -ClusterBloom V2 is a complete reimagination of the Kubernetes cluster deployment tool, transitioning from imperative Go code to a declarative Ansible-based architecture while adding a modern web-based configuration generator. V2 maintains all V1 capabilities for AMD GPU environments while dramatically improving maintainability, extensibility, and user experience. - -## Product Overview - -### Purpose -ClusterBloom V2 automates Kubernetes cluster deployment with AMD GPU support through: -- **Web-based configuration generator** - No more manual YAML editing -- **Declarative Ansible playbooks** - More maintainable than shell scripts -- **Self-contained binary** - No external dependencies (Docker, Python, or pre-installed Ansible) -- **Separation of concerns** - Config generation decoupled from deployment - -### Target Users -- DevOps Engineers managing AMD GPU workloads -- Platform Teams deploying Kubernetes infrastructure -- Organizations requiring automated cluster provisioning with AMD GPU support -- Teams needing reliable storage configuration with Longhorn -- **NEW:** Users preferring web interfaces over CLI/YAML editing - -### What's New in V2 - -**Architecture Changes:** -- ✅ Web UI for configuration generation (no manual YAML editing required) -- ✅ Schema-driven validation (V1 pattern compatibility) -- ✅ Ansible playbooks instead of Go shell execution -- ✅ Self-contained binary with embedded Ansible runtime -- ✅ Minimal YAML output (only non-default values) - -**User Experience Improvements:** -- ✅ Real-time form validation in browser -- ✅ Conditional field visibility (smart forms) -- ✅ Custom filename support for generated configs -- ✅ File saved to server's working directory -- ✅ Port auto-discovery (no conflicts) - -**Developer Experience:** -- ✅ Single source of truth for configuration schema -- ✅ Easier to extend (add playbooks vs modify Go code) -- ✅ Better idempotency (Ansible modules) -- ✅ Clean separation: generate config → deploy with Ansible - -## Core Features - -### 1. Web-Based Configuration Generator ⭐ NEW - -Browser-based configuration wizard that generates valid `bloom.yaml` files without manual editing. - -**Features:** -- Schema-driven dynamic form generation -- Real-time HTML5 validation with custom error messages -- Conditional field visibility based on dependencies -- 6 organized sections: Basic, Node, Storage, SSL/TLS, Advanced, CLI Options -- Preview generated YAML before saving -- Save with custom filename to server's current directory -- Port management (auto-discovery from 62078 or explicit with `--port`) - -**Technical Implementation:** -- Backend: Go with embedded static assets (`go:embed`) -- Frontend: Vanilla JavaScript (no external dependencies) -- Validation: HTML5 patterns matching V1 validators -- Schema: Single source of truth in `internal/config/schema.go` - -**User Flow:** -``` -1. Run: bloom webui -2. Open browser to http://localhost:62080 -3. Fill configuration form with real-time validation -4. Click "Generate bloom.yaml" -5. Preview YAML output -6. Save with custom filename -7. Use saved bloom.yaml with deployment command -``` - -**Current Status:** ✅ COMPLETE - -**[📄 Implementation Details](./BLOOM_V2_PLAN.md#phase-3-web-ui)** - -### 2. Ansible-Based Deployment Engine 🔄 IN PROGRESS - -Self-contained Go binary that runs embedded Ansible playbooks without requiring Docker, Python, or pre-installed Ansible. - -**Features:** -- Embedded Ansible runtime using Linux namespaces -- Containerized Ansible image cached locally (~500MB one-time download) -- Host filesystem mounted at `/host` inside container -- Reads `bloom.yaml` and passes as Ansible variables (UPPERCASE, no conversion) -- Embedded playbooks from experiments/bloomv2 -- Step filtering deferred to v2.1 - -**Technical Implementation:** -- Command: `bloom ansible ` subcommand -- Runtime: `pkg/ansible/runtime` package (extracted from bloomv2 experiment) -- Config: Reuses `internal/config` package for parsing/validation -- Container image: `willhallonline/ansible:latest` -- Image library: `go-containerregistry` for pulling/caching -- Isolation: Linux namespaces (UTS, PID, Mount) -- Cache location: `/var/lib/bloom/rootfs` -- Logs: `/var/log/bloom/run-*.log` - -**Architecture:** -``` -pkg/ansible/ -├── runtime/ -│ ├── container.go # Image pull/cache -│ ├── executor.go # Namespace creation & execution -│ └── playbook.go # Playbook running logic -└── playbooks/ # Embedded via go:embed - └── cluster-bloom.yaml (UPPERCASE vars) -``` - -**User Flow:** -``` -1. Generate bloom.yaml via Web UI or manually -2. Run: sudo bloom ansible bloom.yaml -3. First run: Downloads Ansible image (~500MB) -4. Subsequent runs: Uses cached image -5. Executes cluster-bloom.yaml playbook -6. Cluster deployed and ready -``` - -**Current Status:** 🎯 DESIGN COMPLETE (2025-12-10) - Ready for implementation - -**[📄 Reference Implementation](https://github.com/silogen/platform/tree/bloomV2/experiments/bloomv2)** - -### 3. Automated RKE2 Kubernetes Deployment - -Same as V1 - automated deployment of production-ready RKE2 clusters. - -**V2 Changes:** -- Implemented via Ansible playbook instead of Go code -- Ansible tasks in `playbooks/cluster-bloom.yaml` -- Idempotent by default (Ansible module behavior) - -**Status:** ✅ Playbook exists, needs integration - -### 4. AMD GPU Support with ROCm - -Same as V1 - automated AMD GPU driver installation and configuration. - -**V2 Changes:** -- ROCm installation via Ansible apt module -- Device detection via Ansible facts -- Permission configuration via Ansible file module - -**Status:** ✅ Playbook exists, needs integration - -### 5. Storage Management with Longhorn - -Same as V1 - distributed block storage with automatic disk detection. - -**V2 Changes:** -- Disk preparation via Ansible mount/filesystem modules -- Longhorn deployment via Ansible kubernetes modules -- Better error handling with Ansible's built-in retries - -**Status:** ✅ Playbook exists, needs integration - -### 6. Network Configuration - -Same as V1 - MetalLB load balancing, firewall configuration, multipath. - -**V2 Changes:** -- Firewall rules via Ansible ufw/firewalld modules -- MetalLB config via Ansible template module -- Chrony setup via Ansible service module - -**Status:** ✅ Playbook exists, needs integration - -### 7. Configuration Management - -**V2 Improvements:** -- ✅ Web UI for guided configuration (PRIMARY METHOD) -- ✅ Schema-driven validation (single source of truth) -- ✅ Real-time validation in browser -- ✅ Minimal YAML output (only non-default values) -- ✅ V1 pattern compatibility (all validators match) - -**Configuration Sources (Priority Order):** -1. Web UI generated YAML (recommended) -2. Manually written YAML -3. Environment variables (via `.env` file) -4. CLI flags (for Ansible execution) - -**Status:** ✅ COMPLETE - -### 8. TLS Certificate Management - -Same as V1 - three options (cert-manager, existing, self-signed). - -**V2 Changes:** -- Certificate deployment via Ansible k8s module -- Cert-manager installation via Ansible helm module -- Certificate validation via Ansible openssl module - -**Status:** ✅ Playbook exists, needs integration - -### 9. Validation and Testing - -**Pre-deployment Validation:** -- ✅ Web UI: Real-time form validation -- ✅ Backend: Schema validation before Ansible execution -- 📋 Ansible: System requirements check tasks - -**Testing Framework:** -- ✅ Robot Framework tests for Web UI (10 tests, 100% passing) -- 📋 Robot Framework tests for Ansible deployment (pending) -- 📋 E2E tests for full workflow (pending) - -**Status:** ✅ Web UI tested, deployment tests pending - -## Technical Architecture - -### V2 Architecture Shift - -**From (V1):** -``` -User → Manual YAML → bloom binary → Go code + shell scripts → Deployed cluster -``` - -**To (V2):** -``` -User → Web UI → bloom.yaml → bloom ansible → Ansible playbooks → Deployed cluster - ↓ - Validation -``` - -### Component Organization - -``` -cluster-bloom/ -├── cmd/ -│ ├── bloom/ # Main binary -│ │ └── main.go # Entry point with webui command -│ └── ansible/ # Ansible command (NEW) -│ └── main.go # Embedded Ansible runtime -├── pkg/ -│ ├── config/ -│ │ ├── schema.go # ✅ Single source of truth -│ │ ├── validator.go # ✅ Field validators (V1 compat) -│ │ ├── generator.go # ✅ YAML generation -│ │ └── types.go # ✅ Type definitions -│ ├── webui/ -│ │ ├── server.go # ✅ Web server -│ │ ├── handlers.go # ✅ API endpoints -│ │ └── static/ # ✅ Embedded web assets -│ └── ansible/ # 📋 Ansible runtime (TODO) -│ ├── container.go # Container runtime -│ ├── runner.go # Playbook execution -│ └── cache.go # Image caching -├── cmd/bloom/web/ -│ └── static/ # ✅ Web UI assets -│ ├── index.html -│ ├── js/ -│ │ ├── app.js -│ │ ├── form.js -│ │ ├── validator.js -│ │ └── schema.js -│ └── css/styles.css -├── playbooks/ # 📋 Ansible playbooks (TODO) -│ └── cluster-bloom.yaml # Main orchestration playbook -├── internal/config/ # ✅ Configuration handling -├── tests/robot/ # ✅ Robot Framework tests -│ ├── api.robot # API tests -│ ├── ui.robot # UI tests -│ └── validation.robot # Validation tests -└── dist/ - └── bloom-v2 # Compiled binary -``` - -### API Endpoints - -**Web UI Backend:** -- `GET /` - Serve Web UI -- `GET /api/schema` - Return configuration schema -- `POST /api/validate` - Validate configuration -- `POST /api/generate` - Generate YAML preview -- `POST /api/save` - Save YAML to file - -**Status:** ✅ All implemented - -### Data Flow - -**Configuration Generation:** -``` -Browser → /api/schema → Schema JSON -Browser Form → /api/validate → Validation errors -Browser Form → /api/save → bloom.yaml file -``` - -**Deployment Execution:** -``` -bloom.yaml → Ansible vars → Embedded playbook → Deployed cluster -``` - -## User Experience - -### Installation Workflows - -#### Web UI Configuration (Recommended) -```bash -# Start Web UI -bloom webui - -# Or with custom port -bloom webui --port 9090 - -# Browser opens to http://localhost:62080 -# Fill form, generate bloom.yaml -# Click "Save bloom.yaml" -``` - -#### Deploy with Generated Config -```bash -# After generating bloom.yaml via Web UI -sudo bloom ansible bloom.yaml - -# First run downloads Ansible image (~500MB) -# Subsequent runs use cached image -``` - -#### First Node Setup -```bash -# 1. Generate config via Web UI (FIRST_NODE=true) -bloom webui - -# 2. Deploy cluster -sudo bloom ansible bloom.yaml -``` - -#### Additional Node Setup -```bash -# 1. Generate config via Web UI (FIRST_NODE=false) -# Provide SERVER_IP and JOIN_TOKEN from first node -bloom webui - -# 2. Join cluster -sudo bloom ansible bloom.yaml -``` - -#### Manual Configuration (Advanced) -```bash -# Create bloom.yaml manually -cat > bloom.yaml <95% (pending ansible command) - -### Secondary Metrics -- **Binary Size**: Target <50MB (TBD after Ansible embedding) -- **First Run Time**: Target <5 minutes (image download) -- **Subsequent Run Time**: Target <30 minutes (cluster deployment) -- **Test Coverage**: Web UI 100% ✅, Deployment 0% 📋 - -## Known Limitations - -### V2 Specific -1. **Ansible Command Not Implemented**: Deployment engine pending -2. **No CLI Wizard**: Only Web UI for config generation (acceptable trade-off) -3. **Requires Root**: Ansible runtime needs root for namespaces -4. **First Run Download**: ~500MB Ansible image (one-time, cached) - -### Inherited from V1 -1. **No Backup/Recovery**: Same as V1 -2. **No Built-in Monitoring**: Same as V1 -3. **Ubuntu Only**: Same as V1 -4. **No HA Automation**: Same as V1 - -## Future Roadmap - -### Immediate (Next 2-3 Weeks) -1. **Implement Ansible Command**: Top priority -2. **Deployment Testing**: Robot Framework tests -3. **Documentation**: README, migration guide -4. **Release V2.0**: Production-ready - -### Near-term (3-6 Months) -1. **CLI Config Generator**: If Web UI proves insufficient -2. **Enhanced Validation**: Pre-flight system checks via Ansible -3. **Monitoring Integration**: Optional Prometheus/Grafana playbook -4. **Backup Playbooks**: Automated backup via Ansible - -### Medium-term (6-12 Months) -1. **Multi-OS Support**: Ansible playbooks for CentOS/RHEL -2. **Cloud Playbooks**: AWS/Azure/GCP specific tasks -3. **HA Playbooks**: Automated HA configuration -4. **Scaling Playbooks**: Automated cluster scaling - -## Migration from V1 - -### Compatibility -- ✅ Config schema matches V1 (all arguments present) -- ✅ Validation patterns match V1 exactly -- ✅ Same deployment outcomes expected - -### Migration Path -1. **Generate new config**: Use V2 Web UI instead of editing YAML -2. **Deploy with Ansible**: Use `bloom ansible` instead of `bloom` -3. **Same clusters**: V2 deploys identical clusters to V1 - -### Breaking Changes -- Command changed: `bloom` → `bloom ansible bloom.yaml` -- Requires root: Ansible runtime needs root access -- First run slower: One-time Ansible image download - -## Conclusion - -ClusterBloom V2 represents a significant architectural improvement while maintaining full compatibility with V1 cluster deployments. The Web UI dramatically improves user experience for configuration generation, while the Ansible-based deployment engine provides better maintainability and extensibility for future enhancements. - -**Key Achievements:** -- ✅ Web UI eliminates manual YAML editing -- ✅ Schema-driven validation ensures correctness -- ✅ V1 pattern compatibility maintained -- ✅ Comprehensive testing framework - -**Remaining Work:** -- 🔄 Ansible deployment engine (2-3 weeks) -- 📋 Deployment testing (~1 week) -- 📋 Documentation (~3 days) - -**Timeline to Production:** ~2-3 weeks - ---- - -**Status:** Phase 3 (Web UI) complete. Phase 1b (Ansible) in progress. -**Last Updated:** 2025-12-10 -**Branch:** bloom-v2 -**Issue:** #609 (Open, In Progress) diff --git a/tmp/BLOOM_YAML_SPEC_V1.md b/tmp/BLOOM_YAML_SPEC_V1.md deleted file mode 100644 index dce2896..0000000 --- a/tmp/BLOOM_YAML_SPEC_V1.md +++ /dev/null @@ -1,112 +0,0 @@ -# bloom.yaml Specification (V1 - Must Remain Identical in V2) - -**Decision:** Bloom V2 reimplements the code but keeps the exact same bloom.yaml format. - -## All Configuration Variables - -Based on v1 implementation in `pkg/args/args_test.go`: - -```yaml -# Node Configuration -FIRST_NODE: true # bool - Is this the first node in cluster? -GPU_NODE: true # bool - Does this node have GPUs? -CONTROL_PLANE: false # bool - Control plane node? (only when FIRST_NODE=false) - -# Cluster Join (for additional nodes) -SERVER_IP: "" # string - RKE2 server IP (required when FIRST_NODE=false) -JOIN_TOKEN: "" # string - Join token (required when FIRST_NODE=false) - -# Domain & Networking -DOMAIN: "" # string - Domain name (required when FIRST_NODE=true) - -# Certificates -USE_CERT_MANAGER: false # bool - Use cert-manager + Let's Encrypt? -CERT_OPTION: "" # enum: "existing" or "generate" (when USE_CERT_MANAGER=false) -TLS_CERT: "" # file path - TLS cert (when CERT_OPTION=existing) -TLS_KEY: "" # file path - TLS key (when CERT_OPTION=existing) - -# Authentication -ADDITIONAL_OIDC_PROVIDERS: [] # array - Additional OIDC providers - -# GPU/ROCm -ROCM_BASE_URL: "https://repo.radeon.com/amdgpu-install/6.3.2/ubuntu/" # url (when GPU_NODE=true) - -# Storage -CLUSTER_DISKS: "" # string - Comma-separated disk paths -CLUSTER_PREMOUNTED_DISKS: "" # string - Premounted disk paths -NO_DISKS_FOR_CLUSTER: false # bool - Skip disk operations? -SKIP_RANCHER_PARTITION_CHECK: false # bool - Skip partition size check? - -# ClusterForge -CLUSTERFORGE_RELEASE: "https://..." # url or "none" -CF_VALUES: "" # string - Path to CF values file - -# Step Control -DISABLED_STEPS: "" # string - Comma-separated step names to skip -ENABLED_STEPS: "" # string - Comma-separated steps to run (if empty, run all) - -# Misc -PRELOAD_IMAGES: "" # string - Images to preload -``` - -## Example: First Node - -```yaml -FIRST_NODE: true -GPU_NODE: false -DOMAIN: cluster.example.com -USE_CERT_MANAGER: false -CERT_OPTION: generate -CLUSTER_DISKS: /dev/nvme0n1,/dev/nvme1n1 -NO_DISKS_FOR_CLUSTER: false -CLUSTERFORGE_RELEASE: none -PRELOAD_IMAGES: "" -``` - -## Example: Additional Node - -```yaml -FIRST_NODE: false -GPU_NODE: false -SERVER_IP: 10.100.100.11 -JOIN_TOKEN: K10abc123...xyz::server:abc123 -CLUSTER_DISKS: /dev/nvme0n1,/dev/nvme1n1 -NO_DISKS_FOR_CLUSTER: false -CLUSTERFORGE_RELEASE: none -``` - -## Dependencies - -Some fields only apply when others are set: - -- `CONTROL_PLANE`: Only when `FIRST_NODE=false` -- `SERVER_IP`, `JOIN_TOKEN`: Required when `FIRST_NODE=false` -- `DOMAIN`, `USE_CERT_MANAGER`, `CERT_OPTION`: Only when `FIRST_NODE=true` -- `TLS_CERT`, `TLS_KEY`: Required when `CERT_OPTION=existing` -- `ROCM_BASE_URL`: Only when `GPU_NODE=true` - -## Validation Rules - -From v1 validators: -- `JOIN_TOKEN`: Must be valid RKE2/K3s token format -- Step names in `DISABLED_STEPS`/`ENABLED_STEPS`: Must match valid step IDs -- `DISABLED_STEPS`/`ENABLED_STEPS`: Cannot both be set -- URLs: Must be valid URLs -- File paths: Must exist -- IP addresses: Must be valid IPs - -## What V2 Must Do - -1. **Parse exactly this format** - no changes to field names or structure -2. **Same validation rules** - maintain compatibility -3. **Same defaults** - users expect same behavior -4. **Same dependencies** - conditional fields work the same way - -## What Web UI/CLI Must Generate - -Web UI and CLI wizard must generate bloom.yaml files in exactly this format with these exact field names. - ---- - -**Status:** V1 spec documented for V2 implementation -**Date:** 2025-12-08 From 040f9f27f6e35cd33c9dcde17bd1fccd4c34b68b Mon Sep 17 00:00:00 2001 From: Mika Ranta Date: Fri, 16 Jan 2026 10:24:09 +0200 Subject: [PATCH 3/3] fix: version info --- Makefile | 24 ------------------------ cmd/main.go | 7 ++++++- justfile | 12 ++++++++++++ 3 files changed, 18 insertions(+), 25 deletions(-) delete mode 100644 Makefile create mode 100644 justfile diff --git a/Makefile b/Makefile deleted file mode 100644 index aef3445..0000000 --- a/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -.PHONY: build clean help - -BINARY_NAME=bloom-v2 -BUILD_DIR=dist -CMD_DIR=cmd/bloom - -help: - @echo "Bloom V2 Build" - @echo "" - @echo "Targets:" - @echo " build Build the bloom-v2 binary" - @echo " clean Remove build artifacts" - @echo " help Show this help message" - -build: - @echo "Building $(BINARY_NAME)..." - @mkdir -p $(BUILD_DIR) - CGO_ENABLED=0 go build -o $(BUILD_DIR)/$(BINARY_NAME) ./$(CMD_DIR) - @echo "Built: $(BUILD_DIR)/$(BINARY_NAME)" - -clean: - @echo "Cleaning build artifacts..." - @rm -rf $(BUILD_DIR) - @echo "Clean complete" diff --git a/cmd/main.go b/cmd/main.go index a118886..a476bff 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -13,6 +13,7 @@ import ( ) var ( + Version string // Set via ldflags during build port int playbookName string dryRun bool @@ -129,7 +130,11 @@ Requires a configuration file (typically bloom.yaml). Use --playbook to specify Use: "version", Short: "Show version information", Run: func(cmd *cobra.Command, args []string) { - fmt.Println("Bloom V2.0.0-alpha") + if Version != "" { + fmt.Printf("%s\n", Version) + } else { + fmt.Println("dev") + } }, } diff --git a/justfile b/justfile new file mode 100644 index 0000000..2a42680 --- /dev/null +++ b/justfile @@ -0,0 +1,12 @@ +# ClusterBloom build recipes + +# Default recipe - show available commands +default: + @just --list + +# Build the bloom binary with optional version parameter +build version="dev-build": + @echo "Building bloom (version: {{version}})..." + @mkdir -p dist + CGO_ENABLED=0 go build -ldflags="-X 'github.com/silogen/cluster-bloom/cmd.Version={{version}}'" -o dist/bloom + @echo "Built: dist/bloom" \ No newline at end of file