diff --git a/.github/ISSUE_TEMPLATE/decision.yml b/.github/ISSUE_TEMPLATE/decision.yml new file mode 100644 index 0000000..b0bf032 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/decision.yml @@ -0,0 +1,34 @@ +name: Decision Record Proposal +description: Propose or revise an architecture decision +labels: [decision] +body: + - type: input + id: adr + attributes: + label: Proposed ADR ID/Title + placeholder: ADR-00X - Title + validations: + required: true + - type: textarea + id: context + attributes: + label: Context + description: What problem or tradeoff is being addressed? + validations: + required: true + - type: textarea + id: options + attributes: + label: Options Considered + validations: + required: true + - type: textarea + id: recommendation + attributes: + label: Recommendation + validations: + required: true + - type: textarea + id: consequences + attributes: + label: Expected Consequences diff --git a/.github/ISSUE_TEMPLATE/epic.yml b/.github/ISSUE_TEMPLATE/epic.yml new file mode 100644 index 0000000..1b0050e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/epic.yml @@ -0,0 +1,27 @@ +name: Epic +description: Track a multi-step initiative with multiple issues +labels: [epic] +body: + - type: textarea + id: objective + attributes: + label: Objective + description: What larger outcome does this epic deliver? + validations: + required: true + - type: textarea + id: milestones + attributes: + label: Milestones + placeholder: | + - [ ] M1 ... + - [ ] M2 ... 
+ - type: textarea + id: child_issues + attributes: + label: Child Issues + description: Link planned tasks + - type: textarea + id: risks + attributes: + label: Risks / Unknowns diff --git a/.github/ISSUE_TEMPLATE/task.yml b/.github/ISSUE_TEMPLATE/task.yml new file mode 100644 index 0000000..f027cb3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/task.yml @@ -0,0 +1,31 @@ +name: Task +description: Track a concrete, reviewable unit of work +title: "[Task] " +labels: [task] +body: + - type: textarea + id: summary + attributes: + label: Summary + description: What needs to be done? + validations: + required: true + - type: textarea + id: scope + attributes: + label: Scope + description: What is in scope and out of scope? + - type: textarea + id: acceptance + attributes: + label: Acceptance Criteria + description: What conditions indicate completion? + placeholder: | + - [ ] ... + - [ ] ... + validations: + required: true + - type: textarea + id: dependencies + attributes: + label: Dependencies / Blockers diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..72a5134 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,28 @@ +## Summary + +- What was changed? +- Why was it changed? + +## What was restructured + +- + +## What was archived + +- + +## New docs added + +- + +## Intentional TODOs / unknowns + +- + +## Review order + +1. `README.md` +2. `docs/vision/` +3. `docs/architecture/` +4. `docs/decisions/` +5. 
`docs/roadmap/` diff --git a/README.md b/README.md new file mode 100644 index 0000000..80fdd4a --- /dev/null +++ b/README.md @@ -0,0 +1,64 @@ +# Homelab Infrastructure (Documentation-First) + +This repository is the planning, architecture, and operations source of truth for a homelab platform pivoting to a **Proxmox-first** model with: + +- Virtualized Kubernetes running inside Proxmox +- Argo CD for GitOps +- TrueNAS as a separate storage/media anchor +- A documented path toward future AI/agent platform capabilities + +## Current Direction + +The prior bare-metal Talos/PXE direction is now treated as legacy reference material. The active path is: + +1. Stabilize documentation and inventory +2. Build/validate Proxmox platform foundations +3. Stand up Kubernetes in VMs on Proxmox +4. Bootstrap Argo CD and GitOps workflows +5. Incrementally migrate/support media and future AI workloads + +## Repository Structure + +- `docs/vision/` - mission, goals, and operating principles +- `docs/inventory/` - hardware, network, and service inventories +- `docs/architecture/` - target architecture and strategy documents +- `docs/decisions/` - ADRs (architecture decision records) +- `docs/runbooks/` - operational bootstrap and procedures +- `docs/roadmap/` - milestones, backlog, and current phase +- `docs/archive/` - preserved legacy material +- `proxmox/` - Proxmox-specific implementation notes/artifacts +- `kubernetes/` - Kubernetes platform design/implementation notes +- `argocd/` - Argo CD bootstrap and app-of-apps patterns +- `automation/` - automation workflow notes/scripts +- `ai/` - future AI/agent platform planning +- `scripts/` - helper scripts used by docs/runbooks +- `.github/` - issue and PR templates for GitHub-native planning + +## Current Phase + +See `docs/roadmap/current-phase.md` for the active phase and near-term focus. 
+ +## How Decisions Are Recorded + +Architectural decisions are captured as ADRs in `docs/decisions/` using a consistent format: + +- Title +- Status +- Context +- Decision +- Alternatives considered +- Consequences + +## How Work Is Tracked + +- Use GitHub Issues with templates in `.github/ISSUE_TEMPLATE/` +- Use PR review for implementation changes +- Keep roadmap and backlog updated under `docs/roadmap/` + +## Start Here + +1. `docs/vision/homelab-vision.md` +2. `docs/inventory/hardware-inventory.md` +3. `docs/architecture/target-architecture.md` +4. `docs/decisions/ADR-001-proxmox-first.md` +5. `docs/runbooks/bootstrap-sequence.md` diff --git a/ai/README.md b/ai/README.md new file mode 100644 index 0000000..a0aa109 --- /dev/null +++ b/ai/README.md @@ -0,0 +1,3 @@ +# AI Area + +Future self-hosted AI and agent platform planning artifacts belong here. diff --git a/argocd/README.md b/argocd/README.md new file mode 100644 index 0000000..6e2da10 --- /dev/null +++ b/argocd/README.md @@ -0,0 +1,8 @@ +# Argo CD Area + +Argo CD bootstrap and GitOps structure artifacts belong here. + +## TODO + +- Add Argo CD bootstrap plan +- Add initial app-of-apps repository pattern notes diff --git a/automation/README.md b/automation/README.md new file mode 100644 index 0000000..4e60e6c --- /dev/null +++ b/automation/README.md @@ -0,0 +1,3 @@ +# Automation Area + +Automation workflows and supporting assets for this repo belong here. diff --git a/docs/architecture/media-platform.md b/docs/architecture/media-platform.md new file mode 100644 index 0000000..3079ca1 --- /dev/null +++ b/docs/architecture/media-platform.md @@ -0,0 +1,27 @@ +# Media Platform Architecture (Current + Future) + +## Current State + +- Media stack runs as Docker Compose inside a Proxmox VM. +- Includes Plex plus supporting media ecosystem services. +- Uses NFS-mounted media storage from TrueNAS. +- Uses NVIDIA-backed Plex transcoding in current deployment. 
+ +## Position in Overall Platform + +- Media is important but remains a separate domain from core platform modernization. +- Core lab platform work (Proxmox, Kubernetes, Argo CD) should not be blocked by unresolved media end-state decisions. + +## Future State (Under Evaluation) + +Open questions: + +- Preferred media server platform (Plex vs Jellyfin vs Emby) +- Runtime location (apps directly on TrueNAS vs dedicated media VM) +- Long-term operational model and migration phasing + +## TODO + +- TODO: Build decision matrix for media server/runtime options. +- TODO: Define migration guardrails and rollback strategy. +- TODO: Identify which services remain VM-native vs move to other runtime models. diff --git a/docs/architecture/network-topology.md b/docs/architecture/network-topology.md new file mode 100644 index 0000000..2d97854 --- /dev/null +++ b/docs/architecture/network-topology.md @@ -0,0 +1,22 @@ +# Network Topology + +## Current Known Components + +- Ubiquiti Cloud Gateway Fiber +- Ubiquiti Pro XG 8 PoE +- Ubiquiti Lite 16 PoE +- Three MS-A2 compute nodes (10GbE) +- Separate TrueNAS storage node + +## Target Topology Intent + +- Isolate management and workload concerns where practical. +- Preserve clear path for storage traffic between compute and TrueNAS. +- Support VM-based Kubernetes networking without overcomplicating initial rollout. + +## TODO + +- TODO: Produce canonical topology diagram. +- TODO: Define management network, workload network, and storage network boundaries. +- TODO: Define VLAN IDs and subnet/IP plan. +- TODO: Define baseline firewall and east-west controls. 
diff --git a/docs/architecture/proxmox-k8s-strategy.md b/docs/architecture/proxmox-k8s-strategy.md new file mode 100644 index 0000000..e97a468 --- /dev/null +++ b/docs/architecture/proxmox-k8s-strategy.md @@ -0,0 +1,24 @@ +# Proxmox + Kubernetes Strategy + +## Strategy Statement + +Use Proxmox as the primary platform substrate, then deploy Kubernetes as virtualized nodes to improve flexibility, isolation, and recoverability compared to direct bare-metal coupling. + +## Why This Approach + +- Faster platform iteration via VM-level change management +- Cleaner separation between host lifecycle and Kubernetes lifecycle +- Easier experimentation for future platform domains + +## Proposed Sequence + +1. Validate Proxmox host readiness and cluster plan. +2. Define Kubernetes VM topology and bootstrap path. +3. Bring up a baseline Kubernetes cluster in Proxmox. +4. Bootstrap Argo CD and transition app delivery to GitOps. + +## TODO + +- TODO: Select Kubernetes provisioning method (e.g., kubeadm, Talos-in-VM, other). +- TODO: Define VM networking mode(s) and failure domain expectations. +- TODO: Document etcd/control-plane resilience targets. diff --git a/docs/architecture/security-model.md b/docs/architecture/security-model.md new file mode 100644 index 0000000..0389759 --- /dev/null +++ b/docs/architecture/security-model.md @@ -0,0 +1,20 @@ +# Security Model (Initial) + +## Principles + +- Least privilege access for platform administration +- Separation of management, data, and workload planes where possible +- Secrets managed outside plaintext repo content +- Change tracking through PR review and ADR records + +## Controls (Planned) + +- GitHub PR-based review for platform changes +- Explicit ADR decisions for major architectural/security tradeoffs +- Redaction of sensitive environment details in shared docs + +## TODO + +- TODO: Define identity and access model for Proxmox/TrueNAS/Kubernetes. +- TODO: Define secrets management pattern for GitOps workflows. 
+- TODO: Define baseline backup, restore, and incident response procedure. diff --git a/docs/architecture/target-architecture.md b/docs/architecture/target-architecture.md new file mode 100644 index 0000000..7850f70 --- /dev/null +++ b/docs/architecture/target-architecture.md @@ -0,0 +1,24 @@ +# Target Architecture (Planned) + +## High-Level Direction + +1. **Proxmox-first virtualization layer** on the three MS-A2 hosts +2. **Virtualized Kubernetes cluster** hosted in Proxmox VMs +3. **Argo CD GitOps control plane** for Kubernetes applications +4. **TrueNAS as separate storage anchor** for shared storage/media +5. **Media platform as separate concern** from core platform modernization +6. **Future AI/agent capabilities** built after platform foundations stabilize + +## Logical Layers + +- **Layer 1: Physical + network** (power, switching, routing) +- **Layer 2: Virtualization** (Proxmox cluster and host operations) +- **Layer 3: Container platform** (Kubernetes in VMs) +- **Layer 4: GitOps & app delivery** (Argo CD) +- **Layer 5: Workload domains** (core apps, media, future AI) + +## TODO + +- TODO: Define initial control-plane and worker VM sizing assumptions. +- TODO: Define storage class strategy for Kubernetes workloads. +- TODO: Define ingress/DNS/TLS standards for platform services. diff --git a/docs/archive/legacy-baremetal-talos-pxe/README.md b/docs/archive/legacy-baremetal-talos-pxe/README.md new file mode 100644 index 0000000..e4cc31e --- /dev/null +++ b/docs/archive/legacy-baremetal-talos-pxe/README.md @@ -0,0 +1,19 @@ +# Legacy Archive: Bare-Metal Talos/PXE Direction + +This archive contains material from a **previous platform direction** focused on bare-metal Talos provisioning through PXE and Terraform/OpenTofu-style workflows. 
+ +## Archive Intent + +- Preserve historical implementation/reference context +- Keep Git history and prior work accessible +- Avoid presenting this as the active platform path + +## Current Status + +- This is **not** the current primary direction. +- The active direction is documentation-first, Proxmox-first, virtualized Kubernetes, and Argo CD GitOps. + +## Notes + +- Some legacy files may contain environment-specific values from past operations. +- Treat archived content as reference only; do not assume it reflects current standards. diff --git a/infra/README.md b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/README.md similarity index 100% rename from infra/README.md rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/README.md diff --git a/infra/live/dev/.gitkeep b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/dev/.gitkeep similarity index 100% rename from infra/live/dev/.gitkeep rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/dev/.gitkeep diff --git a/infra/live/prod/.terraform.lock.hcl b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/.terraform.lock.hcl similarity index 100% rename from infra/live/prod/.terraform.lock.hcl rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/.terraform.lock.hcl diff --git a/infra/live/prod/main.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/main.tf similarity index 100% rename from infra/live/prod/main.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/main.tf diff --git a/infra/live/prod/providers.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/providers.tf similarity index 100% rename from infra/live/prod/providers.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/providers.tf diff --git a/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/terraform.auto.tfvars 
b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/terraform.auto.tfvars new file mode 100644 index 0000000..42b1aea --- /dev/null +++ b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/terraform.auto.tfvars @@ -0,0 +1,28 @@ +# REDACTED LEGACY EXAMPLE +# +# This file preserves the structure of the previous environment-specific tfvars +# while removing private operational values. + +cluster_nodes = { + nx0 = { + hostname = "" + ip = "" + mac_address = "" + wake_on_lan_mac = "" + install_disk = "" + role = "controlplane" + bootstrap = true + } +} + +cluster_info = { + name = "" + nameservers = [""] + virtual_ip = "" + endpoint = "https://:6443" +} + +k8s_version = "" + +github_organization = "" +github_repository = "" diff --git a/infra/live/prod/variables.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/variables.tf similarity index 100% rename from infra/live/prod/variables.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/live/prod/variables.tf diff --git a/infra/modules/baremetal/main.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/baremetal/main.tf similarity index 100% rename from infra/modules/baremetal/main.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/baremetal/main.tf diff --git a/infra/modules/baremetal/outputs.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/baremetal/outputs.tf similarity index 100% rename from infra/modules/baremetal/outputs.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/baremetal/outputs.tf diff --git a/infra/modules/baremetal/providers.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/baremetal/providers.tf similarity index 100% rename from infra/modules/baremetal/providers.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/baremetal/providers.tf diff --git a/infra/modules/baremetal/variables.tf 
b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/baremetal/variables.tf similarity index 100% rename from infra/modules/baremetal/variables.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/baremetal/variables.tf diff --git a/infra/modules/cluster/main.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/cluster/main.tf similarity index 100% rename from infra/modules/cluster/main.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/cluster/main.tf diff --git a/infra/modules/cluster/outputs.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/cluster/outputs.tf similarity index 100% rename from infra/modules/cluster/outputs.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/cluster/outputs.tf diff --git a/infra/modules/cluster/providers.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/cluster/providers.tf similarity index 100% rename from infra/modules/cluster/providers.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/cluster/providers.tf diff --git a/infra/modules/cluster/variables.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/cluster/variables.tf similarity index 100% rename from infra/modules/cluster/variables.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/cluster/variables.tf diff --git a/infra/modules/config/main.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/config/main.tf similarity index 100% rename from infra/modules/config/main.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/config/main.tf diff --git a/infra/modules/config/outputs.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/config/outputs.tf similarity index 100% rename from infra/modules/config/outputs.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/config/outputs.tf diff --git a/infra/modules/config/providers.tf 
b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/config/providers.tf similarity index 100% rename from infra/modules/config/providers.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/config/providers.tf diff --git a/infra/modules/config/variables.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/config/variables.tf similarity index 100% rename from infra/modules/config/variables.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/config/variables.tf diff --git a/infra/modules/flux/main.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/flux/main.tf similarity index 100% rename from infra/modules/flux/main.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/flux/main.tf diff --git a/infra/modules/flux/providers.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/flux/providers.tf similarity index 100% rename from infra/modules/flux/providers.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/flux/providers.tf diff --git a/infra/modules/flux/values/components.yaml b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/flux/values/components.yaml similarity index 100% rename from infra/modules/flux/values/components.yaml rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/flux/values/components.yaml diff --git a/infra/modules/flux/variables.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/flux/variables.tf similarity index 100% rename from infra/modules/flux/variables.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/flux/variables.tf diff --git a/infra/modules/github_deploy_key/main.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/github_deploy_key/main.tf similarity index 100% rename from infra/modules/github_deploy_key/main.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/github_deploy_key/main.tf diff --git 
a/infra/modules/github_deploy_key/outputs.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/github_deploy_key/outputs.tf similarity index 100% rename from infra/modules/github_deploy_key/outputs.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/github_deploy_key/outputs.tf diff --git a/infra/modules/github_deploy_key/providers.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/github_deploy_key/providers.tf similarity index 100% rename from infra/modules/github_deploy_key/providers.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/github_deploy_key/providers.tf diff --git a/infra/modules/github_deploy_key/variables.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/github_deploy_key/variables.tf similarity index 100% rename from infra/modules/github_deploy_key/variables.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/github_deploy_key/variables.tf diff --git a/infra/modules/node/main.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/main.tf similarity index 100% rename from infra/modules/node/main.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/main.tf diff --git a/infra/modules/node/providers.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/providers.tf similarity index 100% rename from infra/modules/node/providers.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/providers.tf diff --git a/infra/modules/node/templates/cilium-install.yaml.tftpl b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/templates/cilium-install.yaml.tftpl similarity index 100% rename from infra/modules/node/templates/cilium-install.yaml.tftpl rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/templates/cilium-install.yaml.tftpl diff --git a/infra/modules/node/templates/controlplane.yaml.tftpl 
b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/templates/controlplane.yaml.tftpl similarity index 100% rename from infra/modules/node/templates/controlplane.yaml.tftpl rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/templates/controlplane.yaml.tftpl diff --git a/infra/modules/node/templates/worker.yaml.tftpl b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/templates/worker.yaml.tftpl similarity index 100% rename from infra/modules/node/templates/worker.yaml.tftpl rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/templates/worker.yaml.tftpl diff --git a/infra/modules/node/variables.tf b/docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/variables.tf similarity index 100% rename from infra/modules/node/variables.tf rename to docs/archive/legacy-baremetal-talos-pxe/infra-legacy/modules/node/variables.tf diff --git a/docs/decisions/ADR-001-proxmox-first.md b/docs/decisions/ADR-001-proxmox-first.md new file mode 100644 index 0000000..cf529fd --- /dev/null +++ b/docs/decisions/ADR-001-proxmox-first.md @@ -0,0 +1,19 @@ +# ADR-001: Proxmox-First Infrastructure Direction + +## Status +Accepted + +## Context +The repository previously centered on a bare-metal Talos/PXE provisioning path. Current goals prioritize virtualization flexibility, operational clarity, and staged modernization. + +## Decision +Adopt Proxmox as the primary infrastructure foundation for current and near-term homelab operations. + +## Alternatives considered +- Continue bare-metal Talos/PXE as primary path +- Mixed ad-hoc host virtualization without cluster-level direction + +## Consequences +- Requires Proxmox cluster planning and host lifecycle runbooks. +- Enables cleaner layering for Kubernetes and future workloads. +- Legacy bare-metal workflows move to archive/reference status. 
diff --git a/docs/decisions/ADR-002-k8s-inside-proxmox.md b/docs/decisions/ADR-002-k8s-inside-proxmox.md new file mode 100644 index 0000000..3a62e39 --- /dev/null +++ b/docs/decisions/ADR-002-k8s-inside-proxmox.md @@ -0,0 +1,19 @@ +# ADR-002: Kubernetes Runs Inside Proxmox VMs + +## Status +Accepted + +## Context +Kubernetes remains a core target platform, but direct bare-metal coupling increases platform migration friction and can complicate host-level changes. + +## Decision +Run Kubernetes as virtualized nodes inside Proxmox rather than as the direct bare-metal base layer. + +## Alternatives considered +- Bare-metal Kubernetes on the MS-A2 nodes +- Delay Kubernetes entirely until later platform phases + +## Consequences +- Adds one extra abstraction layer (VMs) to manage. +- Improves host/platform isolation and recovery flexibility. +- Requires clear VM sizing/network/storage decisions (tracked as TODOs). diff --git a/docs/decisions/ADR-003-argocd-over-flux.md b/docs/decisions/ADR-003-argocd-over-flux.md new file mode 100644 index 0000000..ba54bd5 --- /dev/null +++ b/docs/decisions/ADR-003-argocd-over-flux.md @@ -0,0 +1,18 @@ +# ADR-003: Argo CD Preferred Over Flux for GitOps + +## Status +Accepted + +## Context +A GitOps controller is required for Kubernetes application lifecycle. Previous code included Flux-related modules, but current platform direction prefers Argo CD. + +## Decision +Use Argo CD as the primary GitOps control plane for Kubernetes workloads. + +## Alternatives considered +- Continue with Flux-based approach +- Manual kubectl/Helm workflows without a GitOps controller + +## Consequences +- Repository structure and runbooks prioritize Argo CD bootstrap patterns. +- Existing Flux-oriented legacy code is preserved only for historical reference. 
diff --git a/docs/decisions/ADR-004-truenas-separate.md b/docs/decisions/ADR-004-truenas-separate.md new file mode 100644 index 0000000..ab5890a --- /dev/null +++ b/docs/decisions/ADR-004-truenas-separate.md @@ -0,0 +1,18 @@ +# ADR-004: TrueNAS Remains a Separate Storage Anchor + +## Status +Accepted + +## Context +Storage/media needs are currently anchored to a separate TrueNAS system. Platform direction requires clear separation between compute virtualization and storage anchor responsibilities. + +## Decision +Keep TrueNAS as a separate system of record for storage/media responsibilities rather than folding storage into Proxmox hosts. + +## Alternatives considered +- Converged storage directly on Proxmox nodes +- Immediate redesign of storage around a new distributed stack + +## Consequences +- Requires explicit network and mount strategy between Proxmox/Kubernetes and TrueNAS. +- Preserves current media data workflows while core platform evolves. diff --git a/docs/decisions/ADR-005-media-platform-stays-separate-from-core-lab.md b/docs/decisions/ADR-005-media-platform-stays-separate-from-core-lab.md new file mode 100644 index 0000000..425977a --- /dev/null +++ b/docs/decisions/ADR-005-media-platform-stays-separate-from-core-lab.md @@ -0,0 +1,18 @@ +# ADR-005: Media Platform Stays Separate from Core Lab Modernization + +## Status +Accepted + +## Context +The media platform is operational today and includes multiple tightly related services. Its final architecture is undecided, while core platform work requires immediate focus. + +## Decision +Treat media platform redesign as a dedicated parallel track and keep it separate from core lab foundation milestones. + +## Alternatives considered +- Full media redesign before any core platform changes +- Immediate migration of all media services into Kubernetes + +## Consequences +- Core platform progress is decoupled from unresolved media architecture decisions. 
+- Media migration planning is documented as roadmap work with explicit TODOs and options. diff --git a/docs/inventory/current-media-stack.md b/docs/inventory/current-media-stack.md new file mode 100644 index 0000000..4ffd3e1 --- /dev/null +++ b/docs/inventory/current-media-stack.md @@ -0,0 +1,45 @@ +# Current Media Stack (As-Is) + +## Runtime Model + +- The media platform currently runs as a Docker Compose stack in a VM hosted on Proxmox. +- It is a broader media stack around Plex, not a Plex-only deployment. + +## Known Services + +- Plex +- Tautulli +- nginx reverse proxy +- ACME companion +- Cloudflare tunnel +- Cloudflare DDNS +- Radarr +- Sonarr +- Bazarr +- Prowlarr +- Overseerr +- Homepage +- Recyclarr +- FlareSolverr +- torrent/VPN-related services + +## Storage and Acceleration + +- Media storage is mounted over NFS from TrueNAS. +- Current Plex operation uses NVIDIA-backed GPU access for transcoding. + +## Migration Intent + +- The media platform will be redesigned, but end-state is still open. +- Open options include: + - Plex vs Jellyfin vs Emby + - Media apps on TrueNAS vs dedicated media VM + +## Security / Data Handling Notes + +- This document intentionally excludes live credentials, tokens, domains, and private operational values. + +## TODO + +- TODO: Document current data paths, backup strategy, and restore procedure. +- TODO: Define acceptance criteria for media platform migration options. 
diff --git a/docs/inventory/hardware-inventory.md b/docs/inventory/hardware-inventory.md new file mode 100644 index 0000000..53ba1a3 --- /dev/null +++ b/docs/inventory/hardware-inventory.md @@ -0,0 +1,26 @@ +# Hardware Inventory + +## Current Confirmed Core Compute + +| Asset | Count | Known Specs | Role | +|---|---:|---|---| +| Minisforum MS-A2 mini PCs | 3 | 96 GB RAM each, 10GbE | Primary compute nodes for Proxmox-first direction | + +## Current Confirmed Storage Anchor + +| Asset | Count | Known Specs | Role | +|---|---:|---|---| +| TrueNAS system | 1 | TODO: record exact hardware specs | Storage and media anchor | + +## Non-Core / Deferred Hardware + +- Main desktop exists but is not part of core hosting. +- Older rack hardware is deferred for the current planning horizon. + +## TODO + +- TODO: Capture exact CPU model(s) for each compute node. +- TODO: Capture local disk layout (capacity, type, intended usage). +- TODO: Capture NIC mapping per host (1GbE/10GbE ports, management links). +- TODO: Capture firmware/BIOS and update baselines. +- TODO: Capture UPS/power dependency map. diff --git a/docs/inventory/network-inventory.md b/docs/inventory/network-inventory.md new file mode 100644 index 0000000..1a26cc3 --- /dev/null +++ b/docs/inventory/network-inventory.md @@ -0,0 +1,20 @@ +# Network Inventory + +## Current Confirmed Network Stack + +- Ubiquiti Cloud Gateway Fiber +- Ubiquiti Pro XG 8 PoE +- Ubiquiti Lite 16 PoE + +## Current Network Notes + +- The environment has 10GbE-capable compute nodes. +- Detailed VLAN and subnet strategy is intentionally not assumed here. + +## TODO + +- TODO: Document current VLANs (ID, purpose, routing policy). +- TODO: Document IP ranges and DHCP/static allocation conventions. +- TODO: Document DNS/NTP authority and forwarding behavior. +- TODO: Document firewall segmentation policy between management, storage, and workload networks. +- TODO: Document switch port mappings and link speeds per critical device.
diff --git a/docs/inventory/service-inventory.md b/docs/inventory/service-inventory.md new file mode 100644 index 0000000..f3ffc01 --- /dev/null +++ b/docs/inventory/service-inventory.md @@ -0,0 +1,18 @@ +# Service Inventory + +## Current Service Domains + +1. Core platform services (in transition) +2. Media platform services (currently Docker-based inside a Proxmox VM) + +## Current State Summary + +- Repository is pivoting from older bare-metal Talos/PXE workflows to Proxmox-first virtualization. +- Kubernetes is planned to run inside Proxmox VMs. +- Argo CD is the preferred GitOps controller for the next platform iteration. + +## TODO + +- TODO: Enumerate currently active non-media infrastructure services and ownership. +- TODO: Record service criticality tiers and recovery priorities. +- TODO: Add external dependency register (DNS provider, tunnel provider, etc.) at a non-sensitive level. diff --git a/docs/roadmap/backlog.md b/docs/roadmap/backlog.md new file mode 100644 index 0000000..15258f7 --- /dev/null +++ b/docs/roadmap/backlog.md @@ -0,0 +1,31 @@ +# Backlog (Starter) + +## Inventory and Discovery + +- [ ] Finalize hardware inventory (CPU/disk/NIC specifics) +- [ ] Finalize network inventory (VLANs/subnets/routes) +- [ ] Document current non-media services and ownership +- [ ] Document current media stack operational dependencies + +## Platform Foundation + +- [ ] Plan/install Proxmox cluster baseline +- [ ] Define VLAN/IP strategy for management/workload/storage traffic +- [ ] Document TrueNAS role and access contracts + +## Kubernetes and GitOps + +- [ ] Define Kubernetes VM approach (control-plane/worker topology) +- [ ] Define Kubernetes bootstrap method and day-2 operations model +- [ ] Define Argo CD bootstrap approach and repository layout + +## Media Platform + +- [ ] Define media migration options and decision matrix +- [ ] Document current GPU transcoding dependency and alternatives +- [ ] Define migration success criteria and rollback plan + +##
Automation and Quality + +- [ ] Define lightweight docs lint/check process +- [ ] Add initial automation scripts for inventory/validation tasks diff --git a/docs/roadmap/current-phase.md b/docs/roadmap/current-phase.md new file mode 100644 index 0000000..1bc6d21 --- /dev/null +++ b/docs/roadmap/current-phase.md @@ -0,0 +1,22 @@ +# Current Phase + +## Active Phase + +**Foundation / Documentation Restructuring (M0)** + +## Current Goals + +- Make documentation the primary navigation path +- Preserve old bare-metal direction as clearly labeled archive +- Capture explicit decisions and immediate next tasks + +## In Progress Focus + +- Close inventory gaps with real values (no guesswork) +- Prepare M1 (Network Core) and M2 (Proxmox Platform) + +## Definition of Done (Current Phase) + +- Required top-level docs exist and are reviewable +- ADR baseline is complete for current known decisions +- Backlog has actionable first-wave tasks diff --git a/docs/roadmap/media-migration.md b/docs/roadmap/media-migration.md new file mode 100644 index 0000000..7800941 --- /dev/null +++ b/docs/roadmap/media-migration.md @@ -0,0 +1,38 @@ +# Media Migration Roadmap (Planning Track) + +## Objective +Plan a safe transition path for the media platform without forcing premature architecture decisions. 
+ +## Current State Summary + +- Docker Compose media stack in Proxmox VM +- NFS-backed media from TrueNAS +- NVIDIA-backed Plex transcoding dependency + +## Phased Plan + +### Phase A - Document and Stabilize + +- Capture as-is architecture and dependencies +- Record backup/restore and failure modes + +### Phase B - Evaluate Target Options + +- Compare Plex/Jellyfin/Emby against requirements +- Compare runtime options (TrueNAS-hosted vs dedicated media VM) + +### Phase C - Pilot and Validate + +- Run limited-scope pilot of preferred option +- Validate transcoding, storage access, and operational overhead + +### Phase D - Execute Migration + +- Implement staged migration with rollback checkpoints +- Retire obsolete components after validation + +## TODO + +- TODO: Define weighted decision criteria for platform selection. +- TODO: Define exact migration windows and cutover sequencing. +- TODO: Define monitoring/SLO expectations for media services. diff --git a/docs/roadmap/milestones.md b/docs/roadmap/milestones.md new file mode 100644 index 0000000..e060567 --- /dev/null +++ b/docs/roadmap/milestones.md @@ -0,0 +1,50 @@ +# Milestones + +## M0 - Foundation / Repository Restructuring + +- Documentation-first repo structure established +- Legacy bare-metal direction archived and labeled +- ADR baseline captured + +## M1 - Network Core + +- Network inventory finalized +- VLAN/IP strategy documented +- Initial topology and segmentation documented + +## M2 - Proxmox Platform + +- Proxmox installation/cluster plan documented +- Host baseline runbook drafted + +## M3 - Storage Integration + +- TrueNAS integration model documented +- Mount/data access patterns for platform domains documented + +## M4 - Virtualized Kubernetes + +- Kubernetes VM approach defined +- Initial cluster bootstrap plan documented + +## M5 - Argo CD GitOps + +- Argo CD bootstrap path defined +- Initial GitOps app structure documented + +## M6 - Media Platform Migration + +- Current-state media docs 
completed +- Migration options and decision criteria documented + +## M7 - Local CI/CD + +- Local pipeline/lint/test strategy documented + +## M8 - AI Platform + +- Initial AI platform architecture exploration documented + +## M9 - Agentic Development Workflows + +- Agent-assisted workflow patterns documented and trialed diff --git a/docs/runbooks/bootstrap-sequence.md b/docs/runbooks/bootstrap-sequence.md new file mode 100644 index 0000000..ef2cead --- /dev/null +++ b/docs/runbooks/bootstrap-sequence.md @@ -0,0 +1,37 @@ +# Bootstrap Sequence (Initial) + +## Objective +Provide a conservative, documentation-first bootstrap flow without assuming undeclared implementation details. + +## Sequence + +1. **Inventory baseline** + - Confirm hardware inventory + - Confirm network inventory + - Confirm current service inventory +2. **Architecture baseline** + - Validate target architecture docs + - Confirm/adjust ADR set +3. **Proxmox foundation** + - Plan and deploy Proxmox cluster baseline + - Validate host networking and access +4. **Storage integration baseline** + - Confirm TrueNAS role and required shared mounts +5. **Kubernetes virtualization plan** + - Define VM topology and bootstrap method + - Deploy initial virtualized Kubernetes cluster +6. **GitOps bootstrap** + - Install and validate Argo CD + - Start managing platform apps via GitOps +7. **Media migration planning** + - Document current media stack details and migration options + +## Exit Criteria (Phase-1/2 Oriented) + +- Documentation set is coherent and reviewable +- Core decisions captured as ADRs +- Backlog and milestones updated for execution + +## TODO + +- TODO: Add command-level runbook once implementation choices are finalized. 
diff --git a/docs/vision/homelab-vision.md b/docs/vision/homelab-vision.md new file mode 100644 index 0000000..cc6f9f4 --- /dev/null +++ b/docs/vision/homelab-vision.md @@ -0,0 +1,32 @@ +# Homelab Vision + +## Purpose + +Create a reliable, well-documented homelab platform that supports: + +- Learning and operating modern platform patterns +- Self-hosted services with clear operational boundaries +- A path to GitOps and higher-confidence change management +- Future experimentation with local AI and agentic workflows + +## Guiding Principles + +1. **Documentation-first**: design intent and operations should be understandable without tribal knowledge. +2. **Conservative assumptions**: unknown details are marked as TODO, not guessed. +3. **Separation of concerns**: + - Proxmox for virtualization foundation + - Kubernetes for cloud-native workloads + - TrueNAS for storage/media anchor +4. **Incremental migration**: preserve service continuity while modernizing. +5. **GitHub-native workflow**: issues, ADRs, roadmap, and PRs are the planning system. + +## Non-Goals (Current) + +- Immediate migration of every existing service into Kubernetes +- Immediate redesign of the media platform's final architecture +- Incorporating non-core desktop systems into core hosting operations + +## Known Constraints + +- Final media architecture is still under evaluation. +- Exact CPU/disk/network-segment details are incomplete; the hardware and network inventories still need to be completed. 
diff --git a/infra/live/prod/terraform.auto.tfvars b/infra/live/prod/terraform.auto.tfvars deleted file mode 100644 index 132a6b1..0000000 --- a/infra/live/prod/terraform.auto.tfvars +++ /dev/null @@ -1,41 +0,0 @@ -cluster_nodes = { - nx0 = { - hostname = "nx0.feenx.io" - ip = "10.10.97.10" - mac_address = "58:47:CA:7F:E2:16" - wake_on_lan_mac = "58:47:CA:7F:E2:14" - install_disk = "/dev/nvme0n1" - role = "controlplane" - bootstrap = true - }, - nx1 = { - hostname = "nx1.feenx.io" - ip = "10.10.97.11" - mac_address = "58:47:CA:7F:D3:B6" - wake_on_lan_mac = "58:47:CA:7F:D3:B4" - install_disk = "/dev/nvme0n1" - role = "controlplane" - depends_on = ["nx0"] - }, - nx2 = { - hostname = "nx2.feenx.io" - ip = "10.10.97.12" - mac_address = "58:47:CA:7F:D7:7E" - wake_on_lan_mac = "58:47:CA:7F:D7:7C" - install_disk = "/dev/nvme0n1" - role = "controlplane" - depends_on = ["nx1"] - } -} - -cluster_info = { - name = "fnx-platform-prod-k8s" - nameservers = ["10.10.97.254"] - virtual_ip = "10.10.97.20" - endpoint = "https://k8s.feenx.io:6443" -} - -k8s_version = "1.33.0" - -github_organization = "feenx-lab" -github_repository = "platform" \ No newline at end of file diff --git a/kubernetes/README.md b/kubernetes/README.md new file mode 100644 index 0000000..a359a49 --- /dev/null +++ b/kubernetes/README.md @@ -0,0 +1,8 @@ +# Kubernetes Area + +Kubernetes-on-Proxmox design and implementation artifacts belong here. + +## TODO + +- Add virtual node topology proposal +- Add cluster bootstrap documentation diff --git a/proxmox/README.md b/proxmox/README.md new file mode 100644 index 0000000..7d1fddc --- /dev/null +++ b/proxmox/README.md @@ -0,0 +1,8 @@ +# Proxmox Area + +Proxmox-specific planning and implementation artifacts belong here. 
+ +## TODO + +- Add host preparation checklist +- Add cluster bootstrap notes diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..f0a4761 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,3 @@ +# Scripts Area + +Operational helper scripts for docs/runbooks belong here.