From b144a57c47c322b58ec8254ed7292cc7b4161db2 Mon Sep 17 00:00:00 2001
From: Wander Grevink
Date: Tue, 17 Mar 2026 14:20:56 +0000
Subject: [PATCH] possible future direction

---
 docker-compose.platform.yml | 274 ++++++++++++++++++++++++++++++++++++
 docs/platform-compose.md    |  98 +++++++++++++
 2 files changed, 372 insertions(+)
 create mode 100644 docker-compose.platform.yml
 create mode 100644 docs/platform-compose.md

diff --git a/docker-compose.platform.yml b/docker-compose.platform.yml
new file mode 100644
index 0000000..3931cf3
--- /dev/null
+++ b/docker-compose.platform.yml
@@ -0,0 +1,274 @@
+# Platform Services Docker Compose Stack
+# All core infrastructure services: Traefik, Registry, OpenObserve, OTEL, Prefect
+#
+# Usage:
+#   docker compose -f docker-compose.platform.yml up -d
+#
+# Required: .env file or environment variables for secrets
+# See .env.platform.example for required variables
+
+services:
+  # ============================================================================
+  # Traefik: Reverse proxy with automatic HTTPS (Let's Encrypt)
+  # ============================================================================
+  traefik:
+    image: traefik:v3.6.10
+    container_name: traefik
+    restart: unless-stopped
+    ports:
+      - "0.0.0.0:80:80"
+      - "[::]:80:80"
+      - "0.0.0.0:443:443"
+      - "[::]:443:443"
+      - "127.0.0.1:57801:8080"  # Dashboard/API (localhost only)
+      - "[::1]:57801:8080"
+    volumes:
+      # Docker socket for container discovery
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      # Config files (static + dynamic)
+      - /etc/traefik:/etc/traefik:ro
+      # Certificate storage (Let's Encrypt)
+      - /etc/traefik/certs:/etc/traefik/certs
+      # Access logs
+      - /var/log/traefik:/var/log/traefik
+      # Basic auth files (htpasswd for registry)
+      - /etc/traefik/auth:/etc/traefik/auth:ro
+    command:
+      - --configFile=/etc/traefik/traefik.yml
+    networks:
+      - traefik
+      - openobserve-network
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+
+  # ============================================================================
+  # Docker Registry: Private container image registry
+  # ============================================================================
+  registry:
+    image: registry:3
+    container_name: registry
+    restart: unless-stopped
+    ports:
+      - "127.0.0.1:5001:5000"  # Internal access (localhost only)
+    volumes:
+      # Image storage
+      - /var/lib/docker-registry:/var/lib/registry
+      # Registry config
+      - /etc/docker-registry/config.yml:/etc/distribution/config.yml:ro
+    networks:
+      - traefik
+    labels:
+      # Traefik routing
+      traefik.enable: "true"
+      traefik.http.routers.registry.rule: "Host(`registry.${BASE_DOMAIN}`)"
+      traefik.http.routers.registry.entrypoints: "websecure"
+      traefik.http.routers.registry.tls.certresolver: "letsencrypt"
+      traefik.http.routers.registry.middlewares: "registry-headers,registry-auth"
+      # Security headers
+      traefik.http.middlewares.registry-headers.headers.customResponseHeaders.X-Frame-Options: "DENY"
+      traefik.http.middlewares.registry-headers.headers.customResponseHeaders.Referrer-Policy: "strict-origin-when-cross-origin"
+      traefik.http.middlewares.registry-headers.headers.customResponseHeaders.Strict-Transport-Security: "max-age=31536000; includeSubDomains"
+      traefik.http.middlewares.registry-headers.headers.customResponseHeaders.X-Content-Type-Options: "nosniff"
+      traefik.http.middlewares.registry-headers.headers.customResponseHeaders.Content-Security-Policy: "default-src 'self'"
+      traefik.http.middlewares.registry-headers.headers.customResponseHeaders.Permissions-Policy: "geolocation=(), microphone=(), camera=()"
+      # Basic auth (htpasswd file mounted via Traefik volume)
+      traefik.http.middlewares.registry-auth.basicauth.usersfile: "/etc/traefik/auth/htpasswd"
+      traefik.http.services.registry.loadbalancer.server.port: "5000"
+    healthcheck:
+      test: ["CMD-SHELL", "nc -z localhost 5000 || exit 1"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+
+  # ============================================================================
+  # OpenObserve: Observability platform (logs, metrics, traces)
+  # ============================================================================
+  openobserve:
+    image: openobserve/openobserve:v0.70.0
+    container_name: openobserve
+    restart: unless-stopped
+    ports:
+      - "127.0.0.1:57800:5080"  # Web UI (localhost only)
+    environment:
+      ZO_ROOT_USER_EMAIL: "${OPENOBSERVE_USERNAME}@observe.local"
+      ZO_ROOT_USER_PASSWORD: "${OPENOBSERVE_PASSWORD}"
+      ZO_LOCAL_MODE: "true"
+      ZO_RETENTION_DAYS: "7"
+    volumes:
+      # Data storage
+      - /var/lib/openobserve:/data
+    networks:
+      - openobserve-network
+    healthcheck:
+      test: ["CMD", "/openobserve", "view", "-c", "version"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 30s
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+
+  # ============================================================================
+  # OTEL Collector: Collects host/Docker metrics and logs → OpenObserve
+  # ============================================================================
+  otel-collector:
+    image: otel/opentelemetry-collector-contrib:0.147.0
+    container_name: otel-collector
+    restart: unless-stopped
+    user: "0"  # Root to read Docker socket and log files
+    command: ["--config=/etc/otelcol/config.yaml"]
+    volumes:
+      # Collector config
+      - /etc/otel-collector/config.yaml:/etc/otelcol/config.yaml:ro
+      # Docker socket for container metrics
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      # Docker container logs
+      - /var/lib/docker/containers:/var/lib/docker/containers:ro
+      # Host logs (traefik, syslog, auth, fail2ban)
+      - /var/log:/var/log:ro
+    networks:
+      - openobserve-network
+    healthcheck:
+      test: ["CMD", "/otelcol-contrib", "--version"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 30s
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    depends_on:
+      openobserve:
+        condition: service_healthy
+
+  # ============================================================================
+  # Prefect: Workflow orchestration - Database
+  # ============================================================================
+  prefect-db:
+    image: postgres:16-alpine
+    container_name: prefect-db
+    restart: unless-stopped
+    environment:
+      POSTGRES_USER: prefect
+      POSTGRES_PASSWORD: "${PREFECT_DB_PASSWORD}"
+      POSTGRES_DB: prefect
+    volumes:
+      - prefect-db-data:/var/lib/postgresql/data
+    networks:
+      - prefect-network
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U prefect"]
+      interval: 5s
+      timeout: 5s
+      retries: 5
+      start_period: 10s
+
+  # ============================================================================
+  # Prefect: Workflow orchestration - Server
+  # ============================================================================
+  prefect-server:
+    image: prefecthq/prefect:3-python3.11
+    container_name: prefect-server
+    restart: unless-stopped
+    command: prefect server start --host 0.0.0.0
+    ports:
+      - "127.0.0.1:57802:4200"  # Web UI + API (localhost only)
+    environment:
+      PREFECT_HOME: /data
+      PREFECT_API_DATABASE_CONNECTION_URL: "postgresql+asyncpg://prefect:${PREFECT_DB_PASSWORD}@prefect-db:5432/prefect"
+      PREFECT_UI_API_URL: "http://127.0.0.1:57802/api"
+    volumes:
+      - prefect-server-data:/data
+    networks:
+      - prefect-network
+    healthcheck:
+      test: ["CMD-SHELL", "python3 -c \"import urllib.request; urllib.request.urlopen('http://127.0.0.1:4200/api/health')\" || exit 1"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
+      start_period: 60s
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    depends_on:
+      prefect-db:
+        condition: service_healthy
+
+  # ============================================================================
+  # Prefect: Workflow orchestration - Worker
+  # ============================================================================
+  # Note: This service uses a custom-built image (prefect-worker) that must be
+  # built on the host before starting the stack. See ansible/roles/server/tasks/prefect.yml
+  # for the build process, or build manually from prefect/Dockerfile.worker
+  prefect-worker:
+    image: prefect-worker
+    container_name: prefect-worker
+    restart: unless-stopped
+    command: prefect worker start --pool host-pool
+    environment:
+      PREFECT_API_URL: "http://prefect-server:4200/api"
+      REGISTRY_URL: "registry.${BASE_DOMAIN}"
+      DOCKER_CONFIG: "/opt/iac/.docker"
+    volumes:
+      # Docker socket for running containers (backup jobs, etc.)
+      - /var/run/docker.sock:/var/run/docker.sock
+      # Access to /opt/iac tree (flow code, registry auth, backups)
+      - /opt/iac:/opt/iac
+    working_dir: /opt/iac/prefect/flows
+    networks:
+      - prefect-network
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    depends_on:
+      prefect-server:
+        condition: service_healthy
+
+# ==============================================================================
+# Networks
+# ==============================================================================
+networks:
+  # Public network for apps and exposed services (Traefik, registry)
+  traefik:
+    name: traefik
+    driver: bridge
+
+  # Observability network (Traefik metrics, OpenObserve, OTEL)
+  openobserve-network:
+    name: openobserve-network
+    driver: bridge
+
+  # Isolated network for Prefect components
+  prefect-network:
+    name: prefect-network
+    driver: bridge
+
+# ==============================================================================
+# Volumes
+# ==============================================================================
+volumes:
+  # Prefect database data
+  prefect-db-data:
+    name: prefect-db-data
+
+  # Prefect server data (flows, artifacts)
+  prefect-server-data:
+    name: prefect-server-data
diff --git a/docs/platform-compose.md b/docs/platform-compose.md
new file mode 100644
index 0000000..07e64ff
--- /dev/null
+++ b/docs/platform-compose.md
@@ -0,0 +1,98 @@
+# Platform as Docker Compose
+
+> **Status: Design.** Plan to move the set of system services (Traefik, registry, OpenObserve, Prefect, optionally fail2ban) to a Docker Compose–native stack and reduce Ansible to host bootstrap and config deployment.
+
+## Goal
+
+- **Platform = one (or a few) Compose files.** All platform containers and networks are defined there; lifecycle is `docker compose up -d` (or equivalent), not many Ansible `docker_container` / `docker_network` tasks.
+- **Ansible shrinks** to: host bootstrap (Docker, user, SSH, etc.), deploying secrets and generated config into the paths the Compose stack uses, and optional one-offs (Prefect work pool, OpenObserve dashboards).
+- **Fail2ban** can be either a host service (current) or a container in the same stack; if containerized, Ansible no longer installs the package or manages systemd for it.
+- **Devcontainer** is the control plane: run the platform locally with the same Compose file, and run the remaining automation (Ansible or scripts) from there to deploy to the server.
+
+## Current state
+
+Today the server role uses Ansible to:
+
+| Layer | What Ansible does |
+|-------|--------------------|
+| **Host** | Base packages, unattended-upgrades, SSH hardening, fail2ban (apt + systemd + config/filters), Docker install, iac user + `/opt/iac` tree. |
+| **Containers** | Traefik, registry, OpenObserve, OTEL collector, Prefect (postgres, server, worker). For each: create networks, create/start containers, mount config dirs, set env, healthchecks, Traefik labels. |
+| **Config** | Templates (Traefik static/dynamic, registry, OTEL, fail2ban) and files (logrotate, fail2ban filters, htpasswd, dashboard JSON). |
+| **One-offs** | Prefect: sync flow code, build worker image, ensure work pool, `prefect deploy --all`. OpenObserve: dashboard import via API. |
+
+So the platform is already “Docker native” in the sense that the services run in containers; the non–Docker-native part is **orchestration**—many Ansible tasks per container/network instead of a single Compose stack.
+
+## Target state
+
+### What moves to Compose
+
+- **Traefik** — Same image, same mounts and ports; defined in the platform Compose file instead of Ansible `docker_container` tasks.
+- **Registry** — Same image, volumes, and Traefik labels; defined in Compose.
+- **OpenObserve + OTEL collector** — Both services in the same Compose file; networks and env in Compose.
+- **Prefect** — Postgres, server, and worker as Compose services; volumes and env in Compose.
+- **Fail2ban (optional)** — Either stay on host (Ansible keeps apt/systemd) or run as a container in the stack with bind mounts for `/var/log` and config; then Ansible only deploys config/filter files.
+
+Config files (Traefik YAML, registry config, OTEL config, fail2ban jail/filters) remain **generated and deployed** by Ansible (or a script) into a known directory that the Compose stack mounts—because they depend on secrets and inventory (domains, credentials). So: **topology and lifecycle in Compose; config content still from automation.**
+
+### What stays in Ansible (or equivalent)
+
+| Responsibility | Why it stays |
+|----------------|--------------|
+| **Docker + Compose install** | Host must have Docker to run the stack. |
+| **Base, SSH, unattended-upgrades** | Host hardening; not replaced by Compose. |
+| **iac user, `/opt/iac` tree** | Required for app deploys and Prefect paths. |
+| **Secrets and config deployment** | Templates (SOPS-decrypted vars, domains) written to paths used by Compose. Could be Ansible or a small script (e.g. envsubst + sops). |
+| **Prefect one-offs** | Sync flow code, build worker image, work pool, `prefect deploy --all`—either stay in Ansible or move to a script/Compose hook. |
+| **OpenObserve dashboards** | API-based dashboard create/update—small; can stay in Ansible or a one-off script. |
+
+So **realistically a non-trivial amount of automation remains**—but it is focused on bootstrap and config, not on container lifecycle. The “lot of Ansible” becomes “a smaller Ansible playbook (or Ansible + scripts)” run from the devcontainer.
+
+## What becomes possible
+
+- **Run the platform anywhere** — Same Compose stack on laptop, dev VM, CI, or a new server; Ansible only bootstraps the host and drops the project.
+- **Local / dev parity** — Run Traefik, registry, OpenObserve, Prefect (and optionally fail2ban) locally from the devcontainer; integration tests and debugging without a live server.
+- **Upgrade and rollback like an app** — Change image tags or env in the Compose file and redeploy; rollback = revert file and `docker compose up -d`.
+- **Single declarative stack** — One place for dependencies, healthchecks, and restarts; `docker compose config`, `docker compose logs -f` for the whole platform.
+- **Compose features** — Profiles for optional services (e.g. no OpenObserve in minimal dev); one dependency graph.
+- **Simpler recovery** — Rebuild = bootstrap OS + Docker, copy Compose project and config, `docker compose up -d`.
+- **Clear separation** — Ansible = host and secrets; Compose = service topology and runtime. Changes to “what containers run” don’t require changing Ansible task logic.
+- **Platform as a first-class stack** — Version the platform in the repo; optionally drive updates from CI (build/push images, then `docker compose pull && up -d` on the server).
+
+## Fail2ban
+
+- **Current:** Ansible installs fail2ban, deploys jail and filter config, manages systemd. Fail2ban reads host logs (e.g. Traefik, auth) and manages host firewall.
+- **If kept on host:** No change; Ansible continues to own package and systemd.
+- **If moved to Compose:** Run fail2ban in a container (e.g. image that reads logs and applies bans via host network or mounted socket). Ansible then only deploys config/filter files into the path mounted by the container and ensures the Compose stack is up. Package and systemd handling disappear from Ansible.
+
+## Devcontainer as control plane
+
+- **Local platform** — In the devcontainer, run the same platform Compose stack (with dev config/secrets). No Ansible needed for “run the platform” locally.
+- **Deploy to server** — From the devcontainer, either:
+  - **Option A (minimal Ansible):** One playbook that (1) bootstraps the server if needed, (2) copies the Compose project and rendered config (or templates + vars), (3) runs `docker compose up -d` on the server.
+  - **Option B (Ansible only for bootstrap):** Bootstrap (Docker, user, SSH, etc.) is a one-time Ansible run. Ongoing platform deploy: from devcontainer, rsync/scp the Compose project and config, SSH and run `docker compose up -d` (via Taskfile or script). No Ansible in the hot path.
+
+So the devcontainer is the single place to run the platform locally and to run the remaining automation (Ansible or scripts) that deploys to the server.
+
+## Implementation order
+
+1. **Server layout** — Implement [server-layout.md](server-layout.md) first (single tree under `/opt/iac`, user `iac`). Platform Compose and paths should align with that layout.
+2. **Define platform Compose** — Add `compose/platform/docker-compose.yml` (and optional overrides) with Traefik, registry, OpenObserve, OTEL, Prefect. Use the same images, ports, volumes, and env as today; only the definition moves from Ansible tasks to Compose.
+3. **Config deployment** — Keep Ansible (or a script) that renders and copies config files into the directory used by the Compose project (e.g. `/opt/iac/platform/` or similar). Ensure SOPS and inventory vars feed into those templates.
+4. **Ansible: remove container/network tasks** — Replace the server role’s `docker_container` and `docker_network` tasks for platform services with a single “copy project + config, run docker compose up -d” (or equivalent). Retain handlers/notify if config changes should trigger `docker compose up -d` or service restarts.
+5. **Fail2ban** — Decide: keep on host (Ansible unchanged for fail2ban) or add fail2ban service to platform Compose and reduce Ansible to config deployment only.
+6. **Prefect one-offs** — Keep flow sync, worker build, work pool, and deploy in Ansible or move to a script that runs from the devcontainer (or as a Compose hook / init step).
+7. **Devcontainer** — Document and wire “run platform locally” (e.g. `docker compose -f compose/platform/docker-compose.yml up`) and “deploy platform to server” (Task or playbook that syncs and runs compose on the server).
+8. **Docs and cleanup** — Update [workflows](workflows.md), [application-deployment](application-deployment.md), and server docs to describe platform-as-Compose; remove or archive obsolete Ansible task files.
+
+## Summary
+
+| Aspect | Before | After |
+|--------|--------|-------|
+| Platform topology | Many Ansible tasks (containers + networks) | One Compose file (or small set) |
+| Platform lifecycle | Ansible playbook | `docker compose up -d` (from devcontainer or server) |
+| Ansible server role | Large (traefik, registry, openobserve, prefect, fail2ban tasks) | Smaller (bootstrap + config deployment + optional one-offs) |
+| Config (Traefik, registry, OTEL, fail2ban) | Ansible templates → host paths | Same; Ansible or script → paths used by Compose |
+| Local platform | Not defined | Same Compose file in devcontainer |
+| Deploy from | Ansible from devcontainer | Ansible or script from devcontainer (sync + compose) |
+
+Realistically, **a fair amount of automation remains** (bootstrap + secrets/config + one-offs), but it no longer has to be “a lot of Ansible.” The platform becomes a single, versioned, run-anywhere stack; the devcontainer is where you run it locally and where you run whatever is left of Ansible or scripts for the server.