diff --git a/.github/workflows/deploy-environment.yaml b/.github/workflows/deploy-environment.yaml index 1485e2e..5d39d8e 100644 --- a/.github/workflows/deploy-environment.yaml +++ b/.github/workflows/deploy-environment.yaml @@ -32,13 +32,37 @@ on: required: false type: string default: "15" - description: Wait sleep for model registration + description: Sleep seconds between model availability checks + smoke_models_wait_attempts: + required: false + type: string + default: "1" + description: Number of attempts to wait for models to become available include_aoai_host_check: required: false type: boolean default: false description: Include AOAI endpoint host validation + environment: + required: false + type: string + default: "" + description: GitHub environment to use secrets: + AZURE_CLIENT_ID: + required: true + AZURE_TENANT_ID: + required: true + AZURE_SUBSCRIPTION_ID: + required: true + TF_BACKEND_RG: + required: true + TF_BACKEND_SA: + required: true + TF_BACKEND_CONTAINER: + required: true + EXPECTED_AOAI_ENDPOINT_HOST: + required: false AZURE_OPENAI_ENDPOINT: required: true AZURE_OPENAI_API_KEY: @@ -49,8 +73,25 @@ on: required: true AIGATEWAY_KEY: required: true + STATE_SERVICE_CONTAINER_IMAGE: + required: false + STATE_SERVICE_SHARED_TOKEN: + required: false + STATE_SERVICE_REGISTRY_PASSWORD: + required: false + DASHBOARD_CONTAINER_IMAGE: + required: false + GRAFANA_URL: + required: false env: + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + TF_BACKEND_RG: ${{ secrets.TF_BACKEND_RG }} + TF_BACKEND_SA: ${{ secrets.TF_BACKEND_SA }} + TF_BACKEND_CONTAINER: ${{ secrets.TF_BACKEND_CONTAINER }} + EXPECTED_AOAI_ENDPOINT_HOST: ${{ secrets.EXPECTED_AOAI_ENDPOINT_HOST }} TF_VAR_env: ${{ inputs.env_name }} TF_VAR_projname: "aigateway" TF_VAR_location: "southafricanorth" @@ -64,10 +105,15 @@ env: TF_VAR_codex_api_version: ${{ inputs.codex_api_version }} 
TF_VAR_embedding_deployment: "text-embedding-3-large" TF_VAR_embeddings_api_version: "2024-02-01" + TF_VAR_state_service_container_image: ${{ secrets.STATE_SERVICE_CONTAINER_IMAGE }} + TF_VAR_secrets_expiration_date: "2027-03-31T00:00:00Z" + TF_VAR_dashboard_container_image: ${{ secrets.DASHBOARD_CONTAINER_IMAGE || 'ghcr.io/phoenixvc/ai-gateway-dashboard:latest' }} + TF_VAR_grafana_url: ${{ secrets.GRAFANA_URL }} jobs: deploy: runs-on: ubuntu-latest + environment: ${{ inputs.environment || inputs.env_name }} defaults: run: working-directory: ${{ inputs.terraform_working_directory }} @@ -208,7 +254,7 @@ jobs: aoai_api_key: ${{ env.TF_VAR_azure_openai_api_key }} max_attempts: "3" retry_sleep: ${{ inputs.smoke_retry_sleep }} - models_wait_attempts: ${{ if(inputs.env_name == 'prod', '3', '1') }} + models_wait_attempts: ${{ inputs.smoke_models_wait_attempts }} models_wait_sleep: ${{ inputs.smoke_models_wait_sleep }} - name: Smoke test shared state API (dashboard proxy) diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 59ece73..7877e43 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -150,7 +150,6 @@ jobs: name: Deploy dev needs: plan if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'dev' - environment: dev uses: ./.github/workflows/deploy-environment.yaml with: env_name: dev @@ -160,19 +159,32 @@ jobs: terraform_working_directory: infra/env/dev smoke_retry_sleep: "10" smoke_models_wait_sleep: "15" + smoke_models_wait_attempts: "1" include_aoai_host_check: false + environment: dev secrets: + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + TF_BACKEND_RG: ${{ secrets.TF_BACKEND_RG }} + TF_BACKEND_SA: ${{ secrets.TF_BACKEND_SA }} + TF_BACKEND_CONTAINER: ${{ secrets.TF_BACKEND_CONTAINER }} + EXPECTED_AOAI_ENDPOINT_HOST: ${{ secrets.EXPECTED_AOAI_ENDPOINT_HOST }} 
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} AZURE_OPENAI_EMBEDDING_API_KEY: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} AIGATEWAY_KEY: ${{ secrets.AIGATEWAY_KEY }} + STATE_SERVICE_CONTAINER_IMAGE: ${{ vars.STATE_SERVICE_CONTAINER_IMAGE }} + STATE_SERVICE_SHARED_TOKEN: ${{ secrets.STATE_SERVICE_SHARED_TOKEN }} + STATE_SERVICE_REGISTRY_PASSWORD: ${{ secrets.STATE_SERVICE_REGISTRY_PASSWORD }} + DASHBOARD_CONTAINER_IMAGE: ${{ vars.DASHBOARD_CONTAINER_IMAGE }} + GRAFANA_URL: ${{ secrets.GRAFANA_URL }} deploy-staging: name: Deploy staging needs: plan if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-staging') - environment: staging uses: ./.github/workflows/deploy-environment.yaml with: env_name: staging @@ -182,19 +194,32 @@ jobs: terraform_working_directory: infra/env/staging smoke_retry_sleep: "10" smoke_models_wait_sleep: "15" + smoke_models_wait_attempts: "1" include_aoai_host_check: false + environment: staging secrets: + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + TF_BACKEND_RG: ${{ secrets.TF_BACKEND_RG }} + TF_BACKEND_SA: ${{ secrets.TF_BACKEND_SA }} + TF_BACKEND_CONTAINER: ${{ secrets.TF_BACKEND_CONTAINER }} + EXPECTED_AOAI_ENDPOINT_HOST: ${{ secrets.EXPECTED_AOAI_ENDPOINT_HOST }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} AZURE_OPENAI_EMBEDDING_API_KEY: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} AIGATEWAY_KEY: ${{ secrets.AIGATEWAY_KEY }} + STATE_SERVICE_CONTAINER_IMAGE: ${{ vars.STATE_SERVICE_CONTAINER_IMAGE }} + 
STATE_SERVICE_SHARED_TOKEN: ${{ secrets.STATE_SERVICE_SHARED_TOKEN }} + STATE_SERVICE_REGISTRY_PASSWORD: ${{ secrets.STATE_SERVICE_REGISTRY_PASSWORD }} + DASHBOARD_CONTAINER_IMAGE: ${{ vars.DASHBOARD_CONTAINER_IMAGE }} + GRAFANA_URL: ${{ secrets.GRAFANA_URL }} deploy-prod: name: Deploy prod needs: plan if: github.event_name == 'workflow_dispatch' || (github.event_name == 'push' && github.ref == 'refs/heads/main') - environment: prod uses: ./.github/workflows/deploy-environment.yaml with: env_name: prod @@ -204,12 +229,26 @@ jobs: terraform_working_directory: infra/env/prod smoke_retry_sleep: "15" smoke_models_wait_sleep: "30" + smoke_models_wait_attempts: "3" include_aoai_host_check: true + environment: prod secrets: + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + TF_BACKEND_RG: ${{ secrets.TF_BACKEND_RG }} + TF_BACKEND_SA: ${{ secrets.TF_BACKEND_SA }} + TF_BACKEND_CONTAINER: ${{ secrets.TF_BACKEND_CONTAINER }} + EXPECTED_AOAI_ENDPOINT_HOST: ${{ secrets.EXPECTED_AOAI_ENDPOINT_HOST }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} AZURE_OPENAI_EMBEDDING_API_KEY: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} AIGATEWAY_KEY: ${{ secrets.AIGATEWAY_KEY }} + STATE_SERVICE_CONTAINER_IMAGE: ${{ vars.STATE_SERVICE_CONTAINER_IMAGE }} + STATE_SERVICE_SHARED_TOKEN: ${{ secrets.STATE_SERVICE_SHARED_TOKEN }} + STATE_SERVICE_REGISTRY_PASSWORD: ${{ secrets.STATE_SERVICE_REGISTRY_PASSWORD }} + DASHBOARD_CONTAINER_IMAGE: ${{ vars.DASHBOARD_CONTAINER_IMAGE }} + GRAFANA_URL: ${{ secrets.GRAFANA_URL }} # Legacy inline deployments removed - see deploy-environment.yaml diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..5f41681 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,227 @@ +# AGENTS.md - Guidance for AI 
Coding Agents + +This file provides guidance for AI coding agents operating in this repository. + +## Project Overview + +**sluice** — OpenAI-compatible AI gateway built on LiteLLM, deployed to Azure Container Apps. Routes `/v1/responses` and `/v1/embeddings` to Azure OpenAI. + +### Tech Stack + +- **Gateway**: LiteLLM (Python) +- **Dashboard**: Node.js/pnpm (in `dashboard/`) +- **Infrastructure**: Terraform (>= 1.14.0) in `infra/` +- **State Service**: Python/FastAPI in `state-service/` +- **Type Checking**: mypy +- **Scripts**: Deployment/setup scripts in `scripts/` + +--- + +## Build / Lint / Test Commands + +### Dashboard (Node.js/pnpm) + +```bash +cd dashboard +pnpm install # Install dependencies +pnpm dev # Start dev server +pnpm format # Format code with prettier +pnpm format:check # Check formatting only +pnpm lint # Run format check +``` + +### Python (State Service) + +```bash +# Type checking +mypy . # Run mypy on entire project + +# Running a single Python test (if tests exist) +python -m pytest scripts/test_specific.py::TestClass::test_method + +# Individual script execution +python scripts/integration_test.py +python scripts/check_aoai_embeddings.py +``` + +### Terraform (Infrastructure) + +```bash +cd infra + +# Initialize and plan +terraform init +terraform plan + +# Format check +terraform fmt -check -recursive + +# Apply +terraform apply +``` + +### Combined Checks + +```bash +# Run all checks (format + terraform) +pnpm check +``` + +--- + +## Code Style Guidelines + +### Python (state-service/) + +**Imports** + +- Use relative imports within packages: `from .routes import router` +- Group imports: stdlib → third-party → local +- Use `import os`, `from typing import Optional`, etc. 
+ +**Formatting** + +- Follow PEP 8 +- Use 4 spaces for indentation +- Maximum line length: 100 characters + +**Types (mypy)** + +- Python version: 3.13 (see `mypy.ini`) +- Use type hints for function parameters and return values +- Run `mypy .` before committing + +**Naming** + +- Variables/functions: `snake_case` +- Classes: `PascalCase` +- Constants: `UPPER_SNAKE_CASE` +- Private members: prefix with `_` + +**Error Handling** + +- Use custom exceptions with descriptive names +- Catch specific exceptions, not bare `except:` +- Include context in error messages + +```python +def selection_key(user_id: str) -> str: + if not user_id or not user_id.strip(): + raise ValueError("user_id must be a non-empty string") + # ... +``` + +### JavaScript (dashboard/) + +**Formatting** + +- Use Prettier for formatting (configured in `package.json`) +- Run `pnpm format` before committing + +**Naming** + +- Variables/functions: `camelCase` +- Constants: `UPPER_SNAKE_CASE` or `camelCase` with const +- Classes: `PascalCase` + +**General JS Style** + +- Use `const` by default, `let` when reassignment needed +- Prefer template literals over string concatenation +- Use strict equality (`===`) not loose equality (`==`) + +```javascript +const MAX_POINTS = 20; +const reqHistory = { labels: [], datasets: [...] 
}; +``` + +### Terraform (infra/) + +**Formatting** + +- Use `terraform fmt` to format files +- Run `terraform fmt -check -recursive` in CI + +**Naming** + +- Resources: `snake_case` +- Variables: `snake_case` +- Outputs: `snake_case` + +**General** + +- Use local values for repeated expressions +- Tag all resources with `env`, `project` +- Pin provider versions: `version = ">= 4.62.0"` + +### GitHub Actions (`.github/workflows/`) + +**Formatting** + +- Use Prettier for YAML files +- Run `pnpm format` to format workflow files + +**Naming** + +- Job names: descriptive, lowercase with hyphens +- Step names: descriptive + +### Documentation (docs/) + +**Formatting** + +- Use Prettier for Markdown files +- Run `pnpm format` to format docs + +**General** + +- Use ATX-style headers (`#`, `##`, etc.) +- Keep lines under 100 characters when practical +- Include code blocks with language identifiers + +--- + +## Architecture Overview + +```text +docs/architecture/ +├── systems/ # Individual system documentation +├── reference/ # Reference and planning docs +│ └── strategic/ # Strategic guidance +├── 01-*-*.md # ADR-style documents + +dashboard/ # Admin UI (Node.js/pnpm) +infra/ # Terraform IaC +scripts/ # Deployment automation +state-service/ # FastAPI state service +``` + +--- + +## Key Files + +| File | Purpose | +| ------------------------------------- | --------------------- | +| `CLAUDE.md` | Claude Code guidance | +| `dashboard/app.js` | Dashboard UI | +| `infra/modules/aigateway_aca/main.tf` | Main infrastructure | +| `state-service/state_service/` | FastAPI state service | +| `.github/workflows/deploy.yaml` | CI/CD pipeline | + +--- + +## Prerequisites + +- Azure CLI (`az login`) +- Terraform >= 1.14.0 +- Node.js + pnpm +- Python 3.13+ + +--- + +## Before Committing + +1. Run formatting: `pnpm format` +2. Run type checks: `mypy .` (if Python changed) +3. Run terraform fmt: `terraform fmt -check -recursive` +4. 
Test locally if possible diff --git a/CLAUDE.md b/CLAUDE.md index c0c4322..84f26a9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project -**ai-gateway** — OpenAI-compatible AI gateway built on LiteLLM, deployed to Azure Container Apps. Routes `/v1/responses` and `/v1/embeddings` to Azure OpenAI. +**sluice** — OpenAI-compatible AI gateway built on LiteLLM, deployed to Azure Container Apps. Routes `/v1/responses` and `/v1/embeddings` to Azure OpenAI. ## Tech Stack @@ -44,4 +44,4 @@ python update_env_main.py # Update environment config ## AgentKit Forge -This project has not yet been onboarded to [AgentKit Forge](https://github.com/phoenixvc/agentkit-forge). To request onboarding, [create a ticket](https://github.com/phoenixvc/agentkit-forge/issues/new?title=Onboard+ai-gateway&labels=onboarding). +This project has not yet been onboarded to [retort](https://github.com/phoenixvc/retort). To request onboarding, [create a ticket](https://github.com/phoenixvc/retort/issues/new?title=Onboard+sluice&labels=onboarding). diff --git a/README.md b/README.md index dabd2de..8460995 100644 --- a/README.md +++ b/README.md @@ -1,68 +1,108 @@ -# ai-gateway +# sluice -OpenAI-compatible AI Gateway (LiteLLM) on Azure Container Apps. Routes `/v1/responses` and `/v1/embeddings` to Azure OpenAI. +OpenAI-compatible AI gateway for the phoenixvc platform — routes all org AI traffic through a single, observable, rate-limited data plane backed by Azure OpenAI. -## Prerequisites +## What it is -- [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli) (`az login`) -- [Terraform](https://www.terraform.io/downloads) >= 1.14.0 -- Bash or PowerShell (for scripts) +Direct Azure OpenAI calls scatter across projects with no shared observability, no rate governance, and duplicated secret management. 
**Sluice** fixes this: a thin LiteLLM proxy that presents an OpenAI-compatible surface to every consumer, while centralising auth, rate limiting, semantic caching, and telemetry in one place. -## Quick Start +``` +your code → POST /v1/responses ─┐ + POST /v1/embeddings ─┤ sluice (LiteLLM) → Azure OpenAI + GET /metrics ─┘ +``` -### 1. Bootstrap Terraform state (one-time) +All org AI traffic should route through sluice. That single rule gives you per-project spend attribution, a kill switch, and consistent model governance without changing any consumer code. -Creates the shared resource group, storage account, and container for Terraform state. +## Components -**Bash:** +| Component | Stack | Purpose | +|-----------|-------|---------| +| **Gateway** | LiteLLM + Docker | OpenAI-compatible proxy on Azure Container Apps | +| **State Service** | FastAPI + Redis | Per-user model selection and model catalog API | +| **Dashboard** | Node.js + Chart.js | Real-time request/token metrics and model switching UI | +| **Infrastructure** | Terraform + Azure | Container Apps, Key Vault, Log Analytics, optional Redis | -```bash -./scripts/bootstrap.sh [SCOPE] -``` +### Gateway -**PowerShell:** +Exposes two endpoints: -```powershell -.\scripts\bootstrap.ps1 -GITHUB_ORG -GITHUB_REPO [-SCOPE ] -``` +| Endpoint | Routes to | +|----------|-----------| +| `POST /v1/responses` | Azure OpenAI Responses API (configurable model per env) | +| `POST /v1/embeddings` | Azure OpenAI `text-embedding-3-large` | -### 2. Add GitHub secrets +Authentication uses a shared `master_key` passed as a Bearer token. Rate limiting (RPM, TPM, budget caps) and optional Redis semantic caching are configured in Terraform and enforced by LiteLLM. -Add these secrets to each GitHub **Environment** (dev, staging, prod): **Settings → Environments → <env> → Environment secrets**. 
+### State Service -| Secret | Description | Example | -| ----------------------- | --------------------------------- | --------------------------------------------- | -| **Infrastructure** | | | -| `TF_BACKEND_RG` | Terraform state resource group | `pvc-shared-tfstate-rg-san` | -| `TF_BACKEND_SA` | Terraform state storage account | `pvctfstatexxxxxxxx` | -| `TF_BACKEND_CONTAINER` | Terraform state container | `tfstate` | -| `AZURE_CLIENT_ID` | OIDC app (from bootstrap) | `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` | -| `AZURE_TENANT_ID` | Azure tenant ID | `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` | -| `AZURE_SUBSCRIPTION_ID` | Azure subscription ID | `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` | -| **Application** | | | -| `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | `https://mys-shared-ai-san.openai.azure.com/` | -| `AZURE_OPENAI_API_KEY` | Azure OpenAI API key | Your Azure OpenAI key | -| `AIGATEWAY_KEY` | Gateway auth key (from bootstrap) | Base64 string from bootstrap output | +FastAPI service managing which model each user has selected and what models are available. -Bootstrap prints these values. For local runs, copy `infra/.env.local.example` to `infra/.env.local` with the infrastructure values. +| Endpoint | Description | +|----------|-------------| +| `GET /state/catalog` | Available models and their status | +| `PUT /state/catalog` | Update the model catalog (token-protected) | +| `GET /state/selection` | Fetch the calling user's selected model | +| `PUT /state/selection` | Update the calling user's model selection | +| `GET /state/selections` | Paginated list of all users' selections | -> **Key Vault firewall:** Deployments from GitHub Actions require Key Vault to allow public network access. The Terraform module defaults `key_vault_network_default_action` to `Allow` for CI. If you see `ForbiddenByFirewall`, ensure the `fix/key-vault-network-acls` changes are merged and applied. +Requires `X-User-Id` header. 
In production, backed by Azure Cache for Redis; falls back to in-memory for local dev. -### 3. Terraform init +### Dashboard -**Bash:** +Real-time monitoring UI: +- Request counts and token usage charts (polls `/metrics` Prometheus endpoint) +- Model catalog display and per-user model selection +- Session-based API key management + +### Observability + +| Signal | Tool | +|--------|------| +| Metrics | Prometheus (`/metrics`), Grafana Cloud (optional) | +| Tracing | OpenTelemetry → Application Insights | +| LLM observability | Langfuse (optional) | +| Structured logs | Log Analytics Workspace | + +--- + +## Quick Start + +### 1. Bootstrap Terraform state (one-time) + +Creates the shared resource group, storage account, and container for Terraform state; registers an Azure AD app for GitHub Actions OIDC; outputs the values you need for GitHub secrets. ```bash -./infra/scripts/terraform-init.sh dev # or staging, prod +./scripts/bootstrap.sh [SCOPE] +# PowerShell: .\scripts\bootstrap.ps1 -GITHUB_ORG -GITHUB_REPO ``` -**PowerShell:** +### 2. Add GitHub environment secrets -```powershell -.\infra\scripts\terraform-init.ps1 -Env dev # or staging, prod -``` +Add these to each GitHub **Environment** (dev, staging, prod) under **Settings → Environments → <env> → Environment secrets**: + +| Secret | Description | +|--------|-------------| +| `TF_BACKEND_RG` | Terraform state resource group | +| `TF_BACKEND_SA` | Terraform state storage account | +| `TF_BACKEND_CONTAINER` | Terraform state container (`tfstate`) | +| `AZURE_CLIENT_ID` | OIDC app registration client ID (from bootstrap) | +| `AZURE_TENANT_ID` | Azure tenant ID | +| `AZURE_SUBSCRIPTION_ID` | Azure subscription ID | +| `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | +| `AZURE_OPENAI_API_KEY` | Azure OpenAI API key | +| `AIGATEWAY_KEY` | Gateway auth key (from bootstrap output) | + +Bootstrap prints all of these. For local runs, copy `infra/.env.local.example` → `infra/.env.local`. 
+ +> **Key Vault firewall:** GitHub Actions deployments require Key Vault to allow public network access. The Terraform module defaults `key_vault_network_default_action` to `Allow` for CI. If you see `ForbiddenByFirewall`, verify the ACL changes are applied. + +### 3. Terraform init -Valid environments: `dev`, `staging`, `prod`. +```bash +./infra/scripts/terraform-init.sh dev # or staging, prod +# PowerShell: .\infra\scripts\terraform-init.ps1 -Env dev +``` ### 4. Plan and apply @@ -72,38 +112,64 @@ terraform plan terraform apply ``` +--- + ## Environments -| Env | Purpose | -| ------- | ----------- | -| dev | Development | -| staging | Staging | -| prod | Production | +| Env | Purpose | Default model | +|-----|---------|---------------| +| `dev` | Development | `gpt-5.3-codex` (preview) | +| `staging` | Pre-production validation | configurable | +| `prod` | Production | `gpt-4o` | + +Each environment is an independent Azure deployment with its own Container Apps, Key Vault, and (optionally) Redis cache. ## CI/CD -- CI/CD behavior, environment promotion rules, and smoke-test diagnostics are documented in [docs/CI_CD.md](docs/CI_CD.md). +GitHub Actions deploys via Azure OIDC — no long-lived credentials in CI. The pipeline runs smoke tests against the gateway and state-service health endpoint after each deploy. -## Formatting (pnpm) +See [docs/CI_CD.md](docs/CI_CD.md) for promotion rules, smoke-test diagnostics, and environment toggling. -This repo uses [Prettier](https://prettier.io/) via `pnpm` for lightweight formatting checks. 
+## Formatting ```bash pnpm install -pnpm check -pnpm lint -pnpm format:check -pnpm format +pnpm check # lint + terraform fmt check +pnpm format # apply Prettier ``` -- `pnpm check` runs repo checks (`lint` + `terraform fmt -check -recursive`) -- `pnpm lint` currently maps to formatting checks (easy to expand later) -- `pnpm format` applies Prettier changes +--- ## Documentation -- [PRD](docs/PRD.md) – Product requirements -- [Terraform Blueprint](docs/Terraform_Blueprint.md) – Infrastructure design -- [CI/CD Runbook](docs/CI_CD.md) – workflow behavior, staging toggle, smoke tests -- [Azure OIDC Setup](docs/AZURE_OIDC_SETUP.md) – GitHub Actions OIDC configuration -- [Secrets Checklist](docs/SECRETS.md) – Copy/paste setup for GitHub environment secrets +| Doc | Contents | +|-----|----------| +| [PRD](docs/PRD.md) | Product requirements and constraints | +| [Terraform Blueprint](docs/Terraform_Blueprint.md) | Infrastructure design and naming conventions | +| [CI/CD Runbook](docs/CI_CD.md) | Workflow behavior, staging toggle, smoke test diagnostics | +| [Azure OIDC Setup](docs/AZURE_OIDC_SETUP.md) | GitHub Actions OIDC configuration | +| [Secrets Checklist](docs/SECRETS.md) | Copy/paste setup for GitHub environment secrets | +| [Architecture ADRs](docs/architecture/) | System context, container architecture, SLM routing pattern, contracts | + +--- + +## Ecosystem + +Sluice is the AI data plane for the phoenixvc platform. All org AI traffic routes through it. 
+ +| Repo | Role | +|------|------| +| [`docket`](https://github.com/phoenixvc/docket) | Consumes sluice OTEL spans for LLM spend tracking, FOCUS exports, and per-project cost attribution | +| [`cognitive-mesh`](https://github.com/phoenixvc/cognitive-mesh) | Routes all LLM calls through sluice for unified observability and model switching | +| [`mystira-workspace`](https://github.com/phoenixvc/mystira-workspace) | Primary consumer — story generation, publisher, and admin calls all route through sluice | +| [`phoenix-flow`](https://github.com/phoenixvc/phoenix-flow) | AI-assisted task routing calls route through sluice | +| [`codeflow-engine`](https://github.com/phoenixvc/codeflow-engine) | AutoPR AI analysis calls route through sluice | +| [`retort`](https://github.com/phoenixvc/retort) | Retort-scaffolded projects reference sluice as the recommended AI gateway | + +--- + +## Prerequisites + +- [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli) — `az login` +- [Terraform](https://www.terraform.io/downloads) >= 1.14.0 +- Node.js + pnpm (formatting only) diff --git a/docs/architecture/02-container-architecture.md b/docs/architecture/02-container-architecture.md index 2b66950..544639c 100644 --- a/docs/architecture/02-container-architecture.md +++ b/docs/architecture/02-container-architecture.md @@ -27,6 +27,7 @@ flowchart TB G4[Budget Router] G5[Semantic Cache] G6[Escalation Judge] + W[Webhook Auth] end subgraph Mesh @@ -57,6 +58,8 @@ flowchart TB C1 --> G1 C2 --> G1 + C3 --> W + W --> G1 C4 --> G1 G1 --> G2 diff --git a/docs/architecture/04-observability-telemetry.md b/docs/architecture/04-observability-telemetry.md index 3afe313..8c095da 100644 --- a/docs/architecture/04-observability-telemetry.md +++ b/docs/architecture/04-observability-telemetry.md @@ -34,8 +34,9 @@ flowchart TB subgraph Ingest I1[OpenTelemetry] - I2[Azure Monitor] + I2[Application Insights] I3[Blob Export] + I4[Prometheus] end subgraph Analytics @@ -62,15 +63,39 @@ 
flowchart TB S4 --> I1 S5 --> I2 S6 --> I3 + S5 --> I4 I1 --> A1 I2 --> A1 I3 --> A1 + I4 --> V1 A1 --> V1 V1 --> V2 ``` +### Telemetry Sinks + +LiteLLM enables Prometheus metrics via `success_callback` and `failure_callback` containing "prometheus". The Prometheus exporter exposes a `/metrics` endpoint which is scraped by Prometheus for application metrics collection. See `infra/modules/aigateway_aca/main.tf:95-113` for the container configuration. + +The primary telemetry sinks are: + +- **OpenTelemetry**: Traces and spans +- **Application Insights**: Azure Monitor implementation using `APPLICATIONINSIGHTS_CONNECTION_STRING` env var for OTEL exporter +- **Blob Export**: Raw event storage +- **Prometheus**: Application metrics via `/metrics` endpoint + +## Retention Policies + +Application Insights retention defaults: + +- **Production**: 90 days +- **Non-production (dev/staging)**: 30 days + +These are environment-specific settings configured in the Application Insights resource. Operators can adjust retention in the Azure Portal under Application Insights resource settings. + +Include retention expectations in operational runbooks to align cost and data availability expectations. + ## Key Metrics ### Gateway diff --git a/docs/architecture/reference/matrix-gateway.md b/docs/architecture/reference/matrix-gateway.md index 4551887..395719c 100644 --- a/docs/architecture/reference/matrix-gateway.md +++ b/docs/architecture/reference/matrix-gateway.md @@ -41,16 +41,26 @@ flowchart TD ```json { - "intent": "code_review", + "request_id": "req_abc123", + "label": "code_review", "complexity": "medium", "tool_candidate": true, - "recommended_target": "codeflow-engine", - "recommended_model_tier": "small", - "escalation_required": false, + "recommended_tier": "slm", + "cacheable": true, "confidence": 0.93 } ``` +> **Migration Note (v1.0.0)**: The response contract has been updated. 
Legacy field names `intent`, `recommended_target`, `recommended_model_tier`, and `escalation_required` are deprecated. Update clients to use the new fields: +> +> - `intent` → `label` +> - `recommended_target` → removed (use `recommended_tier` for routing) +> - `recommended_model_tier` → `recommended_tier` +> - `escalation_required` → derive from `confidence < 0.75` threshold +> - `cacheable` is a new field (previously not returned) +> +> **Deprecation window**: Legacy fields will be removed in v1.2.0. Clients should update by then. For backwards compatibility, implement fallback logic checking both old and new field names. + ## Contract Shapes ```typescript @@ -91,7 +101,7 @@ interface PolicyScreenOutput { | Condition | Action | | -------------------------------- | ---------------------- | | `policy-screen.allowed == false` | Block or redact | -| `confidence < 0.70` | Escalate to LLM | +| `confidence < 0.75` | Escalate to LLM | | Tool suggested but no mapping | Send to general LLM | | Tagging fails | Mark telemetry partial | diff --git a/docs/architecture/reference/matrix-rooivalk.md b/docs/architecture/reference/matrix-rooivalk.md index 29f20da..147c7b0 100644 --- a/docs/architecture/reference/matrix-rooivalk.md +++ b/docs/architecture/reference/matrix-rooivalk.md @@ -24,7 +24,7 @@ flowchart TD ## CRITICAL: SLM is for Reporting Only -``` +```text ┌─────────────────────────────────────────────────────────┐ │ IMPORTANT - SAFETY BOUNDARY │ ├─────────────────────────────────────────────────────────┤ @@ -109,7 +109,7 @@ interface SuggestSopOutput { ```typescript const DEFAULT_THRESHOLDS = { operator_summary: { direct_use: 0.8, facts_only: 0.65 }, - sop_suggestion: { direct_suggest: 0.78, manual_lookup: 0.65 }, + sop_suggestion: { direct_suggest: 0.8, manual_lookup: 0.65 }, }; ``` diff --git a/docs/architecture/reference/slm-implementation-matrix.md b/docs/architecture/reference/slm-implementation-matrix.md index eb1fc6e..ff50192 100644 --- 
a/docs/architecture/reference/slm-implementation-matrix.md +++ b/docs/architecture/reference/slm-implementation-matrix.md @@ -15,13 +15,13 @@ This document provides a repo-by-repo implementation matrix showing SLM endpoint ## Documentation Structure -``` +```text reference/ ├── slm-implementation-matrix.md # This file ├── matrix-gateway.md # AI Gateway details ├── matrix-cognitive-mesh.md # Cognitive Mesh details -├── matrix-codeflow.md # CodeFlow Engine details -├── matrix-agentkit.md # AgentKit Forge details +├── matrix-codeflow.md # CodeFlow Engine details +├── matrix-agentkit.md # AgentKit Forge details ├── matrix-rooivalk.md # PhoenixRooivalk details └── matrix-mystira.md # Mystira details ``` @@ -251,7 +251,7 @@ This is a practical role map, not a vendor mandate. ### Standard Fallback Pattern -``` +```text 1. SLM timeout → Deterministic rules 2. Low confidence → LLM escalation 3. Safety critical → Block immediately diff --git a/docs/architecture/reference/slm-management-plan.md b/docs/architecture/reference/slm-management-plan.md index 7c116c5..92c164d 100644 --- a/docs/architecture/reference/slm-management-plan.md +++ b/docs/architecture/reference/slm-management-plan.md @@ -40,7 +40,7 @@ Maintain a tiered model portfolio: Implement cost controls at each layer: -``` +```text Cost Control Layers ┌─────────────────────────────────────┐ │ 1. Budget caps per project │ @@ -172,7 +172,7 @@ async def security_pipeline(request: Request) -> SecurityResult: ### Fallback Hierarchy -``` +```text Request │ ▼ Primary SLM @@ -216,7 +216,7 @@ Request ### Model Lifecycle -``` +```text Discovery → Testing → Staging → Production → Deprecated → Retired │ │ │ │ │ ▼ ▼ ▼ ▼ ▼ @@ -271,4 +271,4 @@ Discovery → Testing → Staging → Production → Deprecated → Retired 6. [ ] Define fallback hierarchies 7. [ ] Implement observability stack 8. [ ] Document model lifecycle process -9. [ ] **Add explicit safety boundary for PhoenixRooivalk** +9. 
[x] Add explicit safety boundary for PhoenixRooivalk diff --git a/docs/architecture/reference/strategic/07-deployment-model.md b/docs/architecture/reference/strategic/07-deployment-model.md index e4e6f8d..7e03967 100644 --- a/docs/architecture/reference/strategic/07-deployment-model.md +++ b/docs/architecture/reference/strategic/07-deployment-model.md @@ -26,14 +26,14 @@ flowchart TD ## Decision Matrix -| System | Best SLM Jobs | Less Suitable | -| --------------- | -------------------------- | ------------------------------ | -| AI Gateway | routing, screening, cost | Nuanced synthesis | -| Cognitive Mesh | routing, decomposition | Final judgment | -| CodeFlow | PR triage, log analysis | Root cause across dependencies | -| AgentKit | tool selection, extraction | Multi-step planning | -| PhoenixRooivalk | summaries, alerts | Sole threat authority | -| Mystira | safety, continuity | Rich narrative | +| System | Best SLM Jobs | Less Suitable | +| --------------- | ------------------------------------------------------------- | ------------------------------ | +| AI Gateway | routing, screening, cost | Nuanced synthesis | +| Cognitive Mesh | routing, decomposition | Final judgment | +| CodeFlow | PR classification, CI failure triage, release-note extraction | Root cause across dependencies | +| AgentKit | tool selection, extraction | Multi-step planning | +| PhoenixRooivalk | summaries, alerts | Sole threat authority | +| Mystira | safety, continuity | Rich narrative | ## Practical Gateway Flow diff --git a/docs/architecture/systems/agentkit-forge.md b/docs/architecture/systems/agentkit-forge.md index c84a182..e61cf67 100644 --- a/docs/architecture/systems/agentkit-forge.md +++ b/docs/architecture/systems/agentkit-forge.md @@ -4,7 +4,7 @@ AgentKit Forge builds AI agents and orchestration workflows. 
SLMs help when agen ## Architecture -``` +```text Agent Task │ ▼ diff --git a/docs/architecture/systems/ai-gateway.md b/docs/architecture/systems/ai-gateway.md index 5a288f0..751f9fc 100644 --- a/docs/architecture/systems/ai-gateway.md +++ b/docs/architecture/systems/ai-gateway.md @@ -4,7 +4,7 @@ AI Gateway sits between applications and multiple AI providers. The SLM acts as ## Architecture -``` +```text Client Request │ ▼ @@ -144,3 +144,31 @@ Track per routing decision: - [ ] Add security prefiltering (injection, PII, secrets) - [ ] Set up cost tracking per tier - [ ] Configure latency alerts + +## v1 API Routing + +The gateway routes `/v1/responses` and `/v1/embeddings` requests to Azure OpenAI via LiteLLM provider configuration. + +### Routing Rules + +| Endpoint | Provider | Notes | +| ---------------- | ---------------------- | ------------------------- | +| `/v1/responses` | LiteLLM → Azure OpenAI | Standard chat completions | +| `/v1/embeddings` | LiteLLM → Azure OpenAI | Text embedding generation | + +### Example LiteLLM Config + +```yaml +model_list: + - model_name: gpt-4.1 + litellm_params: + model: azure/gpt-4.1 + api_base: https://<your-resource-name>.openai.azure.com + api_key: os.environ/AZURE_OPENAI_API_KEY + api_version: "2025-04-01-preview" +``` + +### Response vs Embeddings Handling + +- **Responses**: Model selection based on complexity/classification; supports streaming +- **Embeddings**: Batched processing; fixed deployment mapping diff --git a/docs/architecture/systems/codeflow-engine.md b/docs/architecture/systems/codeflow-engine.md index 30f5dfd..c1de01e 100644 --- a/docs/architecture/systems/codeflow-engine.md +++ b/docs/architecture/systems/codeflow-engine.md @@ -4,7 +4,7 @@ CodeFlow Engine is a DevOps and CI/CD intelligence system. 
**This is one of the ## Architecture -``` +```text Git Push / PR Event │ ▼ @@ -96,7 +96,7 @@ async def select_tests(change_type: str, impacted_files: list[str]) -> TestPlan: Type: {change_type} Files: {', '.join(impacted_files)} -Output: { "run_unit": bool, "run_integration": bool, "run_e2e": bool, "skip_reason": str|null }""" +Output: {{ "run_unit": bool, "run_integration": bool, "run_e2e": bool, "skip_reason": str|null }}""" return await slm_completion(prompt) ``` diff --git a/docs/architecture/systems/cognitive-mesh.md b/docs/architecture/systems/cognitive-mesh.md index d4f2c96..dedc502 100644 --- a/docs/architecture/systems/cognitive-mesh.md +++ b/docs/architecture/systems/cognitive-mesh.md @@ -4,7 +4,7 @@ Cognitive Mesh architectures orchestrate multiple AI agents and tools. The SLM i ## Architecture -``` +```text User Query │ ▼ @@ -143,7 +143,7 @@ async def compress_context(messages: list[Message]) -> Compressed: | Pros | Cons | | ------------------------------- | ----------------------------------------------- | -| Very large token savings | Decomposition quality can bottleneck workflow | +| Large token savings | Decomposition quality can bottleneck workflow | | Better determinism | Brittle routing if taxonomy is poor | | Easier specialist orchestration | Harder debugging if confidence handling is weak | | Improved auditability | | diff --git a/docs/architecture/systems/phoenix-rooivalk.md b/docs/architecture/systems/phoenix-rooivalk.md index dedaf96..5c0e0a7 100644 --- a/docs/architecture/systems/phoenix-rooivalk.md +++ b/docs/architecture/systems/phoenix-rooivalk.md @@ -4,7 +4,7 @@ PhoenixRooivalk is an edge AI counter-UAS (Unmanned Aerial System) system. 
**SLM ## Architecture -``` +```text Sensors │ ▼ diff --git a/docs/planning/request_to_token_attribution.md b/docs/planning/request_to_token_attribution.md index d5790c1..b0b5cd0 100644 --- a/docs/planning/request_to_token_attribution.md +++ b/docs/planning/request_to_token_attribution.md @@ -122,9 +122,9 @@ Start with downstream aggregation in pvc-costops-analytics - the cheapest and fa ### 1. cognitive-mesh (Upstream Caller) -**Required:** Pass correlation metadata in request body when calling gateway. There are two methods: +**Recommended:** Pass correlation metadata in request body when calling gateway. There are two methods: -**Method A: Via Request Metadata (Recommended)** +**Method A: Via Request Metadata (Preferred)** Pass correlation IDs in the request body `metadata` field: ```json @@ -142,7 +142,7 @@ Pass correlation IDs in the request body `metadata` field: } ``` -**Method B: Via HTTP Headers** +**Method B: Via HTTP Headers** (alternative - requires additional LiteLLM configuration or middleware) - x-request-id - x-session-id @@ -151,8 +151,6 @@ Pass correlation IDs in the request body `metadata` field: - x-stage-name - x-user-id -_Note: Method B requires additional LiteLLM configuration or middleware._ - ### 2. 
pvc-costops-analytics (Downstream Analytics) **Required:** KQL queries and dashboards to: @@ -218,7 +216,7 @@ _Note: Method B requires additional LiteLLM configuration or middleware._ - cognitive-mesh: Pass correlation metadata in request body - pvc-costops-analytics: Must create KQL queries for new event shape -- infra: Application Insights resource + APPLICATIONINSIGHTS_CONNECTION_STRING wiring added; trace export requires custom LiteLLM image (with azure-monitor-opentelemetry) or explicit OTEL_EXPORTER_OTLP_ENDPOINT configuration (currently empty by default) +- infra: Application Insights resource created; APPLICATIONINSIGHTS_CONNECTION_STRING stored in Key Vault and wired to container app via secret reference; trace export requires custom LiteLLM image (with azure-monitor-opentelemetry) or explicit OTEL_EXPORTER_OTLP_ENDPOINT configuration (currently empty by default) ## Action Items @@ -226,7 +224,7 @@ _Note: Method B requires additional LiteLLM configuration or middleware._ 1. ✅ ai-gateway: Add OTEL callback for token telemetry (Phase 1) 2. ✅ ai-gateway: Document correlation ID requirements (Phase 2) -3. ✅ ai-gateway: Add Application Insights connection string wiring (Phase 1b - trace export requires custom image or OTLP collector) +3. 
✅ ai-gateway: Add Application Insights connection string wiring via Key Vault (Phase 1b - trace export requires custom image or OTLP collector) ### Pending diff --git a/infra/env/dev/terraform.tfvars b/infra/env/dev/terraform.tfvars index aaec5e8..e2d08c8 100644 --- a/infra/env/dev/terraform.tfvars +++ b/infra/env/dev/terraform.tfvars @@ -23,3 +23,8 @@ tags = { } enable_redis_cache = true + +# State Service +state_service_container_image = "ghcr.io/phoenixvc/ai-gateway-state-service:latest" +state_service_registry_username = "phoenixvc" +state_service_registry_password = "ghp_xxx" diff --git a/infra/modules/aigateway_aca/outputs.tf b/infra/modules/aigateway_aca/outputs.tf index a8dfe6b..e6b8ff3 100644 --- a/infra/modules/aigateway_aca/outputs.tf +++ b/infra/modules/aigateway_aca/outputs.tf @@ -30,8 +30,7 @@ output "container_app_environment_id" { value = azurerm_container_app_environment.cae.id } -output "application_insights_connection_string" { - value = azurerm_application_insights.ai.connection_string - description = "Application Insights connection string for OTEL export." - sensitive = true +output "application_insights_name" { + description = "Application Insights resource name. Retrieve connection string from Key Vault secret 'appinsights-connection-string'." 
+ value = azurerm_application_insights.ai.name } diff --git a/infra/modules/dashboard_aca/main.tf b/infra/modules/dashboard_aca/main.tf index 65f86c8..ce740f4 100644 --- a/infra/modules/dashboard_aca/main.tf +++ b/infra/modules/dashboard_aca/main.tf @@ -12,7 +12,7 @@ terraform { locals { prefix = "pvc-${var.env}-${var.projname}" ca_name = "${local.prefix}-dashboard-${var.location_short}" - use_shared_token = trim(var.state_service_shared_token) != "" + use_shared_token = trimspace(var.state_service_shared_token) != "" tags = merge({ env = var.env diff --git a/infra/modules/state_service_aca/main.tf b/infra/modules/state_service_aca/main.tf index da86391..d768198 100644 --- a/infra/modules/state_service_aca/main.tf +++ b/infra/modules/state_service_aca/main.tf @@ -13,7 +13,7 @@ locals { prefix = "pvc-${var.env}-${var.projname}" ca_name = "${local.prefix}-state-${var.location_short}" use_registry_auth = var.registry_username != "" && var.registry_password != "" - use_shared_token = trim(var.state_service_shared_token) != "" + use_shared_token = trimspace(var.state_service_shared_token) != "" tags = merge({ env = var.env