From 8eef6e1cbb899dfa488628347c3d36c7edba6e5b Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 04:39:38 +0200 Subject: [PATCH 01/11] refactor: rename UAT environment to staging Renamed all infrastructure files from UAT to staging and updated all environment references in configuration files and validation rules to use "staging" instead of "uat". --- infra/env/{uat => staging}/.terraform.lock.hcl | 0 infra/env/{uat => staging}/main.tf | 0 infra/env/{uat => staging}/terraform.tfvars | 6 +++--- infra/env/{uat => staging}/variables.tf | 4 ++-- infra/modules/aigateway_aca/variables.tf | 6 +++--- infra/modules/dashboard_aca/variables.tf | 6 +++--- infra/modules/state_service_aca/variables.tf | 6 +++--- 7 files changed, 14 insertions(+), 14 deletions(-) rename infra/env/{uat => staging}/.terraform.lock.hcl (100%) rename infra/env/{uat => staging}/main.tf (100%) rename infra/env/{uat => staging}/terraform.tfvars (89%) rename infra/env/{uat => staging}/variables.tf (97%) diff --git a/infra/env/uat/.terraform.lock.hcl b/infra/env/staging/.terraform.lock.hcl similarity index 100% rename from infra/env/uat/.terraform.lock.hcl rename to infra/env/staging/.terraform.lock.hcl diff --git a/infra/env/uat/main.tf b/infra/env/staging/main.tf similarity index 100% rename from infra/env/uat/main.tf rename to infra/env/staging/main.tf diff --git a/infra/env/uat/terraform.tfvars b/infra/env/staging/terraform.tfvars similarity index 89% rename from infra/env/uat/terraform.tfvars rename to infra/env/staging/terraform.tfvars index 566fd2e..2f80c99 100644 --- a/infra/env/uat/terraform.tfvars +++ b/infra/env/staging/terraform.tfvars @@ -1,4 +1,4 @@ -env = "uat" +env = "staging" projname = "aigateway" location = "southafricanorth" location_short = "san" @@ -7,7 +7,7 @@ location_short = "san" # NOTE: The TF_VAR_azure_openai_endpoint environment variable (set via the # GitHub Environment secret AZURE_OPENAI_ENDPOINT) takes precedence over this # value during CI/CD runs. 
For local development, either set that env var or -# update this file with the correct UAT endpoint. +# update this file with the correct staging endpoint. azure_openai_endpoint = "https://mys-shared-ai-san.cognitiveservices.azure.com" codex_model = "gpt-5.3-codex" @@ -22,7 +22,7 @@ secrets_expiration_date = "2027-03-31T00:00:00Z" tags = { owner = "ai-gateway-team" project = "aigateway" - env = "uat" + env = "staging" } enable_redis_cache = true diff --git a/infra/env/uat/variables.tf b/infra/env/staging/variables.tf similarity index 97% rename from infra/env/uat/variables.tf rename to infra/env/staging/variables.tf index ff4b0e5..2bab6ad 100644 --- a/infra/env/uat/variables.tf +++ b/infra/env/staging/variables.tf @@ -1,8 +1,8 @@ variable "env" { type = string validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." } } variable "location" { type = string } diff --git a/infra/modules/aigateway_aca/variables.tf b/infra/modules/aigateway_aca/variables.tf index 377b408..7fce498 100644 --- a/infra/modules/aigateway_aca/variables.tf +++ b/infra/modules/aigateway_aca/variables.tf @@ -3,10 +3,10 @@ variable "env" { type = string - description = "Environment name (dev|uat|prod)" + description = "Environment name (dev|staging|prod)" validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." 
} } diff --git a/infra/modules/dashboard_aca/variables.tf b/infra/modules/dashboard_aca/variables.tf index 6aa0eea..7fbb116 100644 --- a/infra/modules/dashboard_aca/variables.tf +++ b/infra/modules/dashboard_aca/variables.tf @@ -1,9 +1,9 @@ variable "env" { type = string - description = "Environment name (dev|uat|prod)" + description = "Environment name (dev|staging|prod)" validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." } } diff --git a/infra/modules/state_service_aca/variables.tf b/infra/modules/state_service_aca/variables.tf index 51febf4..d11da7d 100644 --- a/infra/modules/state_service_aca/variables.tf +++ b/infra/modules/state_service_aca/variables.tf @@ -1,9 +1,9 @@ variable "env" { type = string - description = "Environment name (dev|uat|prod)" + description = "Environment name (dev|staging|prod)" validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." } } From 901509d575531fda735b3339688844a8b85a1417 Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 04:43:28 +0200 Subject: [PATCH 02/11] refactor: change environment validation from "uat" to "staging" --- infra/env/dev/variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infra/env/dev/variables.tf b/infra/env/dev/variables.tf index cf18df1..92d7c13 100644 --- a/infra/env/dev/variables.tf +++ b/infra/env/dev/variables.tf @@ -1,8 +1,8 @@ variable "env" { type = string validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." 
+ condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." } } variable "location" { type = string } From 3945195650572149ea71f1664a4d194e5aec5b54 Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 04:46:22 +0200 Subject: [PATCH 03/11] refactor(infra): replace "uat" with "staging" in environment validation --- infra/env/prod/variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infra/env/prod/variables.tf b/infra/env/prod/variables.tf index 1a9a003..efcd1be 100644 --- a/infra/env/prod/variables.tf +++ b/infra/env/prod/variables.tf @@ -1,8 +1,8 @@ variable "env" { type = string validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." } } variable "location" { type = string } From 8f6c24a8bbea9a1b266b5e49efa135d121b5b04c Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 04:53:10 +0200 Subject: [PATCH 04/11] refactor: rename UAT environment to staging Replaces all instances of "uat" with "staging" in deployment workflow and documentation, including GitHub Actions job names, environment references, and PR label triggers. 
--- .github/workflows/deploy.yaml | 24 ++++++++++++------------ docs/PRD.md | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 34a6c9f..1361556 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -33,23 +33,23 @@ env: jobs: plan: - # PR into dev → dev | PR into main + label 'run-uat' → uat | Push to main/workflow_dispatch → prod + # PR into dev → dev | PR into main + label 'run-staging' → staging | Push to main/workflow_dispatch → prod # Skip plan for PRs from forks (no repo secrets; avoids AADSTS700213) - # Runtime UAT toggle: add PR label 'run-uat' to enable UAT on PRs into main. + # Runtime UAT toggle: add PR label 'run-staging' to enable UAT on PRs into main. if: | (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false) && ( (github.event_name == 'push' && github.ref == 'refs/heads/main') || (github.event_name == 'workflow_dispatch') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'dev') || - (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-uat')) + (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-staging')) ) name: Plan ${{ matrix.environment }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: - environment: ${{ (github.event_name == 'workflow_dispatch' && fromJSON('["prod"]')) || (github.event_name == 'push' && github.ref == 'refs/heads/main' && fromJSON('["prod"]')) || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'dev' && fromJSON('["dev"]')) || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-uat') && fromJSON('["uat"]')) || 
fromJSON('["prod"]') }} + environment: ${{ (github.event_name == 'workflow_dispatch' && fromJSON('["prod"]')) || (github.event_name == 'push' && github.ref == 'refs/heads/main' && fromJSON('["prod"]')) || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'dev' && fromJSON('["dev"]')) || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-staging') && fromJSON('["staging"]')) || fromJSON('["prod"]') }} environment: ${{ matrix.environment }} defaults: run: @@ -324,18 +324,18 @@ jobs: jq -e '.enabled == true' /tmp/selection-get.json > /dev/null - deploy-uat: - name: Deploy uat + deploy-staging: + name: Deploy staging needs: plan runs-on: ubuntu-latest - if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-uat') - environment: uat + if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-staging') + environment: staging defaults: run: - working-directory: infra/env/uat + working-directory: infra/env/staging env: - TF_VAR_env: "uat" + TF_VAR_env: "staging" TF_VAR_projname: "aigateway" TF_VAR_location: "southafricanorth" TF_VAR_location_short: "san" @@ -407,7 +407,7 @@ jobs: -backend-config="resource_group_name=${TF_BACKEND_RG}" \ -backend-config="storage_account_name=${TF_BACKEND_SA}" \ -backend-config="container_name=${TF_BACKEND_CONTAINER}" \ - -backend-config="key=uat.terraform.tfstate" + -backend-config="key=staging.terraform.tfstate" - name: Import existing Container App into Terraform state uses: ./.github/actions/import-container-app @@ -416,7 +416,7 @@ jobs: env: ${{ env.TF_VAR_env }} location_short: ${{ env.TF_VAR_location_short }} subscription_id: ${{ env.AZURE_SUBSCRIPTION_ID }} - terraform_working_directory: infra/env/uat + 
terraform_working_directory: infra/env/staging - name: Terraform Plan run: | diff --git a/docs/PRD.md b/docs/PRD.md index 6cc0c70..40580c7 100644 --- a/docs/PRD.md +++ b/docs/PRD.md @@ -29,7 +29,7 @@ Roo/Qoder currently struggles with Azure model/operation mismatches. A gateway n ## 3) Environments - `dev` -- `uat` +- `staging` - `prod` Each env is independently deployable. From 4f315b2f99ddf20d13bda6843064b30e10c385d4 Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 05:59:49 +0200 Subject: [PATCH 05/11] feat: add Application Insights for OTEL tracing and complete uat->staging rename - Add Application Insights resource for trace storage - Add appinsights connection string to Key Vault and Container App env - Add OTEL configuration for azure-monitor-opentelemetry exporter - Complete uat->staging rename across all docs and scripts - Update planning doc with completed Phase 1 & 2 status --- .../actions/import-container-app/action.yml | 2 +- .github/pull_request_template.md | 6 ++-- README.md | 18 +++++------ dashboard/app.js | 2 +- docs/AZURE_OIDC_SETUP.md | 20 ++++++------ docs/CI_CD.md | 10 +++--- docs/PRD.md | 6 ++-- docs/SECRETS.md | 18 +++++------ docs/Terraform_Blueprint.md | 10 +++--- docs/planning/request_to_token_attribution.md | 27 ++++++++-------- infra/modules/aigateway_aca/main.tf | 31 +++++++++++++++++++ infra/modules/aigateway_aca/outputs.tf | 6 ++++ infra/scripts/terraform-init.ps1 | 4 +-- infra/scripts/terraform-init.sh | 10 +++--- scripts/add-federated-credentials.sh | 8 ++--- scripts/bootstrap.ps1 | 4 +-- scripts/bootstrap.sh | 6 ++-- 17 files changed, 113 insertions(+), 75 deletions(-) diff --git a/.github/actions/import-container-app/action.yml b/.github/actions/import-container-app/action.yml index cdd6872..5950bca 100644 --- a/.github/actions/import-container-app/action.yml +++ b/.github/actions/import-container-app/action.yml @@ -10,7 +10,7 @@ inputs: description: Project name component of the Container App name 
(TF_VAR_projname) env: required: true - description: Environment name (dev|uat|prod) + description: Environment name (dev|staging|prod) location_short: required: true description: Short location code (TF_VAR_location_short) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 47b33b7..2081df5 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -15,10 +15,10 @@ - [ ] No environment/config changes required - [ ] Environment/config changes required (describe below) -## UAT Toggle (PRs to `main`) +## Staging Toggle (PRs to `main`) -- Add label `run-uat` to this PR to enable UAT deployment (`deploy-uat`). -- Remove label `run-uat` to skip UAT deployment. +- Add label `run-staging` to this PR to enable staging deployment (`deploy-staging`). +- Remove label `run-staging` to skip staging deployment. ## Risk / Rollback diff --git a/README.md b/README.md index d4dc2ba..430cf15 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Creates the shared resource group, storage account, and container for Terraform ### 2. Add GitHub secrets -Add these secrets to each GitHub **Environment** (dev, uat, prod): **Settings → Environments → <env> → Environment secrets**. +Add these secrets to each GitHub **Environment** (dev, staging, prod): **Settings → Environments → <env> → Environment secrets**. | Secret | Description | Example | | ----------------------- | --------------------------------- | --------------------------------------------- | @@ -53,16 +53,16 @@ Bootstrap prints these values. For local runs, copy `infra/.env.local.example` t **Bash:** ```bash -./infra/scripts/terraform-init.sh dev # or uat, prod +./infra/scripts/terraform-init.sh dev # or staging, prod ``` **PowerShell:** ```powershell -.\infra\scripts\terraform-init.ps1 -Env dev # or uat, prod +.\infra\scripts\terraform-init.ps1 -Env dev # or staging, prod ``` -Valid environments: `dev`, `uat`, `prod`. 
+Valid environments: `dev`, `staging`, `prod`. ### 4. Plan and apply @@ -74,11 +74,11 @@ terraform apply ## Environments -| Env | Purpose | -| ---- | --------------- | -| dev | Development | -| uat | User acceptance | -| prod | Production | +| Env | Purpose | +| ------- | ----------- | +| dev | Development | +| staging | Staging | +| prod | Production | ## CI/CD diff --git a/dashboard/app.js b/dashboard/app.js index 65011e4..63675e5 100644 --- a/dashboard/app.js +++ b/dashboard/app.js @@ -53,7 +53,7 @@ function escHtml(s) { function deriveEnv(url) { if (!url) return null; - const m = url.match(/pvc-(dev|uat|prod)-/); + const m = url.match(/pvc-(dev|staging|prod)-/); return m ? m[1] : null; } diff --git a/docs/AZURE_OIDC_SETUP.md b/docs/AZURE_OIDC_SETUP.md index 509156e..0c9518b 100644 --- a/docs/AZURE_OIDC_SETUP.md +++ b/docs/AZURE_OIDC_SETUP.md @@ -12,7 +12,7 @@ If you see: Error: AADSTS700213: No matching federated identity record found for presented assertion subject 'repo:phoenixvc/ai-gateway:environment:dev' ``` -**Cause:** The workflow uses `environment: dev` (and uat/prod), so the OIDC subject is `repo:org/repo:environment:dev`. Azure must have a federated credential with that exact subject. +**Cause:** The workflow uses `environment: dev` (and staging/prod), so the OIDC subject is `repo:org/repo:environment:dev`. Azure must have a federated credential with that exact subject. ### Fix: Add environment federated credentials @@ -32,21 +32,21 @@ az ad app list --display-name pvc-shared-github-actions-oidc --query "[0].appId" 1. Go to **Azure Portal** → **Microsoft Entra ID** → **App registrations** → your app (e.g. `pvc-shared-github-actions-oidc`) 2. **Certificates & secrets** → **Federated credentials** → **Add credential** -3. For each environment (dev, uat, prod), add: +3. 
For each environment (dev, staging, prod), add: - **Federated credential scenario:** GitHub Actions deploying Azure resources - **Organization:** phoenixvc - **Repository:** ai-gateway - **Entity type:** Environment - - **Environment name:** dev (or uat, prod) - - **Name:** github-actions-dev (or uat, prod) + - **Environment name:** dev (or staging, prod) + - **Name:** github-actions-dev (or staging, prod) ### Subject formats -| Workflow config | OIDC subject | -| -------------------- | ----------------------------------------------- | -| `environment: dev` | `repo:phoenixvc/ai-gateway:environment:dev` | -| `environment: uat` | `repo:phoenixvc/ai-gateway:environment:uat` | -| `environment: prod` | `repo:phoenixvc/ai-gateway:environment:prod` | -| Branch only (no env) | `repo:phoenixvc/ai-gateway:ref:refs/heads/main` | +| Workflow config | OIDC subject | +| ---------------------- | ----------------------------------------------- | +| `environment: dev` | `repo:phoenixvc/ai-gateway:environment:dev` | +| `environment: staging` | `repo:phoenixvc/ai-gateway:environment:staging` | +| `environment: prod` | `repo:phoenixvc/ai-gateway:environment:prod` | +| Branch only (no env) | `repo:phoenixvc/ai-gateway:ref:refs/heads/main` | The federated credential **Subject** in Azure must match exactly. diff --git a/docs/CI_CD.md b/docs/CI_CD.md index ce180f0..a5a623a 100644 --- a/docs/CI_CD.md +++ b/docs/CI_CD.md @@ -6,15 +6,15 @@ This document describes the current GitHub Actions deployment behavior for `ai-g - PRs from forks are skipped for deployment-related jobs (no repo secrets). - PRs targeting `dev` run `plan` + `deploy-dev`. -- PRs targeting `main` run UAT only when the PR has label `run-uat`. +- PRs targeting `main` run staging only when the PR has label `run-staging`. - Push to `main` and `workflow_dispatch` run `plan` + `deploy-prod`. 
-## Runtime UAT toggle +## Runtime staging toggle -UAT deployment for PRs to `main` is controlled by PR label: +staging deployment for PRs to `main` is controlled by PR label: -- Add label `run-uat` to enable `deploy-uat` for that PR. -- Remove label `run-uat` to disable UAT for that PR. +- Add label `run-staging` to enable `deploy-staging` for that PR. +- Remove label `run-staging` to disable staging for that PR. ## Smoke test behavior diff --git a/docs/PRD.md b/docs/PRD.md index 40580c7..e1cee41 100644 --- a/docs/PRD.md +++ b/docs/PRD.md @@ -15,7 +15,7 @@ Roo/Qoder currently struggles with Azure model/operation mismatches. A gateway n 2. Support: - `POST /v1/responses` routed to Azure **Responses** endpoint for configurable model (default: `gpt-5.3-codex`). - `POST /v1/embeddings` routed to Azure embeddings deployment. -3. Enable **multiple environments** (dev/uat/prod) and **multiple downstream projects**. +3. Enable **multiple environments** (dev/staging/prod) and **multiple downstream projects**. 4. Infrastructure managed with **Terraform**. 5. CI/CD via **GitHub Actions** using **Azure OIDC** (no long-lived secrets). 6. “Get it working” first; hardening follows. @@ -150,7 +150,7 @@ Gateway must expose: - `docs/` - Documentation. - `infra/` - `modules/aigateway_aca` - Core Terraform module. - - `env/dev|uat|prod` - Environment-specific configurations. + - `env/dev|staging|prod` - Environment-specific configurations. - `.github/workflows/` - CI/CD pipelines. - `scripts/` - Helper scripts (bootstrap). @@ -161,7 +161,7 @@ Gateway must expose: - **Phase 1: Terraform & CI/CD** - Terraform defines infra. - GitHub Actions deploys using Azure OIDC. - - Dev auto-apply on merge; UAT/Prod gated with environment approvals. + - Dev auto-apply on merge; Staging/Prod gated with environment approvals. 
## 10) Acceptance criteria diff --git a/docs/SECRETS.md b/docs/SECRETS.md index 4463421..460097e 100644 --- a/docs/SECRETS.md +++ b/docs/SECRETS.md @@ -2,17 +2,17 @@ Copy this checklist when setting up environments for this repo. -For workflow behavior (dev/uat/prod triggers, PR label `run-uat`, and smoke-test flow), see [CI_CD.md](CI_CD.md). +For workflow behavior (dev/staging/prod triggers, PR label `run-staging`, and smoke-test flow), see [CI_CD.md](CI_CD.md). ## Where to add secrets Add these as **Environment secrets** in GitHub: - **Settings → Environments → dev → Environment secrets** -- **Settings → Environments → uat → Environment secrets** +- **Settings → Environments → staging → Environment secrets** - **Settings → Environments → prod → Environment secrets** -> This workflow is environment-based (`environment: dev|uat|prod`), so each environment should have the full secret set. +> This workflow is environment-based (`environment: dev|staging|prod`), so each environment should have the full secret set. ## Required secrets (all environments) @@ -53,7 +53,7 @@ When `STATE_SERVICE_CONTAINER_IMAGE` is set (state-service enabled), set this se ## Copy/paste template -Use this block as a setup checklist when creating/updating `dev`, `uat`, and `prod`: +Use this block as a setup checklist when creating/updating `dev`, `staging`, and `prod`: ```text AZURE_CLIENT_ID= @@ -82,13 +82,13 @@ STATE_SERVICE_REGISTRY_PASSWORD= # required for priv - [ ] `AIGATEWAY_KEY` matches the key expected by the deployed gateway. - [ ] OIDC federated credentials exist for each environment subject: - `repo:phoenixvc/ai-gateway:environment:dev` - - `repo:phoenixvc/ai-gateway:environment:uat` + - `repo:phoenixvc/ai-gateway:environment:staging` - `repo:phoenixvc/ai-gateway:environment:prod` -## Runtime UAT toggle +## Runtime staging toggle -- UAT deploy on PRs into `main` is controlled by PR label `run-uat`. -- Add label `run-uat` to enable `deploy-uat` for that PR. 
-- Remove label `run-uat` to skip UAT for that PR. +- Staging deploy on PRs into `main` is controlled by PR label `run-staging`. +- Add label `run-staging` to enable `deploy-staging` for that PR. +- Remove label `run-staging` to skip staging for that PR. For OIDC troubleshooting, see [AZURE_OIDC_SETUP.md](AZURE_OIDC_SETUP.md). diff --git a/docs/Terraform_Blueprint.md b/docs/Terraform_Blueprint.md index 54ea563..f1027b0 100644 --- a/docs/Terraform_Blueprint.md +++ b/docs/Terraform_Blueprint.md @@ -3,7 +3,7 @@ This canvas includes a working Terraform scaffold: - `infra/modules/aigateway_aca` -- `infra/env/dev|uat|prod` +- `infra/env/dev|staging|prod` - Shared state configured via `terraform init -backend-config=...` in GitHub Actions > Notes: @@ -27,7 +27,7 @@ infra/ main.tf variables.tf terraform.tfvars - uat/ + staging/ main.tf variables.tf terraform.tfvars @@ -44,7 +44,7 @@ infra/ ```hcl variable "env" { type = string - description = "Environment name (dev|uat|prod)" + description = "Environment name (dev|staging|prod)" } variable "projname" { @@ -343,7 +343,7 @@ output "key_vault_name" { ## 5) Env stacks -### 5.1 `infra/env/dev/variables.tf` (repeat for uat/prod) +### 5.1 `infra/env/dev/variables.tf` (repeat for staging/prod) ```hcl variable "env" { type = string } @@ -441,7 +441,7 @@ tags = { } ``` -Repeat the env folders for `uat` and `prod`, changing only `env` and tags. +Repeat the env folders for `staging` and `prod`, changing only `env` and tags. 
--- diff --git a/docs/planning/request_to_token_attribution.md b/docs/planning/request_to_token_attribution.md index 34a282c..d5d347f 100644 --- a/docs/planning/request_to_token_attribution.md +++ b/docs/planning/request_to_token_attribution.md @@ -65,11 +65,11 @@ LiteLLM's OTEL callback automatically emits spans with: ### Phase 2: Correlation ID Propagation -**Status: In Progress** +**Status: ✅ Done** Correlation IDs flow through the system in two ways: -**Method A: Via Request Metadata (Recommended)** +**Method A: Via Request Metadata (Implemented)** Pass correlation IDs in the request body `metadata` field: ```json @@ -207,18 +207,18 @@ _Note: Method B requires additional LiteLLM configuration or middleware._ ## Acceptance Criteria -| Criterion | Status | Notes | -| -------------------------------------------- | ---------- | ----------------------------------------- | -| 100% of LLM calls emit token telemetry | ✅ Done | Via OTEL callback | -| 100% include workflow + stage | ⚠️ Partial | Requires upstream to pass metadata | -| Support KQL joins by operation_Id/request_id | ✅ Done | OTEL spans include metadata | -| Request-completion rollup totals | 🔜 Future | Requires Phase 3 (downstream aggregation) | +| Criterion | Status | Notes | +| -------------------------------------------- | --------- | ----------------------------------------- | +| 100% of LLM calls emit token telemetry | ✅ Done | Via OTEL callback | +| 100% include workflow + stage | ✅ Done | Metadata passed through to OTEL spans | +| Support KQL joins by operation_Id/request_id | ✅ Done | OTEL spans include metadata | +| Request-completion rollup totals | 🔜 Future | Requires Phase 3 (downstream aggregation) | ## Dependencies -- cognitive-mesh: Must pass correlation headers to gateway +- cognitive-mesh: Must pass correlation metadata to gateway - pvc-costops-analytics: Must create KQL queries for new event shape -- infra: May need custom LiteLLM container image or OTEL collector +- infra: Application 
Insights added for trace storage ## Action Items @@ -226,12 +226,13 @@ _Note: Method B requires additional LiteLLM configuration or middleware._ 1. ✅ ai-gateway: Add OTEL callback for token telemetry (Phase 1) 2. ✅ ai-gateway: Document correlation ID requirements (Phase 2) +3. ✅ ai-gateway: Add Application Insights for trace storage (Phase 1b) ### Pending -3. cognitive-mesh: Pass correlation IDs in request metadata -4. pvc-costops-analytics: Create KQL queries for OTEL span joins -5. pvc-costops-analytics: Implement request rollup aggregation (Phase 3) +4. cognitive-mesh: Pass correlation metadata in request body +5. pvc-costops-analytics: Create KQL queries for OTEL span joins +6. pvc-costops-analytics: Implement request rollup aggregation (Phase 3) --- diff --git a/infra/modules/aigateway_aca/main.tf b/infra/modules/aigateway_aca/main.tf index 412723c..9216a67 100644 --- a/infra/modules/aigateway_aca/main.tf +++ b/infra/modules/aigateway_aca/main.tf @@ -128,6 +128,15 @@ resource "azurerm_log_analytics_workspace" "law" { tags = local.tags } +resource "azurerm_application_insights" "ai" { + name = "${local.prefix}-ai-${var.location_short}" + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + application_type = "web" + retention_in_days = var.env == "prod" ? 
90 : 30 + tags = local.tags +} + resource "azurerm_container_app_environment" "cae" { name = local.cae_name location = azurerm_resource_group.rg.location @@ -245,6 +254,15 @@ resource "azurerm_key_vault_secret" "langfuse_secret_key" { depends_on = [azurerm_key_vault_access_policy.terraform_client] } +resource "azurerm_key_vault_secret" "appinsights_connection_string" { + name = "appinsights-connection-string" + value = azurerm_application_insights.ai.connection_string + key_vault_id = azurerm_key_vault.kv.id + expiration_date = var.secrets_expiration_date + + depends_on = [azurerm_key_vault_access_policy.terraform_client] +} + resource "azurerm_user_assigned_identity" "ca" { name = "${local.ca_name}-id" resource_group_name = azurerm_resource_group.rg.name @@ -330,6 +348,12 @@ resource "azurerm_container_app" "ca" { } } + secret { + name = "appinsights-connection-string" + key_vault_secret_id = azurerm_key_vault_secret.appinsights_connection_string.versionless_id + identity = azurerm_user_assigned_identity.ca.id + } + template { min_replicas = var.min_replicas max_replicas = var.max_replicas @@ -429,6 +453,13 @@ resource "azurerm_container_app" "ca" { } } + # Azure Application Insights connection string (for azure-monitor-opentelemetry exporter) + # Use with custom LiteLLM image that includes azure-monitor-opentelemetry package + env { + name = "APPLICATIONINSIGHTS_CONNECTION_STRING" + secret_name = "appinsights-connection-string" + } + # LiteLLM commonly listens on 4000; set port as needed } } diff --git a/infra/modules/aigateway_aca/outputs.tf b/infra/modules/aigateway_aca/outputs.tf index f00e81a..a8dfe6b 100644 --- a/infra/modules/aigateway_aca/outputs.tf +++ b/infra/modules/aigateway_aca/outputs.tf @@ -29,3 +29,9 @@ output "container_app_environment_id" { description = "ID of the Container App Environment — used by sibling modules (e.g. dashboard_aca) to deploy into the same environment." 
value = azurerm_container_app_environment.cae.id } + +output "application_insights_connection_string" { + value = azurerm_application_insights.ai.connection_string + description = "Application Insights connection string for OTEL export." + sensitive = true +} diff --git a/infra/scripts/terraform-init.ps1 b/infra/scripts/terraform-init.ps1 index 6b3dee5..9ba1588 100644 --- a/infra/scripts/terraform-init.ps1 +++ b/infra/scripts/terraform-init.ps1 @@ -1,7 +1,7 @@ # Load .env.local and run terraform init -upgrade -# Usage: .\infra\scripts\terraform-init.ps1 [dev|uat|prod] +# Usage: .\infra\scripts\terraform-init.ps1 [dev|staging|prod] -param([Parameter(Mandatory=$true)][ValidateSet("dev","uat","prod")][string]$Env) +param([Parameter(Mandatory=$true)][ValidateSet("dev","staging","prod")][string]$Env) $envFile = Join-Path $PSScriptRoot ".." ".env.local" if (-not (Test-Path $envFile)) { diff --git a/infra/scripts/terraform-init.sh b/infra/scripts/terraform-init.sh index 52f3300..c7f5798 100644 --- a/infra/scripts/terraform-init.sh +++ b/infra/scripts/terraform-init.sh @@ -1,15 +1,15 @@ #!/bin/bash # Load .env.local and run terraform init -upgrade -# Usage: ./infra/scripts/terraform-init.sh [dev|uat|prod] +# Usage: ./infra/scripts/terraform-init.sh [dev|staging|prod] set -e -ENV="${1:?Usage: $0 dev|uat|prod}" +ENV="${1:?Usage: $0 dev|staging|prod}" case "$ENV" in - dev|uat|prod) ;; + dev|staging|prod) ;; *) - echo "Usage: $0 dev|uat|prod" - echo "Error: ENV must be dev, uat, or prod; got: $ENV" + echo "Usage: $0 dev|staging|prod" + echo "Error: ENV must be dev, staging, or prod; got: $ENV" exit 1 ;; esac diff --git a/scripts/add-federated-credentials.sh b/scripts/add-federated-credentials.sh index f7740bc..9969b0c 100644 --- a/scripts/add-federated-credentials.sh +++ b/scripts/add-federated-credentials.sh @@ -3,7 +3,7 @@ set -e # Add Federated Credentials for GitHub Actions Environments # Use this script if you already ran bootstrap and got AADSTS700213 because -# the 
workflow uses environment: dev/uat/prod but Azure only had branch-based credentials. +# the workflow uses environment: dev/staging/prod but Azure only had branch-based credentials. # # Usage: $0 # Example: $0 abc123-def456 phoenixvc ai-gateway @@ -11,7 +11,7 @@ set -e if [ "$#" -ne 3 ]; then echo "Usage: $0 " echo "" - echo "Adds federated identity credentials for dev, uat, prod environments" + echo "Adds federated identity credentials for dev, staging, prod environments" echo "to an existing Azure AD app registration (fixes AADSTS700213)." echo "" echo "Example: $0 \$(az ad app list --display-name pvc-shared-github-actions-oidc --query [0].appId -o tsv) phoenixvc ai-gateway" @@ -30,8 +30,8 @@ fi command -v jq >/dev/null 2>&1 || { echo "Error: jq is required for safe JSON construction. Install jq and retry."; exit 1; } -echo "Ensuring federated credentials for environments (dev, uat, prod) on app $APP_ID..." -for ENV in dev uat prod; do +echo "Ensuring federated credentials for environments (dev, staging, prod) on app $APP_ID..." +for ENV in dev staging prod; do SUBJECT="repo:$GITHUB_ORG/$GITHUB_REPO:environment:$ENV" EXISTING_SUBJECT=$(az ad app federated-credential list --id "$OBJECT_ID" --query "[?name=='github-actions-$ENV'].subject" -o tsv 2>/dev/null | head -n1) if [ -n "$EXISTING_SUBJECT" ] && [ "$EXISTING_SUBJECT" = "$SUBJECT" ]; then diff --git a/scripts/bootstrap.ps1 b/scripts/bootstrap.ps1 index 19a8e44..0583b05 100644 --- a/scripts/bootstrap.ps1 +++ b/scripts/bootstrap.ps1 @@ -120,8 +120,8 @@ $bytes = New-Object byte[] 32 [System.Security.Cryptography.RandomNumberGenerator]::Create().GetBytes($bytes) $AIGATEWAY_KEY = [Convert]::ToBase64String($bytes) -Write-Host "Ensuring Federated Credentials for GitHub Actions (environments: dev, uat, prod)..." -foreach ($EnvName in @("dev","uat","prod")) { +Write-Host "Ensuring Federated Credentials for GitHub Actions (environments: dev, staging, prod)..." 
+foreach ($EnvName in @("dev","staging","prod")) { $SUBJECT = "repo:" + $GITHUB_ORG + "/" + $GITHUB_REPO + ":environment:" + $EnvName $EXISTING_SUBJECT = az ad app federated-credential list --id $OBJECT_ID --query "[?name=='github-actions-$EnvName'].subject" -o tsv 2>$null | Select-Object -First 1 if ($EXISTING_SUBJECT -and ($EXISTING_SUBJECT -eq $SUBJECT)) { diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index aa093f0..18417dc 100644 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -13,7 +13,7 @@ GITHUB_REPO="$2" SCOPE="$3" # --- Configuration --- -# Shared infra: OIDC app and TF state span dev/uat/prod +# Shared infra: OIDC app and TF state span dev/staging/prod LOCATION="southafricanorth" RG_NAME="pvc-shared-tfstate-rg-san" CONTAINER_NAME="tfstate" @@ -119,9 +119,9 @@ OBJECT_ID=$(az ad app show --id "$APP_ID" --query id --output tsv) AIGATEWAY_KEY=$(openssl rand -base64 32 2>/dev/null || head -c 32 /dev/urandom | base64) -echo "Ensuring Federated Credentials for GitHub Actions (environments: dev, uat, prod)..." +echo "Ensuring Federated Credentials for GitHub Actions (environments: dev, staging, prod)..." command -v jq >/dev/null 2>&1 || { echo "Error: jq is required for safe JSON construction. 
Install jq and retry."; exit 1; } -for ENV in dev uat prod; do +for ENV in dev staging prod; do SUBJECT="repo:$GITHUB_ORG/$GITHUB_REPO:environment:$ENV" EXISTING_SUBJECT=$(az ad app federated-credential list --id "$OBJECT_ID" --query "[?name=='github-actions-$ENV'].subject" -o tsv 2>/dev/null | head -n1) if [ -n "$EXISTING_SUBJECT" ] && [ "$EXISTING_SUBJECT" = "$SUBJECT" ]; then From eb07474fd57c7aee706c1d131736dc23fab0675c Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 07:47:40 +0200 Subject: [PATCH 06/11] fix: address PR review comments - terminology consistency - deploy.yaml: fix 'Runtime UAT toggle' comment to 'Runtime staging toggle' - request_to_token_attribution.md: update acceptance criteria status to 'Ready' with dependency notes, fix tense on infra dependency - PRD.md: replace Dev/UAT/Prod with Dev/staging/Prod, fix M2 milestone - README.md: fix 'UAT toggle' to 'staging toggle' --- .github/workflows/deploy.yaml | 2 +- README.md | 2 +- docs/PRD.md | 4 ++-- docs/planning/request_to_token_attribution.md | 14 +++++++------- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 1361556..2a0e590 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -35,7 +35,7 @@ jobs: plan: # PR into dev → dev | PR into main + label 'run-staging' → staging | Push to main/workflow_dispatch → prod # Skip plan for PRs from forks (no repo secrets; avoids AADSTS700213) - # Runtime UAT toggle: add PR label 'run-staging' to enable UAT on PRs into main. + # Runtime staging toggle: add PR label 'run-staging' to enable staging on PRs into main. 
if: | (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false) && ( diff --git a/README.md b/README.md index 430cf15..dabd2de 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,6 @@ pnpm format - [PRD](docs/PRD.md) – Product requirements - [Terraform Blueprint](docs/Terraform_Blueprint.md) – Infrastructure design -- [CI/CD Runbook](docs/CI_CD.md) – workflow behavior, UAT toggle, smoke tests +- [CI/CD Runbook](docs/CI_CD.md) – workflow behavior, staging toggle, smoke tests - [Azure OIDC Setup](docs/AZURE_OIDC_SETUP.md) – GitHub Actions OIDC configuration - [Secrets Checklist](docs/SECRETS.md) – Copy/paste setup for GitHub environment secrets diff --git a/docs/PRD.md b/docs/PRD.md index e1cee41..710823f 100644 --- a/docs/PRD.md +++ b/docs/PRD.md @@ -167,7 +167,7 @@ Gateway must expose: 1. Roo/Qoder can use gateway for coding with configured model (default `gpt-5.3-codex`) without `chatCompletion operation does not work`. 2. Codebase indexing completes using embeddings through the gateway. -3. Dev/UAT/Prod are reproducible via Terraform + Actions. +3. Dev/staging/Prod are reproducible via Terraform + Actions. 4. No secrets committed. ## 11) Risks & mitigations @@ -180,5 +180,5 @@ Gateway must expose: - M0: Repo setup, Bootstrap scripts (OIDC, State Backend). - M1: Dev env deployed; smoke tests pass; Roo works. -- M2: UAT + Prod; environment approvals. +- M2: staging + Prod; environment approvals. - M3: Hardening (Front Door/WAF, Entra auth). 
diff --git a/docs/planning/request_to_token_attribution.md b/docs/planning/request_to_token_attribution.md index d5d347f..89a7a49 100644 --- a/docs/planning/request_to_token_attribution.md +++ b/docs/planning/request_to_token_attribution.md @@ -207,18 +207,18 @@ _Note: Method B requires additional LiteLLM configuration or middleware._ ## Acceptance Criteria -| Criterion | Status | Notes | -| -------------------------------------------- | --------- | ----------------------------------------- | -| 100% of LLM calls emit token telemetry | ✅ Done | Via OTEL callback | -| 100% include workflow + stage | ✅ Done | Metadata passed through to OTEL spans | -| Support KQL joins by operation_Id/request_id | ✅ Done | OTEL spans include metadata | -| Request-completion rollup totals | 🔜 Future | Requires Phase 3 (downstream aggregation) | +| Criterion | Status | Notes | +| -------------------------------------------- | --------- | ------------------------------------------------------- | +| 100% of LLM calls emit token telemetry | ✅ Done | Via OTEL callback | +| 100% include workflow + stage | 🔜 Ready | Requires cognitive-mesh to pass metadata to gateway | +| Support KQL joins by operation_Id/request_id | 🔜 Ready | Requires pvc-costops-analytics to implement KQL queries | +| Request-completion rollup totals | 🔜 Future | Requires Phase 3 (downstream aggregation) | ## Dependencies - cognitive-mesh: Must pass correlation metadata to gateway - pvc-costops-analytics: Must create KQL queries for new event shape -- infra: Application Insights added for trace storage +- infra: Application Insights being added for trace storage ## Action Items From 9d5b99b7df71f057df559d3528286fbf0d44b67b Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 08:01:09 +0200 Subject: [PATCH 07/11] docs: add SLM architecture and guides documentation - docs/architecture/README.md: SLM characteristics, patterns, advantages, limitations - docs/guides/README.md: Practical guidance on when to use SLM 
vs LLM, implementation patterns --- docs/architecture/README.md | 152 ++++++++++++++++++++++++++++++++++ docs/guides/README.md | 158 ++++++++++++++++++++++++++++++++++++ 2 files changed, 310 insertions(+) create mode 100644 docs/architecture/README.md create mode 100644 docs/guides/README.md diff --git a/docs/architecture/README.md b/docs/architecture/README.md new file mode 100644 index 0000000..44bc8fe --- /dev/null +++ b/docs/architecture/README.md @@ -0,0 +1,152 @@ +# Small Language Models (SLM) + +A Small Language Model (SLM) is a language model with significantly fewer parameters and lower computational requirements than large models such as GPT-class systems. While definitions vary, SLMs typically fall into the tens of millions to a few billion parameters, compared with tens or hundreds of billions in large models. + +## Examples + +- Phi-2 +- Phi-3 +- Llama 3 8B +- Gemma +- Mistral 7B + +Although some of these approach the boundary between "small" and "medium," they are still commonly used where full-scale LLM infrastructure is impractical. + +## Core Characteristics + +| Property | Small Language Model | Large Language Model | +| ----------------- | ------------------------------ | -------------------- | +| Parameter count | ~10M–10B | 50B–1T+ | +| Hardware | CPU / small GPU / edge devices | multi-GPU clusters | +| Latency | low | moderate/high | +| Memory footprint | small | large | +| Cost per request | low | higher | +| Reasoning ability | limited | stronger | + +## Architectural Patterns + +### 1. Cascade Architecture + +``` +SLM + ↓ confidence high +Return result + +SLM + ↓ confidence low +LLM escalation +``` + +This is widely used in AI cost optimization pipelines. + +### 2. Router + Specialists + +``` +Router (SLM) + ├─ Code model + ├─ Security model + ├─ Cost analysis model + └─ General LLM fallback +``` + +SLMs act as intent classifiers. + +### 3. 
Local-First AI + +``` +Device + ├─ SLM + ├─ embeddings + └─ local vector store +``` + +Cloud models are only used when needed. + +## Typical Modern AI Stack + +``` + User Request + │ + Router (SLM) + ┌──────────┼──────────┐ + │ │ │ + Tool call Specialist LLM + (cheap) (SLM) (expensive) +``` + +This hybrid architecture is becoming the dominant design pattern in AI systems. + +## Advantages + +### 1. Cost Efficiency + +A typical cost comparison: + +| Model type | Approx cost | +| ---------- | ------------------------- | +| SLM | ~1–5% of large model cost | +| LLM | baseline | + +For large pipelines this difference becomes dominant. + +### 2. Low Latency + +SLMs can respond in 10–100 ms, especially when running locally. + +### 3. Deployability + +They can run on: + +- CPUs +- edge GPUs +- phones +- embedded boards + +### 4. Privacy and Data Control + +Data never leaves the environment. Important for: + +- healthcare +- finance +- internal enterprise tooling + +## Limitations + +### 1. Reduced Reasoning Ability + +SLMs struggle with: + +- multi-step reasoning +- long planning chains +- abstract reasoning +- ambiguous tasks + +### 2. Smaller Context Windows + +Often limited to 4k–32k tokens, though some newer ones extend further. + +### 3. Knowledge Coverage + +Because they are smaller: + +- less general knowledge +- more hallucination risk without grounding + +### 4. 
Prompt Sensitivity + +They require: + +- cleaner prompts +- tighter task definitions +- structured inputs + +## Practical Tradeoff Summary + +| Factor | Prefer SLM | Prefer LLM | +| --------------- | ---------- | ---------- | +| cost | ✓ | | +| latency | ✓ | | +| edge deployment | ✓ | | +| reasoning | | ✓ | +| creativity | | ✓ | +| complex tasks | | ✓ | diff --git a/docs/guides/README.md b/docs/guides/README.md new file mode 100644 index 0000000..aa5ca3b --- /dev/null +++ b/docs/guides/README.md @@ -0,0 +1,158 @@ +# SLM Implementation Guide + +## When to Use an SLM + +SLMs are appropriate when: + +- the task is structured +- the domain is narrow +- latency matters +- cost must be minimized +- inference must run locally + +### Typical Examples + +- CI/CD pipelines +- log classification +- code lint summarization +- telemetry tagging +- router agents +- RAG query classification + +## When to Use a Large Model + +Use a full LLM when: + +- deep reasoning is required +- complex architecture planning +- research tasks +- multi-step problem solving +- creative generation +- ambiguous user requests + +## Typical Use Cases + +### 1. Edge and Embedded AI + +SLMs are commonly used on-device. + +Examples: + +- mobile assistants +- IoT systems +- robotics +- drones +- offline copilots + +Advantages: + +- no cloud latency +- privacy (data never leaves device) +- deterministic cost + +This aligns with architectures where inference must run on Jetson, mobile chips, or CPUs. + +### 2. Specialized Task Models + +SLMs perform well when the task domain is narrow. + +Examples: + +- classification +- log analysis +- document tagging +- code lint explanation +- schema validation +- chatbot for a specific knowledge base + +In many cases an SLM + RAG outperforms a large model with no context. + +### 3. Agent Systems and Routing + +SLMs are often used as cheap first-pass models. 
+ +Typical pattern: + +``` +User request + ↓ +Router (SLM) + ↓ +Decision + ├─ handle locally + ├─ call tool + └─ escalate to large model +``` + +Benefits: + +- large model usage drops significantly +- lower operational cost +- deterministic routing + +### 4. High-Throughput Batch Processing + +SLMs are useful for: + +- codebase analysis +- repository indexing +- log classification +- telemetry tagging +- document chunk summarization + +When processing millions of documents, the cost difference is substantial. + +## Implementation Patterns + +### Router Pattern Implementation + +```python +async def route_request(request: str) -> Response: + # Use SLM for classification/routing + intent = await slm_classify(request) + + if intent == "simple": + return await handle_locally(request) + elif intent == "tool": + return await call_tool(request) + else: + # Escalate to full LLM + return await llm_complete(request) +``` + +### Cascade Pattern Implementation + +```python +async def cascade(request: str) -> Response: + # First try SLM + result = await slm_complete(request) + + # Check confidence + if result.confidence > 0.85: + return result + + # Escalate to LLM for low confidence + return await llm_complete(request) +``` + +### Local-First Pattern + +``` +Device + ├─ SLM (local inference) + ├─ embeddings (local compute) + └─ local vector store (SQLite/Chroma) +``` + +Cloud models only used when local SLM cannot handle the request. 
+ +## Cost Optimization Example + +For a system processing 1M requests/day: + +| Model | Cost/request | Daily cost | Monthly cost | +| ---------- | ------------ | ---------- | ------------ | +| SLM (7B) | $0.001 | $1,000 | $30,000 | +| LLM (175B) | $0.05 | $50,000 | $1,500,000 | + +**Potential savings: 98%** From cab02059cd62f733cfa2251800f7bb6ac778b10d Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 08:07:36 +0200 Subject: [PATCH 08/11] docs: add architecture documentation for all systems and SLM management plan - docs/architecture/: Add README, ai-gateway, cognitive-mesh, phoenix-rooivalk, codeflow-engine, agentkit-forge, cross-system, and slm-management-plan - docs/guides/: Update index - Remove empty infra/env/uat directory --- docs/architecture/README.md | 165 +++------------ docs/architecture/agentkit-forge.md | 164 +++++++++++++++ docs/architecture/ai-gateway.md | 125 +++++++++++ docs/architecture/codeflow-engine.md | 144 +++++++++++++ docs/architecture/cognitive-mesh.md | 138 +++++++++++++ docs/architecture/cross-system.md | 115 +++++++++++ docs/architecture/phoenix-rooivalk.md | 144 +++++++++++++ docs/architecture/slm-management-plan.md | 253 +++++++++++++++++++++++ docs/guides/README.md | 163 +-------------- 9 files changed, 1119 insertions(+), 292 deletions(-) create mode 100644 docs/architecture/agentkit-forge.md create mode 100644 docs/architecture/ai-gateway.md create mode 100644 docs/architecture/codeflow-engine.md create mode 100644 docs/architecture/cognitive-mesh.md create mode 100644 docs/architecture/cross-system.md create mode 100644 docs/architecture/phoenix-rooivalk.md create mode 100644 docs/architecture/slm-management-plan.md diff --git a/docs/architecture/README.md b/docs/architecture/README.md index 44bc8fe..d3f37c2 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -1,152 +1,39 @@ -# Small Language Models (SLM) +# Architecture -A Small Language Model (SLM) is a language model with significantly 
fewer parameters and lower computational requirements than large models such as GPT-class systems. While definitions vary, SLMs typically fall into the tens of millions to a few billion parameters, compared with tens or hundreds of billions in large models. +This directory contains system architecture documentation for the AI Gateway and related systems. -## Examples +## Overview -- Phi-2 -- Phi-3 -- Llama 3 8B -- Gemma -- Mistral 7B +The architecture follows a layered approach combining: -Although some of these approach the boundary between "small" and "medium," they are still commonly used where full-scale LLM infrastructure is impractical. +- **SLMs (Small Language Models)** for cost-effective routing, classification, and tool selection +- **LLMs** for complex reasoning and final synthesis -## Core Characteristics +## Documentation -| Property | Small Language Model | Large Language Model | -| ----------------- | ------------------------------ | -------------------- | -| Parameter count | ~10M–10B | 50B–1T+ | -| Hardware | CPU / small GPU / edge devices | multi-GPU clusters | -| Latency | low | moderate/high | -| Memory footprint | small | large | -| Cost per request | low | higher | -| Reasoning ability | limited | stronger | +### Core Concepts -## Architectural Patterns +- [README](README.md) - SLM fundamentals, characteristics, patterns +- [cross-system.md](cross-system.md) - How all systems integrate -### 1. Cascade Architecture +### Project-Specific -``` -SLM - ↓ confidence high -Return result +- [ai-gateway.md](ai-gateway.md) - AI Gateway architecture +- [cognitive-mesh.md](cognitive-mesh.md) - Agent orchestration +- [phoenix-rooivalk.md](phoenix-rooivalk.md) - Edge AI system +- [codeflow-engine.md](codeflow-engine.md) - CI/CD intelligence +- [agentkit-forge.md](agentkit-forge.md) - Agent building framework -SLM - ↓ confidence low -LLM escalation -``` +### Planning -This is widely used in AI cost optimization pipelines. 
+- [slm-management-plan.md](slm-management-plan.md) - Cross-project SLM management -### 2. Router + Specialists +## Quick Reference -``` -Router (SLM) - ├─ Code model - ├─ Security model - ├─ Cost analysis model - └─ General LLM fallback -``` - -SLMs act as intent classifiers. - -### 3. Local-First AI - -``` -Device - ├─ SLM - ├─ embeddings - └─ local vector store -``` - -Cloud models are only used when needed. - -## Typical Modern AI Stack - -``` - User Request - │ - Router (SLM) - ┌──────────┼──────────┐ - │ │ │ - Tool call Specialist LLM - (cheap) (SLM) (expensive) -``` - -This hybrid architecture is becoming the dominant design pattern in AI systems. - -## Advantages - -### 1. Cost Efficiency - -A typical cost comparison: - -| Model type | Approx cost | -| ---------- | ------------------------- | -| SLM | ~1–5% of large model cost | -| LLM | baseline | - -For large pipelines this difference becomes dominant. - -### 2. Low Latency - -SLMs can respond in 10–100 ms, especially when running locally. - -### 3. Deployability - -They can run on: - -- CPUs -- edge GPUs -- phones -- embedded boards - -### 4. Privacy and Data Control - -Data never leaves the environment. Important for: - -- healthcare -- finance -- internal enterprise tooling - -## Limitations - -### 1. Reduced Reasoning Ability - -SLMs struggle with: - -- multi-step reasoning -- long planning chains -- abstract reasoning -- ambiguous tasks - -### 2. Smaller Context Windows - -Often limited to 4k–32k tokens, though some newer ones extend further. - -### 3. Knowledge Coverage - -Because they are smaller: - -- less general knowledge -- more hallucination risk without grounding - -### 4. 
Prompt Sensitivity - -They require: - -- cleaner prompts -- tighter task definitions -- structured inputs - -## Practical Tradeoff Summary - -| Factor | Prefer SLM | Prefer LLM | -| --------------- | ---------- | ---------- | -| cost | ✓ | | -| latency | ✓ | | -| edge deployment | ✓ | | -| reasoning | | ✓ | -| creativity | | ✓ | -| complex tasks | | ✓ | +| System | SLM Role | Key Document | +| --------------- | ----------------------------------- | ------------------- | +| AI Gateway | routing, policy checks | ai-gateway.md | +| Cognitive Mesh | agent routing, task decomposition | cognitive-mesh.md | +| PhoenixRooivalk | edge telemetry analysis | phoenix-rooivalk.md | +| CodeFlow Engine | CI intelligence, log analysis | codeflow-engine.md | +| AgentKit Forge | tool selection, context compression | agentkit-forge.md | diff --git a/docs/architecture/agentkit-forge.md b/docs/architecture/agentkit-forge.md new file mode 100644 index 0000000..1b93cab --- /dev/null +++ b/docs/architecture/agentkit-forge.md @@ -0,0 +1,164 @@ +# AgentKit Forge + +AgentKit Forge builds AI agents and orchestration workflows. SLMs help build scalable multi-agent systems. + +## Architecture + +``` +Agent Task + │ + ▼ +SLM Tool Selector + │ + ├─ GitHub API + ├─ Azure CLI + ├─ Terraform + └─ Documentation Search +``` + +## SLM Use Cases + +### 1. Tool Selection + +Agent decides which tool to invoke. + +**Example tools:** + +- GitHub API +- Azure CLI +- Terraform +- Kusto queries +- File system operations + +**SLM output:** + +```json +{ + "tool": "azure_cli", + "command": "az monitor metrics list", + "args": { + "resource": "/subscriptions/.../appinsights/...", + "metric": "requests" + }, + "confidence": 0.92 +} +``` + +### 2. Context Compression + +Agents accumulate long conversation histories. 
+ +SLM compresses them: + +```json +{ + "previous_state_summary": "User requested Azure cost analysis", + "relevant_files": ["infra/main.tf", "infra/outputs.tf"], + "active_task": "generating cost breakdown" +} +``` + +### 3. Token Budget Control + +SLM predicts which context segments are needed before invoking a large model. + +```python +# Before calling expensive LLM +context_plan = await slm_plan_context( + task="analyze deployment", + available_context=["git_diff", "terraform_plan", "logs", "metrics"] +) + +# Returns: +{ + "required_context": ["terraform_plan"], + "optional_context": ["logs"], + "estimated_tokens": 8000, + "can_fit_in_window": true +} +``` + +## Implementation + +### Tool Selection + +```python +async def select_tool(task: str, available_tools: list[Tool]) -> ToolInvocation: + prompt = f"""Select the best tool for this task. + +Task: {task} + +Available tools: +{format_tools(available_tools)} + +Output: tool_name, args, confidence""" + + result = await slm_completion(prompt) + return ToolInvocation( + tool=result.tool, + args=result.args, + confidence=result.confidence + ) +``` + +### Context Planning + +```python +async def plan_context(task: str, context_options: list[Context]) -> ContextPlan: + prompt = f"""Plan which context to use for this task. 
+ +Task: {task} + +Available context: +{format_context(context_options)} + +Output: required_context, optional_context, estimated_tokens""" + + return await slm_completion(prompt) +``` + +### Multi-Step Reasoning + +```python +async def execute_agent_task(task: str) -> AgentResult: + # Step 1: Decompose + plan = await slm_decompose(task) + + # Step 2: Execute each step with tool selection + for step in plan.steps: + tool = await select_tool(step.description, available_tools) + result = await execute_tool(tool) + + # Step 3: Check if escalation needed + if result.complexity == "high": + llm_result = await llm_complete(step, context) + result = llm_result + + return aggregate_results(plan.steps) +``` + +## Key Concerns + +| Concern | Strategy | +| ------------- | -------------------------------------- | +| Tool accuracy | Validate tool exists before invocation | +| Context bloat | SLM filters context before LLM | +| Cost | Route 70%+ through SLM tool selection | +| Reliability | Fallback to LLM on low confidence | + +## Tool Categories + +| Category | SLM Handles | LLM Handles | +| ------------ | ------------------ | ------------------ | +| CLI commands | selection + args | complex pipelines | +| API calls | endpoint selection | response parsing | +| File ops | path determination | content generation | +| Queries | query construction | result synthesis | + +## Metrics + +- Tool selection accuracy +- Context compression ratio +- LLM call reduction rate +- Average task latency +- Cost per agent task diff --git a/docs/architecture/ai-gateway.md b/docs/architecture/ai-gateway.md new file mode 100644 index 0000000..82c3df6 --- /dev/null +++ b/docs/architecture/ai-gateway.md @@ -0,0 +1,125 @@ +# AI Gateway + +AI Gateway sits between applications and multiple AI providers. Its main responsibilities include request routing, guardrails, caching, cost control, model selection, and telemetry tagging. 
+ +## Architecture + +``` +Client + │ + ▼ +AI Gateway + │ + ├─ SLM: request classification + ├─ SLM: security scan + ├─ SLM: cost prediction + │ + ▼ +Routing Decision + ├─ Small model + ├─ Tool call + └─ Large model escalation +``` + +## SLM Use Cases + +### 1. Request Classification + +Determine the intent of a prompt before routing. + +**Example tasks:** + +- code generation +- analysis +- summarization +- conversational +- tool execution + +**SLM outputs structured routing signals:** + +```json +{ + "intent": "code_generation", + "complexity": "medium", + "security_risk": "low", + "suggested_model": "gpt-large" +} +``` + +This prevents sending every request to expensive models. + +### 2. Prompt Sanitization / Policy Checks + +SLM performs quick checks: + +- prompt injection +- policy violations +- secrets exposure +- PII detection + +This happens before any expensive inference. + +### 3. Cost-aware Model Routing + +SLM predicts complexity: + +- low complexity → small model +- medium → mid-tier +- high → large reasoning model + +## Implementation + +### Routing Logic + +```python +async def route_request(request: str) -> RoutingDecision: + # Use SLM for classification + classification = await slm_classify(request) + + if classification.confidence > 0.8: + return await route_by_intent(classification.intent) + else: + return await escalate_to_llm(request) +``` + +### Policy Check Pipeline + +```python +async def security_scan(prompt: str) -> SecurityResult: + checks = await asyncio.gather( + slm_check_injection(prompt), + slm_check_pii(prompt), + slm_check_secrets(prompt) + ) + + if any(checks.flagged for checks in checks): + return SecurityResult(blocked=True, reason=checks) + + return SecurityResult(allowed=True) +``` + +## Key Concerns + +| Concern | Strategy | +| -------- | -------------------------------------------- | +| Latency | SLM runs inline; must respond in <50ms | +| Accuracy | Cascade: low confidence → LLM verification | +| Cost | Route 80%+ to SLMs; 
LLM only for escalation | +| Security | SLM policy check before any model invocation | + +## SLM Model Selection + +Recommended models for gateway classification: + +- Phi-3 Mini (3.8B) - fast, accurate +- Llama 3 8B - good general classification +- Gemma 2B - minimal latency + +## Metrics + +Track per routing decision: + +- SLM vs LLM routing ratio +- Average latency by route type +- Escalation rate (SLM → LLM) +- Cost per 1K requests diff --git a/docs/architecture/codeflow-engine.md b/docs/architecture/codeflow-engine.md new file mode 100644 index 0000000..81dd76f --- /dev/null +++ b/docs/architecture/codeflow-engine.md @@ -0,0 +1,144 @@ +# CodeFlow Engine + +CodeFlow Engine is a DevOps and CI/CD intelligence system. SLMs are extremely efficient for many CI tasks. + +## Architecture + +``` +Git Push + │ + ▼ +CodeFlow Engine + │ + ├─ SLM: commit classification + ├─ SLM: risk analysis + ├─ SLM: CI log analysis + │ + ▼ +CI/CD Actions + ├─ auto approve + ├─ run full tests + └─ escalate review +``` + +## SLM Use Cases + +### 1. Pull Request Classification + +SLM categorizes PRs: + +```json +{ + "type": "documentation", + "risk": "low", + "tests_required": false, + "reviewers_needed": 1 +} +``` + +### 2. Commit Message Analysis + +SLM determines: + +- semantic change type +- breaking change risk +- release notes impact + +### 3. CI Failure Diagnosis + +SLM reads build logs and classifies failures. 
+ +**Example output:** + +```json +{ + "failure_type": "dependency_error", + "likely_cause": "missing npm package", + "suggested_fix": "npm install", + "severity": "medium" +} +``` + +## Implementation + +### PR Classification + +```python +async def classify_pr(pr_diff: str, pr_description: str) -> PRClassification: + prompt = f"""Classify this PR: + +Diff: {pr_diff[:2000]} +Description: {pr_description} + +Output JSON with: type, risk_level, tests_required, reviewers_needed""" + + result = await slm_completion(prompt) + return PRClassification.parse_json(result) +``` + +### Commit Analysis + +```python +async def analyze_commit(commit: Commit) -> CommitAnalysis: + prompt = f"""Analyze this commit: + +Message: {commit.message} +Files: {commit.changed_files} + +Determine: breaking_change_risk, release_note_needed, impact_area""" + + return await slm_completion(prompt) +``` + +### CI Log Diagnosis + +```python +async def diagnose_failure(build_log: str) -> Diagnosis: + prompt = f"""Diagnose this CI failure: + +Log (last 5000 chars): +{build_log[-5000:]} + +Output: failure_type, likely_cause, suggested_fix""" + + return await slm_completion(prompt) +``` + +### Auto-Classification Rules + +```python +# Map SLM output to actions +CLASSIFICATION_ACTIONS = { + ("docs", "low"): {"auto_merge": True, "ci_skip": True}, + ("feat", "low"): {"auto_merge": False, "ci_full": True}, + ("fix", "medium"): {"auto_merge": False, "ci_full": True, "security_review": True}, + ("refactor", "low"): {"auto_merge": True, "ci_minimal": True}, +} +``` + +## Key Concerns + +| Concern | Strategy | +| -------- | ----------------------------------------------- | +| Speed | SLM must complete in <2s | +| Accuracy | Validate against rules; escalate on uncertainty | +| Cost | Batch processing; SLM only for classification | +| Coverage | Handle all common CI scenarios | + +## Classification Types + +| Change Type | SLM Output | CI Action | +| ------------- | ------------ | ---------------- | +| 
documentation | risk: low | skip tests | +| bugfix | risk: medium | run tests | +| refactor | risk: low | run tests | +| security | risk: high | full review | +| breaking | risk: high | require approval | + +## Metrics + +- Classification accuracy +- Auto-merge success rate +- Mean time to diagnosis +- Cost per PR processed +- False positive rate on security flags diff --git a/docs/architecture/cognitive-mesh.md b/docs/architecture/cognitive-mesh.md new file mode 100644 index 0000000..02a9e45 --- /dev/null +++ b/docs/architecture/cognitive-mesh.md @@ -0,0 +1,138 @@ +# Cognitive Mesh + +Cognitive Mesh architectures orchestrate multiple AI agents and tools. The biggest challenge is orchestration intelligence—SLMs are ideal for the coordination layer. + +## Architecture + +``` +User Query + │ + ▼ +SLM Router + │ + ├─ Code Agent + ├─ Infra Agent + ├─ Security Agent + └─ Research Agent + │ + ▼ + Specialist Work + │ + ▼ + LLM (only when required) +``` + +## SLM Use Cases + +### 1. Agent Router + +Determine which specialist agent should handle a request. + +**Example agents:** + +- code agent +- research agent +- infrastructure agent +- financial agent +- security agent + +SLM acts as a deterministic routing layer. + +### 2. Task Decomposition + +SLM splits requests into tasks: + +**Example:** + +User request: "Analyze this repo and generate a deployment plan." + +SLM decomposition: + +1. repository structure analysis +2. dependency inventory +3. infrastructure detection +4. deployment strategy generation + +Only the final step may require a large model. + +### 3. Agent Health Monitoring + +SLMs analyze: + +- agent logs +- task failure messages +- retry signals + +They detect issues early without invoking large models. 
+ +## Implementation + +### Agent Selection + +```python +async def select_agent(user_request: str) -> Agent: + # SLM determines best agent + classification = await slm_classify_intent(user_request) + + agent_map = { + "code": CodeAgent, + "infrastructure": InfraAgent, + "security": SecurityAgent, + "research": ResearchAgent, + } + + return agent_map[classification.intent] +``` + +### Task Decomposition + +```python +async def decompose_task(request: str) -> TaskPlan: + # SLM breaks down into subtasks + decomposition = await slm_decompose(request) + + return TaskPlan( + subtasks=decomposition.steps, + dependencies=decomposition.dependencies, + llm_required_at_step=decomposition.final_step_only + ) +``` + +### Health Check + +```python +async def check_agent_health(agent_logs: list[str]) -> HealthReport: + # SLM analyzes logs for issues + analysis = await slm_analyze_logs(agent_logs) + + return HealthReport( + status=analysis.health_status, + issues=analysis.issues, + recommendations=analysis.recommendations + ) +``` + +## Key Concerns + +| Concern | Strategy | +| ------------------ | ----------------------------------------- | +| Routing accuracy | Validate against known agent capabilities | +| Task complexity | SLM estimates; LLM confirms if wrong | +| Agent coordination | SLM manages task queue and dependencies | +| Failure detection | SLM monitors logs; LLM only for recovery | + +## Agent Capabilities Matrix + +| Agent | SLM Handles | LLM Required For | +| -------- | ------------------------------ | ------------------- | +| Code | file operations, git commands | complex refactoring | +| Infra | terraform plans, status checks | architecture design | +| Security | vulnerability scanning | threat analysis | +| Research | information retrieval | synthesis | + +## Metrics + +- Routing accuracy by agent type +- Task decomposition quality (steps correct) +- Agent utilization ratio +- LLM escalation rate per agent diff --git a/docs/architecture/cross-system.md 
b/docs/architecture/cross-system.md new file mode 100644 index 0000000..e6b6f54 --- /dev/null +++ b/docs/architecture/cross-system.md @@ -0,0 +1,115 @@ +# Cross-System Architecture + +These systems together form a layered architecture. + +``` + User / Operator + │ + ▼ + AI Gateway + │ + (SLM Routing Layer) + │ + ┌───────────────┼────────────────┐ + │ │ │ + ▼ ▼ ▼ + Cognitive Mesh CodeFlow Engine AgentKit Forge + │ │ │ + │ │ │ + └───────────────┼────────────────┘ + │ + ▼ + Large Model Layer + │ + ▼ + PhoenixRooivalk Edge + (SLM Edge AI) +``` + +## Layer Responsibilities + +### Layer 1: Edge (PhoenixRooivalk) + +- Local inference only +- No cloud dependency +- Immediate threat response +- Minimal latency + +### Layer 2: Gateway (AI Gateway) + +- First request touchpoint +- Security policy enforcement +- Cost routing decisions +- Telemetry tagging + +### Layer 3: Orchestration (Cognitive Mesh, AgentKit Forge) + +- Multi-agent coordination +- Task decomposition +- Tool selection +- LLM escalation + +### Layer 4: Intelligence (CodeFlow Engine) + +- CI/CD automation +- Log analysis +- Commit classification +- Release management + +### Layer 5: Large Model Layer + +- Complex reasoning +- Creative generation +- Deep analysis +- Final synthesis + +## Data Flow + +``` +Edge Event (Rooivalk) + │ + ▼ Classify locally +Report + │ + ▼ Route via Gateway +AI Gateway + │ + ├─→ Route to Cognitive Mesh (agent task) + ├─→ Route to CodeFlow (CI task) + └─→ Route to AgentKit (tool task) + │ + ▼ + SLM Selection + │ + ┌────┼────┐ + │ │ │ + ▼ ▼ ▼ + Tool LLM Cache + │ + ▼ + Result + Telemetry + │ + ▼ + Cost Attribution +``` + +## Why SLMs Matter + +Across all five platforms, SLMs provide: + +| Benefit | Description | +| ---------------------- | --------------------------------------- | +| Cost Control | Large models invoked only when required | +| Latency Reduction | Routing decisions in milliseconds | +| Edge Deployment | PhoenixRooivalk inference locally | +| Deterministic Behavior | 
Easier to constrain and audit | + +## Summary + +| System | SLM Role | +| --------------- | --------------------------------------- | +| AI Gateway | routing, policy checks, cost prediction | +| Cognitive Mesh | agent routing, task decomposition | +| PhoenixRooivalk | edge telemetry analysis | +| CodeFlow Engine | CI intelligence, log analysis | +| AgentKit Forge | tool selection, context compression | diff --git a/docs/architecture/phoenix-rooivalk.md b/docs/architecture/phoenix-rooivalk.md new file mode 100644 index 0000000..627374a --- /dev/null +++ b/docs/architecture/phoenix-rooivalk.md @@ -0,0 +1,144 @@ +# PhoenixRooivalk + +PhoenixRooivalk is an edge AI counter-UAS (Unmanned Aerial System) system. Key constraints: + +- compute must run locally +- latency must be extremely low +- connectivity cannot be assumed + +## Architecture + +``` +Sensors + │ + ▼ +Telemetry Pipeline + │ + ▼ +SLM Edge Processor + │ + ├─ event classification + ├─ threat summarization + └─ anomaly detection + │ + ▼ +Operator Console +``` + +## SLM Use Cases + +### 1. Telemetry Interpretation + +Drones produce large telemetry streams: + +- radar +- RF signatures +- flight patterns +- behavior anomalies + +SLM interprets events: + +```json +{ + "pattern": "loitering", + "classification": "suspicious", + "confidence": 0.74 +} +``` + +### 2. Threat Report Summarization + +Edge device converts raw telemetry into operator reports. + +**Example:** + +Raw data → SLM summary: + +> Drone detected approaching perimeter at 35m altitude, RF signature consistent with consumer quadcopter. + +### 3. Mission Log Structuring + +SLM converts unstructured logs into structured intelligence records. 
+ +## Implementation + +### Edge Processing Pipeline + +```python +class EdgeProcessor: + def __init__(self): + self.slm = load_local_slm() # Gemma or Phi-3 + + async def process_telemetry(self, raw_stream: bytes) -> ProcessedEvent: + # Parse telemetry + telemetry = self.parse(raw_stream) + + # SLM classification + classification = await self.slm.classify(telemetry) + + # Generate summary if threat detected + if classification.threat_level > THRESHOLD: + summary = await self.slm.summarize(telemetry) + + return ProcessedEvent( + classification=classification, + summary=summary, + timestamp=datetime.utcnow() + ) +``` + +### Local Inference + +```python +# Run on edge device (Jetson Nano / edge GPU) +async def run_local_inference(telemetry_data): + # No cloud call - all local + model = SLMModel("phi-3-mini-4k") + + result = await model.run( + input=telemetry_data, + device="cuda", # or "cpu" for minimal hardware + batch_size=1 + ) + + return result +``` + +## Key Concerns + +| Concern | Strategy | +| -------------------- | --------------------------------------------- | +| Hardware constraints | Optimize SLM for edge (quantization, pruning) | +| Latency | Must process in <100ms | +| Reliability | Offline-first; queue for later sync | +| Security | No external connectivity required | + +## Hardware Options + +| Device | SLM Capability | Notes | +| ----------- | ----------------- | ----------------------- | +| Jetson Nano | Phi-3 Mini (int4) | ~5ms inference | +| Jetson Orin | Phi-3 Mini (fp16) | Real-time processing | +| Edge CPU | Gemma 2B | Offline fallback | +| Mobile SoC | Phi-3 Mini (int4) | Phone/tablet deployment | + +## Model Optimization + +```python +# Quantize for edge deployment +from optimum.quanto import quantize + +model = quantize( + original_model, + weights=quantization_type.q4, + activations=quantization_type.q8 +) +``` + +## Metrics + +- Processing latency (target: <50ms p99) +- Classification accuracy vs cloud baseline +- Offline operation time +- 
Memory footprint +- Threat detection rate diff --git a/docs/architecture/slm-management-plan.md b/docs/architecture/slm-management-plan.md new file mode 100644 index 0000000..15054b9 --- /dev/null +++ b/docs/architecture/slm-management-plan.md @@ -0,0 +1,253 @@ +# SLM Management Plan + +This document outlines the key concerns and management strategy for SLM deployment across all projects. + +## Key Concerns Overview + +| Concern | Priority | Projects Affected | +| -------------------- | -------- | ----------------------- | +| Model Selection | High | All | +| Cost Management | High | All | +| Latency Requirements | High | Gateway, Rooivalk | +| Edge Deployment | High | Rooivalk | +| Security & Privacy | High | Gateway, Cognitive Mesh | +| Reliability | Medium | All | +| Observability | Medium | All | +| Versioning | Medium | All | + +## 1. Model Selection + +### Strategy + +Maintain a tiered model portfolio: + +| Tier | Models | Use Cases | Cost | +| ----------- | -------------------- | ------------------------------ | --------------- | +| Ultra-light | Phi-3 Mini, Gemma 2B | Classification, routing | $0.0001/request | +| Light | Phi-3, Llama 3 8B | Tool selection, log analysis | $0.001/request | +| Medium | Llama 3 70B | Complex routing, decomposition | $0.01/request | +| Heavy | GPT-4 class | Reasoning, synthesis | $0.05+/request | + +### Management + +- **Central model registry** with capability matrix +- **A/B testing framework** for model comparisons +- **Performance benchmarks** per use case category + +## 2. Cost Management + +### Strategy + +Implement cost controls at each layer: + +``` +Cost Control Layers +┌─────────────────────────────────────┐ +│ 1. Budget caps per project │ +├─────────────────────────────────────┤ +│ 2. SLM-first routing (80%+ target) │ +├─────────────────────────────────────┤ +│ 3. Confidence-based escalation │ +├─────────────────────────────────────┤ +│ 4. Request caching │ +├─────────────────────────────────────┤ +│ 5. 
Telemetry & alerting │ +└─────────────────────────────────────┘ +``` + +### Metrics + +| Metric | Target | +| -------------------- | ------ | +| SLM routing % | >80% | +| Cost per 1K requests | <$5 | +| LLM escalation rate | <20% | +| Cache hit rate | >30% | + +### Alerts + +- Cost spike >20% day-over-day +- LLM escalation >25% +- Budget utilization >80% + +## 3. Latency Requirements + +### Targets by Project + +| Project | Target P99 | Critical Path | +| --------------- | ---------- | --------------------- | +| AI Gateway | <100ms | routing decision | +| PhoenixRooivalk | <50ms | threat classification | +| CodeFlow | <2s | PR classification | +| Cognitive Mesh | <500ms | agent selection | +| AgentKit Forge | <1s | tool selection | + +### Optimization + +- **Model quantization** for edge (int4) +- **Caching** of frequent decisions +- **Batch processing** for non-critical tasks +- **Connection pooling** to inference endpoints + +## 4. Edge Deployment (PhoenixRooivalk) + +### Strategy + +| Requirement | Solution | +| ------------------ | ------------------------------- | +| Hardware diversity | Support Jetson, CPU, mobile | +| Offline operation | Full local inference capability | +| Model updates | OTA with rollback | +| Security | No external connectivity | + +### Model Optimization + +```python +# Standard edge optimization pipeline +optimizations = [ + quantization(weights="int4"), + pruning(structured=0.3), + distillation(student=phi3_mini), + compilation(target="cuda|cpu") +] +``` + +## 5. Security & Privacy + +### Strategy + +| Layer | Controls | +| ---------- | ----------------------------------------- | +| Input | Prompt injection detection, PII filtering | +| Processing | No data leaves boundary | +| Output | Content filtering, audit logging | +| Access | Role-based model access | + +### SLM Security Checks + +```python +async def security_pipeline(request: Request) -> SecurityResult: + # 1. 
Prompt injection check + injection = await slm_check_injection(request.prompt) + if injection.detected: + return blocked(injection.reason) + + # 2. PII detection + pii = await slm_check_pii(request.prompt) + if pii.found: + return blocked("PII detected") + + # 3. Policy check + policy = await slm_check_policy(request.prompt) + if policy.violation: + return blocked(policy.violation) + + return allowed() +``` + +## 6. Reliability + +### Strategy + +| Concern | Mitigation | +| ------------------- | ------------------------ | +| Model downtime | Fallback models per tier | +| Latency spikes | Timeout + escalation | +| Quality degradation | Continuous evaluation | +| Hallucinations | Confidence thresholds | + +### Fallback Hierarchy + +``` +Request + │ + ▼ Primary SLM + │ + ├─ Success → Return + │ + ├─ Timeout → Fallback SLM + │ + ├─ Low confidence → LLM verification + │ + └─ Failure → Error with telemetry +``` + +## 7. Observability + +### Metrics Collection + +| Metric Type | Collection | +| -------------- | -------------------------- | +| Request volume | Per model, per project | +| Latency | P50, P95, P99 per endpoint | +| Error rate | By error type, model | +| Cost | Per project, per user | +| Quality | Accuracy, escalation rate | + +### Dashboards + +- **Cost Dashboard**: Spend by project, model, day +- **Performance Dashboard**: Latency by tier +- **Quality Dashboard**: Accuracy, false positives + +## 8. 
Versioning + +### Strategy + +| Component | Versioning | Update Frequency | +| -------------- | ---------------- | ------------------ | +| Models | Semantic (1.0.0) | Monthly evaluation | +| Prompts | Git-based | Per task | +| Infrastructure | Terraform | Per deployment | + +### Model Lifecycle + +``` +Discovery → Testing → Staging → Production → Deprecated → Retired + │ │ │ │ │ + ▼ ▼ ▼ ▼ ▼ + Evaluate A/B test Shadow mode Active Fallback +``` + +## Project-Specific Concerns + +### AI Gateway + +- High-volume routing +- Security-first evaluation +- Real-time cost tracking + +### Cognitive Mesh + +- Agent capability mapping +- Task decomposition accuracy +- Multi-agent coordination + +### PhoenixRooivalk + +- Edge hardware diversity +- Offline reliability +- Minimal latency + +### CodeFlow Engine + +- PR classification accuracy +- CI log analysis quality +- Auto-merge reliability + +### AgentKit Forge + +- Tool selection accuracy +- Context compression ratio +- LLM call reduction + +## Action Items + +1. [ ] Establish model registry with tiered selection +2. [ ] Implement cost tracking per project +3. [ ] Set up latency monitoring dashboards +4. [ ] Create edge deployment pipeline +5. [ ] Build security check pipeline +6. [ ] Define fallback hierarchies +7. [ ] Implement observability stack +8. [ ] Document model lifecycle process diff --git a/docs/guides/README.md b/docs/guides/README.md index aa5ca3b..3fd3038 100644 --- a/docs/guides/README.md +++ b/docs/guides/README.md @@ -1,158 +1,15 @@ -# SLM Implementation Guide +# Guides -## When to Use an SLM +Implementation guides for various topics. 
-SLMs are appropriate when: +## SLM Implementation -- the task is structured -- the domain is narrow -- latency matters -- cost must be minimized -- inference must run locally +- [README](README.md) - When to use SLMs vs LLMs, implementation patterns -### Typical Examples +## Coming Soon -- CI/CD pipelines -- log classification -- code lint summarization -- telemetry tagging -- router agents -- RAG query classification - -## When to Use a Large Model - -Use a full LLM when: - -- deep reasoning is required -- complex architecture planning -- research tasks -- multi-step problem solving -- creative generation -- ambiguous user requests - -## Typical Use Cases - -### 1. Edge and Embedded AI - -SLMs are commonly used on-device. - -Examples: - -- mobile assistants -- IoT systems -- robotics -- drones -- offline copilots - -Advantages: - -- no cloud latency -- privacy (data never leaves device) -- deterministic cost - -This aligns with architectures where inference must run on Jetson, mobile chips, or CPUs. - -### 2. Specialized Task Models - -SLMs perform well when the task domain is narrow. - -Examples: - -- classification -- log analysis -- document tagging -- code lint explanation -- schema validation -- chatbot for a specific knowledge base - -In many cases an SLM + RAG outperforms a large model with no context. - -### 3. Agent Systems and Routing - -SLMs are often used as cheap first-pass models. - -Typical pattern: - -``` -User request - ↓ -Router (SLM) - ↓ -Decision - ├─ handle locally - ├─ call tool - └─ escalate to large model -``` - -Benefits: - -- large model usage drops significantly -- lower operational cost -- deterministic routing - -### 4. High-Throughput Batch Processing - -SLMs are useful for: - -- codebase analysis -- repository indexing -- log classification -- telemetry tagging -- document chunk summarization - -When processing millions of documents, the cost difference is substantial. 
- -## Implementation Patterns - -### Router Pattern Implementation - -```python -async def route_request(request: str) -> Response: - # Use SLM for classification/routing - intent = await slm_classify(request) - - if intent == "simple": - return await handle_locally(request) - elif intent == "tool": - return await call_tool(request) - else: - # Escalate to full LLM - return await llm_complete(request) -``` - -### Cascade Pattern Implementation - -```python -async def cascade(request: str) -> Response: - # First try SLM - result = await slm_complete(request) - - # Check confidence - if result.confidence > 0.85: - return result - - # Escalate to LLM for low confidence - return await llm_complete(request) -``` - -### Local-First Pattern - -``` -Device - ├─ SLM (local inference) - ├─ embeddings (local compute) - └─ local vector store (SQLite/Chroma) -``` - -Cloud models only used when local SLM cannot handle the request. - -## Cost Optimization Example - -For a system processing 1M requests/day: - -| Model | Cost/request | Daily cost | Monthly cost | -| ---------- | ------------ | ---------- | ------------ | -| SLM (7B) | $0.001 | $1,000 | $30,000 | -| LLM (175B) | $0.05 | $50,000 | $1,500,000 | - -**Potential savings: 98%** +- AI Gateway deployment guide +- Cognitive Mesh setup guide +- Edge deployment guide (PhoenixRooivalk) +- CodeFlow Engine integration +- AgentKit Forge quickstart From ef8619a113e18d668409cb97f4aa6f36e1ad7440 Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 08:14:16 +0200 Subject: [PATCH 09/11] docs: update architecture docs with detailed SLM implementation patterns - ai-gateway.md: SLM as admission control and routing brain - cognitive-mesh.md: SLM as router, decompressor, state compressor - codeflow-engine.md: SLM as CI/CD triage - agentkit-forge.md: SLM as tool selector and execution governor - phoenix-rooivalk.md: SLM for reports only, NOT safety-critical - cross-system.md: Combined architecture with tiered model strategy - 
slm-management-plan.md: Updated with canonical principles - guides/README.md: Updated with practical patterns --- docs/architecture/README.md | 45 ++++-- docs/architecture/agentkit-forge.md | 126 ++++++++++------- docs/architecture/ai-gateway.md | 133 ++++++++++-------- docs/architecture/codeflow-engine.md | 153 +++++++++++--------- docs/architecture/cognitive-mesh.md | 141 ++++++++++++------- docs/architecture/cross-system.md | 120 ++++++++++------ docs/architecture/phoenix-rooivalk.md | 170 ++++++++++++++--------- docs/architecture/slm-management-plan.md | 23 ++- docs/guides/README.md | 12 +- 9 files changed, 578 insertions(+), 345 deletions(-) diff --git a/docs/architecture/README.md b/docs/architecture/README.md index d3f37c2..f3032d3 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -9,6 +9,11 @@ The architecture follows a layered approach combining: - **SLMs (Small Language Models)** for cost-effective routing, classification, and tool selection - **LLMs** for complex reasoning and final synthesis +### Canonical Principle + +> **Use SLMs to decide, filter, classify, compress, and prepare.** +> **Use LLMs to reason, reconcile, synthesize, and communicate.** + ## Documentation ### Core Concepts @@ -18,11 +23,11 @@ The architecture follows a layered approach combining: ### Project-Specific -- [ai-gateway.md](ai-gateway.md) - AI Gateway architecture -- [cognitive-mesh.md](cognitive-mesh.md) - Agent orchestration -- [phoenix-rooivalk.md](phoenix-rooivalk.md) - Edge AI system -- [codeflow-engine.md](codeflow-engine.md) - CI/CD intelligence -- [agentkit-forge.md](agentkit-forge.md) - Agent building framework +- [ai-gateway.md](ai-gateway.md) - AI Gateway: SLM as admission control & routing +- [cognitive-mesh.md](cognitive-mesh.md) - Agent orchestration: routing, decomposition +- [phoenix-rooivalk.md](phoenix-rooivalk.md) - Edge AI: SLM for reports only (NOT control) +- [codeflow-engine.md](codeflow-engine.md) - CI/CD intelligence: PR 
triage, log analysis +- [agentkit-forge.md](agentkit-forge.md) - Agent building: tool selection, context compression ### Planning @@ -30,10 +35,26 @@ The architecture follows a layered approach combining: ## Quick Reference -| System | SLM Role | Key Document | -| --------------- | ----------------------------------- | ------------------- | -| AI Gateway | routing, policy checks | ai-gateway.md | -| Cognitive Mesh | agent routing, task decomposition | cognitive-mesh.md | -| PhoenixRooivalk | edge telemetry analysis | phoenix-rooivalk.md | -| CodeFlow Engine | CI intelligence, log analysis | codeflow-engine.md | -| AgentKit Forge | tool selection, context compression | agentkit-forge.md | +| System | SLM Role | Key Document | +| --------------- | --------------------------------------- | ------------------- | +| AI Gateway | routing, policy checks, cost prediction | ai-gateway.md | +| Cognitive Mesh | agent routing, task decomposition | cognitive-mesh.md | +| PhoenixRooivalk | **operator summaries only** | phoenix-rooivalk.md | +| CodeFlow Engine | CI intelligence, log analysis | codeflow-engine.md | +| AgentKit Forge | tool selection, context compression | agentkit-forge.md | + +## Implementation Order + +1. **AI Gateway SLM router** — Highest immediate cost-leverage +2. **CodeFlow Engine CI/PR classifier** — Fastest operational value +3. **Cognitive Mesh decomposer/router** — Strong leverage once taxonomy stabilizes +4. **AgentKit Forge tool selector** — Useful once tool inventory is mature +5. 
**PhoenixRooivalk operator interpreter** — Valuable, keep isolated from critical control + +## Tiered Model Strategy + +| Tier | Use For | Examples | +| ------ | --------------------- | --------------------------------------------- | +| Tier 0 | deterministic/non-LLM | regex, schemas, policies | +| Tier 1 | SLM | classification, decomposition, tool selection | +| Tier 2 | LLM | synthesis, complex reasoning | diff --git a/docs/architecture/agentkit-forge.md b/docs/architecture/agentkit-forge.md index 1b93cab..c84a182 100644 --- a/docs/architecture/agentkit-forge.md +++ b/docs/architecture/agentkit-forge.md @@ -1,36 +1,33 @@ # AgentKit Forge -AgentKit Forge builds AI agents and orchestration workflows. SLMs help build scalable multi-agent systems. +AgentKit Forge builds AI agents and orchestration workflows. SLMs help when agents have **many tools** and **large working memory**. ## Architecture ``` Agent Task - │ - ▼ -SLM Tool Selector - │ - ├─ GitHub API - ├─ Azure CLI - ├─ Terraform - └─ Documentation Search + │ + ▼ +┌─────────────────────────────────────┐ +│ SLM Execution Governor │ +│ (tool selection, memory, budget) │ +└─────────────────────────────────────┘ + │ + ▼ +Tool Selection + │ + ├─→ GitHub API + ├─→ Azure CLI + ├─→ Terraform + ├─→ Documentation Search + └─→ LLM Synthesis ``` -## SLM Use Cases +## Most Practical SLM Jobs -### 1. Tool Selection +### 1. Tool Selector -Agent decides which tool to invoke. - -**Example tools:** - -- GitHub API -- Azure CLI -- Terraform -- Kusto queries -- File system operations - -**SLM output:** +Map user or system request to the right tool. ```json { @@ -44,37 +41,41 @@ Agent decides which tool to invoke. } ``` -### 2. Context Compression +### 2. Relevance Filter -Agents accumulate long conversation histories. - -SLM compresses them: +Only send necessary state to expensive models. 
```json { - "previous_state_summary": "User requested Azure cost analysis", - "relevant_files": ["infra/main.tf", "infra/outputs.tf"], - "active_task": "generating cost breakdown" + "relevant_context": ["terraform_plan", "error_logs"], + "pruned_context": ["old_successful_deploys", "unrelated_metrics"], + "estimated_tokens": 3500 } ``` -### 3. Token Budget Control +### 3. Budget Governor -SLM predicts which context segments are needed before invoking a large model. +Estimate likely token spend and whether tool-first is sufficient. -```python -# Before calling expensive LLM -context_plan = await slm_plan_context( - task="analyze deployment", - available_context=["git_diff", "terraform_plan", "logs", "metrics"] -) +```json +{ + "estimated_tokens": 8000, + "can_fit_in_window": true, + "should_use_tool_first": true, + "budget_tier": "medium" +} +``` -# Returns: +### 4. Execution Classifier + +Distinguish how to handle the request. + +```json { - "required_context": ["terraform_plan"], - "optional_context": ["logs"], - "estimated_tokens": 8000, - "can_fit_in_window": true + "action": "use_tool", + "tool_name": "github_api", + "escalate_to_llm": false, + "reason": "simple data retrieval" } ``` @@ -117,7 +118,26 @@ Output: required_context, optional_context, estimated_tokens""" return await slm_completion(prompt) ``` -### Multi-Step Reasoning +### Budget Governor + +```python +async def govern_budget(task: str) -> BudgetDecision: + prompt = f"""Estimate token budget for this task. 
+ +Task: {task} + +Consider: context size, expected output, complexity""" + + estimate = await slm_completion(prompt) + + return BudgetDecision( + estimated_tokens=estimate.tokens, + can_fit=estimate.can_fit, + should_escalate=estimate.should_escalate + ) +``` + +### Multi-Step Execution ```python async def execute_agent_task(task: str) -> AgentResult: @@ -137,6 +157,14 @@ async def execute_agent_task(task: str) -> AgentResult: return aggregate_results(plan.steps) ``` +## Tradeoffs + +| Pros | Cons | +| ----------------------------------- | ---------------------------------------------- | +| Keeps agent execution lean | Weak tool selection harms trust | +| Lowers token burn dramatically | Compressed memory can omit critical edge cases | +| Improves tool invocation discipline | Too much reliance can make agents look shallow | + ## Key Concerns | Concern | Strategy | @@ -155,10 +183,10 @@ async def execute_agent_task(task: str) -> AgentResult: | File ops | path determination | content generation | | Queries | query construction | result synthesis | -## Metrics +## Implementation Checklist -- Tool selection accuracy -- Context compression ratio -- LLM call reduction rate -- Average task latency -- Cost per agent task +- [ ] Implement tool selection with confidence scores +- [ ] Add relevance filtering for context +- [ ] Implement budget governor with token estimation +- [ ] Add execution classification (direct/tool/LLM) +- [ ] Set up fallback to LLM on low confidence diff --git a/docs/architecture/ai-gateway.md b/docs/architecture/ai-gateway.md index 82c3df6..5a288f0 100644 --- a/docs/architecture/ai-gateway.md +++ b/docs/architecture/ai-gateway.md @@ -1,85 +1,89 @@ # AI Gateway -AI Gateway sits between applications and multiple AI providers. Its main responsibilities include request routing, guardrails, caching, cost control, model selection, and telemetry tagging. +AI Gateway sits between applications and multiple AI providers. 
The SLM acts as the **admission control and routing brain** — the fast, cheap, deterministic control layer before expensive model invocation. ## Architecture ``` -Client - │ - ▼ -AI Gateway - │ - ├─ SLM: request classification - ├─ SLM: security scan - ├─ SLM: cost prediction - │ - ▼ +Client Request + │ + ▼ +┌─────────────────────────────────────┐ +│ SLM Control Layer │ +│ (intent, complexity, risk, tools) │ +└─────────────────────────────────────┘ + │ + ▼ Routing Decision - ├─ Small model - ├─ Tool call - └─ Large model escalation + │ + ├─→ Cache (if cacheable) + ├─→ Tool call + ├─→ SLM response + ├─→ Small model + └─→ Large model escalation ``` -## SLM Use Cases +## SLM as Admission Control -### 1. Request Classification +The SLM sits **before** expensive model invocation and sometimes **after** provider response for tagging/telemetry normalization. -Determine the intent of a prompt before routing. +### Best SLM Use Cases -**Example tasks:** +| Use Case | Description | Output Schema | +| ------------------------ | ------------------------------ | ---------------------------------- | +| Intent Classification | Determine user intent | `{ "intent": "code_review", ... }` | +| Complexity Scoring | Rate request complexity | `{ "complexity": "medium", ... }` | +| Tool Eligibility | Detect if tool call needed | `{ "tool_candidate": true, ... }` | +| Safety Prefiltering | Prompt injection, PII, secrets | `{ "risk": "low", ... }` | +| Cache Key Enrichment | Generate cache keys | `{ "cacheable": false, ... }` | +| Telemetry Categorization | Tag for observability | `{ "category": "analysis", ... }` | +| Tenant Policy Gating | Per-tenant routing rules | `{ "tier": "premium", ... 
}` | -- code generation -- analysis -- summarization -- conversational -- tool execution +### Why This Works -**SLM outputs structured routing signals:** +These tasks are: + +- **Short-context** — SLM handles easily +- **Repetitive** — High cache hit potential +- **Structured** — Schema-bound outputs +- **Latency-sensitive** — SLM is fast + +### Good SLM Output ```json { - "intent": "code_generation", + "intent": "code_review", "complexity": "medium", - "security_risk": "low", - "suggested_model": "gpt-large" + "tool_candidate": true, + "risk": "low", + "cacheable": false, + "recommended_tier": "large" } ``` -This prevents sending every request to expensive models. - -### 2. Prompt Sanitization / Policy Checks - -SLM performs quick checks: - -- prompt injection -- policy violations -- secrets exposure -- PII detection - -This happens before any expensive inference. +## Implementation -### 3. Cost-aware Model Routing +### Routing Logic -SLM predicts complexity: +```python +async def gateway_admission(request: Request) -> AdmissionDecision: + # SLM does admission control + classification = await slm_classify(request.prompt) -- low complexity → small model -- medium → mid-tier -- high → large reasoning model + # Route based on classification + if classification.cacheable: + cached = await check_cache(classification.cache_key) + if cached: + return CachedResponse(cached) -## Implementation + if classification.tool_candidate: + return await route_to_tools(classification) -### Routing Logic + if classification.complexity == "low": + return await route_to_slm(classification) -```python -async def route_request(request: str) -> RoutingDecision: - # Use SLM for classification - classification = await slm_classify(request) - - if classification.confidence > 0.8: - return await route_by_intent(classification.intent) - else: - return await escalate_to_llm(request) + # Escalate to LLM + return await route_to_llm(classification) ``` ### Policy Check Pipeline @@ -92,12 +96,21 @@ 
async def security_scan(prompt: str) -> SecurityResult: slm_check_secrets(prompt) ) - if any(checks.flagged for checks in checks): + if any(check.flagged for check in checks): return SecurityResult(blocked=True, reason=checks) return SecurityResult(allowed=True) ``` +## Tradeoffs + +| Pros | Cons | +| ------------------------------- | -------------------------------------------------- | +| Major cost reduction | Misrouting risk if classifier is weak | +| Consistent routing | Small models can under-detect subtle safety issues | +| Lower p95 latency | More moving parts in gateway logic | +| Easier telemetry and governance | | + ## Key Concerns | Concern | Strategy | @@ -123,3 +136,11 @@ Track per routing decision: - Average latency by route type - Escalation rate (SLM → LLM) - Cost per 1K requests + +## Implementation Checklist + +- [ ] Add SLM policy envelope returning intent, complexity, risk, cacheability, tier +- [ ] Implement cascade pattern for low confidence → LLM +- [ ] Add security prefiltering (injection, PII, secrets) +- [ ] Set up cost tracking per tier +- [ ] Configure latency alerts diff --git a/docs/architecture/codeflow-engine.md b/docs/architecture/codeflow-engine.md index 81dd76f..30f5dfd 100644 --- a/docs/architecture/codeflow-engine.md +++ b/docs/architecture/codeflow-engine.md @@ -1,64 +1,75 @@ # CodeFlow Engine -CodeFlow Engine is a DevOps and CI/CD intelligence system. SLMs are extremely efficient for many CI tasks. +CodeFlow Engine is a DevOps and CI/CD intelligence system. **This is one of the most natural SLM fits** — CI/CD emits lots of repetitive semi-structured text where SLMs excel. 
## Architecture ``` -Git Push - │ - ▼ -CodeFlow Engine - │ - ├─ SLM: commit classification - ├─ SLM: risk analysis - ├─ SLM: CI log analysis - │ - ▼ -CI/CD Actions - ├─ auto approve - ├─ run full tests - └─ escalate review +Git Push / PR Event + │ + ▼ +┌─────────────────────────────────────┐ +│ SLM Triage Layer │ +│ (classification, risk, pipeline) │ +└─────────────────────────────────────┘ + │ + ▼ +CI/CD Decision + │ + ├─→ Auto approve + ├─→ Run tests (full/minimal/skip) + ├─→ Security review + └─→ Escalate to LLM ``` -## SLM Use Cases +## Best SLM Use Cases -### 1. Pull Request Classification +| Use Case | Description | Example Output | +| ------------------------ | ------------------------- | ------------------------------------------------------------- | +| PR Classification | Categorize change type | `{ "type": "api_contract", "risk": "high" }` | +| Test Selection | Choose which tests to run | `{ "run_unit": true, "run_integration": false }` | +| Blast Radius | Estimate change impact | `{ "impacted": ["schemas", "api"], "risk": "medium" }` | +| Changelog Category | Generate release notes | `{ "category": "feature", "component": "gateway" }` | +| Build Log Classification | Diagnose failures | `{ "failure": "dependency_error", "fix": "npm install" }` | +| Flaky Test Grouping | Identify test patterns | `{ "flaky_group": "network_timed_out" }` | +| Issue Routing | Route to component owners | `{ "component": "infrastructure", "owner": "platform-team" }` | -SLM categorizes PRs: +## Example SLM Outputs + +### PR Classification ```json { - "type": "documentation", - "risk": "low", - "tests_required": false, - "reviewers_needed": 1 + "change_type": "api_contract", + "risk": "high", + "requires_full_ci": true, + "security_review": false, + "impacted_domains": ["schemas", "api"], + "suggested_reviewers": ["platform-team"] } ``` -### 2. Commit Message Analysis - -SLM determines: - -- semantic change type -- breaking change risk -- release notes impact - -### 3. 
CI Failure Diagnosis
-
-SLM reads build logs and classifies failures.
-
-**Example output:**
+### Failure Diagnosis
 
 ```json
 {
-  "failure_type": "dependency_error",
-  "likely_cause": "missing npm package",
-  "suggested_fix": "npm install",
-  "severity": "medium"
+  "failure_type": "dependency_resolution",
+  "retryable": false,
+  "likely_root_cause": "missing package lock update",
+  "suggested_action": "regenerate lock file and rerun"
 }
 ```
 
+## Why This Works
+
+CI/CD emits lots of repetitive semi-structured text:
+
+- Similar commit patterns
+- Recurring error types
+- Predictable change categories
+
+SLMs do very well at pattern recognition on this data.
+
 ## Implementation
 
 ### PR Classification
@@ -67,30 +78,30 @@ SLM reads build logs and classifies failures.
 async def classify_pr(pr_diff: str, pr_description: str) -> PRClassification:
     prompt = f"""Classify this PR:
 
-Diff: {pr_diff[:2000]}
+Diff (first 2000 chars): {pr_diff[:2000]}
 Description: {pr_description}
 
-Output JSON with: type, risk_level, tests_required, reviewers_needed"""
+Output JSON with: type, risk_level, tests_required, reviewers_needed, security_review"""
 
     result = await slm_completion(prompt)
     return PRClassification.parse_json(result)
 ```
 
-### Commit Analysis
+### Test Selection
 
 ```python
-async def analyze_commit(commit: Commit) -> CommitAnalysis:
-    prompt = f"""Analyze this commit:
+async def select_tests(change_type: str, impacted_files: list[str]) -> TestPlan:
+    prompt = f"""Select tests for this change:
 
-Message: {commit.message}
-Files: {commit.changed_files}
+Type: {change_type}
+Files: {', '.join(impacted_files)}
 
-Determine: breaking_change_risk, release_note_needed, impact_area"""
+Output: {{ "run_unit": bool, "run_integration": bool, "run_e2e": bool, "skip_reason": str|null }}"""
 
     return await slm_completion(prompt)
 ```
 
-### CI Log Diagnosis
+### Failure Diagnosis
 
 ```python
 async def diagnose_failure(build_log: str)
@@ -99,23 +110,31 @@ async def diagnose_failure(build_log: str)
-> Diagnosis: Log (last 5000 chars): {build_log[-5000:]} -Output: failure_type, likely_cause, suggested_fix""" +Output: failure_type, retryable, likely_root_cause, suggested_action""" return await slm_completion(prompt) ``` -### Auto-Classification Rules +### Auto-Rules Mapping ```python -# Map SLM output to actions CLASSIFICATION_ACTIONS = { ("docs", "low"): {"auto_merge": True, "ci_skip": True}, ("feat", "low"): {"auto_merge": False, "ci_full": True}, ("fix", "medium"): {"auto_merge": False, "ci_full": True, "security_review": True}, ("refactor", "low"): {"auto_merge": True, "ci_minimal": True}, + ("api_contract", "high"): {"auto_merge": False, "ci_full": True, "security_review": True}, } ``` +## Tradeoffs + +| Pros | Cons | +| ----------------------------------- | ------------------------------------------------- | +| Cheaper automated repo intelligence | Incorrect risk can under-test changes | +| Better developer feedback speed | Failure summarization may miss subtle root causes | +| Fewer wasted full-pipeline runs | Rules should never override hard safety gates | + ## Key Concerns | Concern | Strategy | @@ -127,18 +146,20 @@ CLASSIFICATION_ACTIONS = { ## Classification Types -| Change Type | SLM Output | CI Action | -| ------------- | ------------ | ---------------- | -| documentation | risk: low | skip tests | -| bugfix | risk: medium | run tests | -| refactor | risk: low | run tests | -| security | risk: high | full review | -| breaking | risk: high | require approval | - -## Metrics - -- Classification accuracy -- Auto-merge success rate -- Mean time to diagnosis -- Cost per PR processed -- False positive rate on security flags +| Change Type | SLM Output | CI Action | +| ------------- | ------------ | ------------------ | +| documentation | risk: low | skip tests | +| bugfix | risk: medium | run tests | +| refactor | risk: low | run tests | +| security | risk: high | full review | +| breaking | risk: high | require approval | +| api_contract | risk: high | 
full CI + security | + +## Implementation Checklist + +- [ ] Add PR classification with structured output +- [ ] Implement test selection hints +- [ ] Add blast radius estimation +- [ ] Implement failure diagnosis with suggested actions +- [ ] Set up changelog category generation +- [ ] Configure auto-merge rules diff --git a/docs/architecture/cognitive-mesh.md b/docs/architecture/cognitive-mesh.md index 02a9e45..d4f2c96 100644 --- a/docs/architecture/cognitive-mesh.md +++ b/docs/architecture/cognitive-mesh.md @@ -1,50 +1,54 @@ # Cognitive Mesh -Cognitive Mesh architectures orchestrate multiple AI agents and tools. The biggest challenge is orchestration intelligence—SLMs are ideal for the coordination layer. +Cognitive Mesh architectures orchestrate multiple AI agents and tools. The SLM is the **control fabric** that decides which specialist acts, whether decomposition is needed, what context is necessary, and when to escalate. ## Architecture ``` User Query - │ - ▼ -SLM Router - │ - ├─ Code Agent - ├─ Infra Agent - ├─ Security Agent - └─ Research Agent - │ - ▼ - Specialist Work - │ - ▼ - LLM (only when required) + │ + ▼ +┌─────────────────────────────────────┐ +│ SLM Control Fabric │ +│ (routing, decomposition, compression)│ +└─────────────────────────────────────┘ + │ + ▼ +Routing Decision + │ + ├─→ Code Agent + ├─→ Infra Agent + ├─→ Security Agent + └─→ Research Agent + │ + ▼ + Specialist Work + │ + ▼ + LLM Synthesis (only when needed) ``` -## SLM Use Cases +## Strong SLM Roles in Cognitive Mesh -### 1. Agent Router +### 1. Router -Determine which specialist agent should handle a request. +Pick which specialist or workflow handles the request. -**Example agents:** - -- code agent -- research agent -- infrastructure agent -- financial agent -- security agent - -SLM acts as a deterministic routing layer. +```json +{ + "agent": "code_agent", + "confidence": 0.94, + "reasoning": "User is asking about refactoring" +} +``` -### 2. Task Decomposition +### 2. 
Task Decomposer -SLM splits requests into tasks: +Break one request into bounded subtasks. **Example:** -User request: "Analyze this repo and generate a deployment plan." +User: "Analyze this repo and generate a deployment plan." SLM decomposition: @@ -53,17 +57,41 @@ SLM decomposition: 3. infrastructure detection 4. deployment strategy generation -Only the final step may require a large model. +Only the final step requires LLM. -### 3. Agent Health Monitoring +### 3. Context Compressor -SLMs analyze: +Reduce token load before LLM synthesis. -- agent logs -- task failure messages -- retry signals +```json +{ + "summary": "User wants Azure cost analysis", + "relevant_files": ["infra/main.tf", "infra/outputs.tf"], + "active_task": "generating cost breakdown", + "pruned_messages": 12 +} +``` -They detect issues early without invoking large models. +### 4. Failure Classifier + +Classify failures to determine retry strategy: + +```json +{ + "failure_type": "tool_error", + "retryable": true, + "cause": "transient_network", + "action": "retry_with_backoff" +} +``` + +## Practical Pattern + +A good mesh uses: + +1. **SLM first** — routing, decomposition +2. **Tools/specialists second** — execution +3. 
**LLM only for synthesis** — or when ambiguous ## Implementation @@ -81,7 +109,7 @@ async def select_agent(user_request: str) -> Agent: "research": ResearchAgent, } - return agent_map[classification.intent] + return agent_map[classification.agent] ``` ### Task Decomposition @@ -98,20 +126,28 @@ async def decompose_task(request: str) -> TaskPlan: ) ``` -### Health Check +### Context Compression ```python -async def check_agent_health(agent_logs: list[str]) -> HealthReport: - # SLM analyzes logs for issues - analysis = await slm_analyze_logs(agent_logs) - - return HealthReport( - status=analysis.health_status, - issues=analysis.issues, - recommendations=analysis.recommendations +async def compress_context(messages: list[Message]) -> Compressed: + summary = await slm_summarize(messages) + + return Compressed( + summary=summary.state, + relevant=summary.relevant_messages, + token_estimate=summary.tokens ) ``` +## Tradeoffs + +| Pros | Cons | +| ------------------------------- | ----------------------------------------------- | +| Very large token savings | Decomposition quality can bottleneck workflow | +| Better determinism | Brittle routing if taxonomy is poor | +| Easier specialist orchestration | Harder debugging if confidence handling is weak | +| Improved auditability | | + ## Key Concerns | Concern | Strategy | @@ -130,9 +166,10 @@ async def check_agent_health(agent_logs: list[str]) -> HealthReport: | Security | vulnerability scanning | threat analysis | | Research | information retrieval | synthesis | -## Metrics +## Implementation Checklist -- Routing accuracy by agent type -- Task decomposition quality (steps correct) -- Agent utilization ratio -- LLM escalation rate per agent +- [ ] Define agent taxonomy with capabilities +- [ ] Implement SLM router with structured output +- [ ] Add task decomposition with bounded subtasks +- [ ] Implement context compression before LLM +- [ ] Add failure classification for retry logic diff --git 
a/docs/architecture/cross-system.md b/docs/architecture/cross-system.md index e6b6f54..9062f74 100644 --- a/docs/architecture/cross-system.md +++ b/docs/architecture/cross-system.md @@ -2,6 +2,8 @@ These systems together form a layered architecture. +## Combined Architecture + ``` User / Operator │ @@ -9,66 +11,55 @@ These systems together form a layered architecture. AI Gateway │ (SLM Routing Layer) + intent, risk, + complexity, tools │ ┌───────────────┼────────────────┐ │ │ │ ▼ ▼ ▼ Cognitive Mesh CodeFlow Engine AgentKit Forge │ │ │ + (agent routing) (CI triage) (tool selection) │ │ │ └───────────────┼────────────────┘ │ ▼ Large Model Layer + (reasoning, synthesis) │ ▼ PhoenixRooivalk Edge - (SLM Edge AI) + (SLM Only) + operator summaries, + reports (NOT control) ``` ## Layer Responsibilities -### Layer 1: Edge (PhoenixRooivalk) - -- Local inference only -- No cloud dependency -- Immediate threat response -- Minimal latency - -### Layer 2: Gateway (AI Gateway) - -- First request touchpoint -- Security policy enforcement -- Cost routing decisions -- Telemetry tagging - -### Layer 3: Orchestration (Cognitive Mesh, AgentKit Forge) - -- Multi-agent coordination -- Task decomposition -- Tool selection -- LLM escalation +| Layer | Primary | SLM Role | LLM Role | +| ------------- | ------------------------------ | ----------------- | -------------------- | +| Edge | PhoenixRooivalk | Reports only | None | +| Gateway | AI Gateway | Routing, security | Complex reasoning | +| Orchestration | Cognitive Mesh, AgentKit Forge | Routing, tools | Synthesis | +| Intelligence | CodeFlow Engine | Triage | None | +| Synthesis | LLM Layer | None | Reasoning, synthesis | -### Layer 4: Intelligence (CodeFlow Engine) +## SLM Role by Platform -- CI/CD automation -- Log analysis -- Commit classification -- Release management - -### Layer 5: Large Model Layer - -- Complex reasoning -- Creative generation -- Deep analysis -- Final synthesis +| Platform | Best SLM Role | Should SLM be 
Primary? | Escalate to LLM When | +| --------------- | ----------------------------------------- | ---------------------- | --------------------------------- | +| AI Gateway | routing, safety, cost control | **yes** | ambiguity, complex reasoning | +| Cognitive Mesh | agent routing, decomposition, compression | **yes** | cross-agent synthesis needed | +| CodeFlow Engine | PR/CI triage, failure summaries | **yes** | root cause requires deep analysis | +| AgentKit Forge | tool selection, memory shaping | **yes** | planning becomes ambiguous | +| PhoenixRooivalk | operator summaries, reports | **no** | strategic analysis or long-form | ## Data Flow ``` Edge Event (Rooivalk) │ - ▼ Classify locally + ▼ Classify locally (rules + signal ML) Report │ ▼ Route via Gateway @@ -104,12 +95,59 @@ Across all five platforms, SLMs provide: | Edge Deployment | PhoenixRooivalk inference locally | | Deterministic Behavior | Easier to constrain and audit | +## Practical Deployment Pattern + +### Tiered Model Strategy + +| Tier | Use For | Examples | +| ------ | --------------------- | ---------------------------------------------------------- | +| Tier 0 | deterministic/non-LLM | regex, schemas, policies, hard routing | +| Tier 1 | SLM | classification, decomposition, compression, tool selection | +| Tier 2 | LLM | synthesis, complex reasoning, ambiguous requests | + +### Operating Pattern + +``` +Tier 0 (Rules) + │ + ├─→ Direct pass/fail + │ + ▼ (pass) +Tier 1 (SLM) + │ + ├─→ Classification/compression + │ + ▼ (needs more) +Tier 2 (LLM) + │ + ├─→ Reasoning/synthesis + │ + ▼ +Response + Telemetry +``` + +## Implementation Order + +1. **AI Gateway SLM router** — Highest immediate cost-leverage +2. **CodeFlow Engine CI/PR classifier** — Fastest operational value +3. **Cognitive Mesh decomposer/router** — Strong leverage once taxonomy stabilizes +4. **AgentKit Forge tool selector** — Useful once tool inventory is mature +5. 
**PhoenixRooivalk operator interpreter** — Valuable, keep isolated from critical control + ## Summary -| System | SLM Role | -| --------------- | --------------------------------------- | -| AI Gateway | routing, policy checks, cost prediction | -| Cognitive Mesh | agent routing, task decomposition | -| PhoenixRooivalk | edge telemetry analysis | -| CodeFlow Engine | CI intelligence, log analysis | -| AgentKit Forge | tool selection, context compression | +| System | SLM Role | +| --------------- | ------------------------------------------- | +| AI Gateway | routing, policy checks, cost prediction | +| Cognitive Mesh | agent routing, task decomposition | +| PhoenixRooivalk | edge telemetry interpretation (NOT control) | +| CodeFlow Engine | CI intelligence, log analysis | +| AgentKit Forge | tool selection, context compression | + +## Key Principles + +1. **SLMs decide, LLMs reason** — SLM for routing/classification, LLM for synthesis +2. **Schema-bound outputs** — Always use structured output schemas +3. **Confidence cascades** — Low confidence → escalate to next tier +4. **Safety boundaries** — Never use SLM for safety-critical decisions (Rooivalk) +5. **Cost controls** — Budget caps, monitoring, alerts diff --git a/docs/architecture/phoenix-rooivalk.md b/docs/architecture/phoenix-rooivalk.md index 627374a..dedaf96 100644 --- a/docs/architecture/phoenix-rooivalk.md +++ b/docs/architecture/phoenix-rooivalk.md @@ -1,10 +1,6 @@ # PhoenixRooivalk -PhoenixRooivalk is an edge AI counter-UAS (Unmanned Aerial System) system. Key constraints: - -- compute must run locally -- latency must be extremely low -- connectivity cannot be assumed +PhoenixRooivalk is an edge AI counter-UAS (Unmanned Aerial System) system. **SLM must NOT be the primary kinetic or safety-critical decision-maker** — it sits in interpretation and operator-support layer only. ## Architecture @@ -12,53 +8,75 @@ PhoenixRooivalk is an edge AI counter-UAS (Unmanned Aerial System) system. 
Key c Sensors │ ▼ -Telemetry Pipeline +┌─────────────────────────────────────┐ +│ Rules + Signal Models + Fusion │ +│ (core detection - NOT SLM) │ +└─────────────────────────────────────┘ │ ▼ -SLM Edge Processor +Threat Detection │ - ├─ event classification - ├─ threat summarization - └─ anomaly detection + ▼ +┌─────────────────────────────────────┐ +│ SLM Interpretation Layer │ +│ (summaries, reports, narratives) │ +└─────────────────────────────────────┘ │ ▼ Operator Console ``` -## SLM Use Cases +## Critical Principle + +> Use **rules + signal models + fusion engine** for core detection. +> Use **SLM only** for human-readable interpretation and workflow assistance. + +**Never use SLM for:** -### 1. Telemetry Interpretation +- Primary safety-critical actuation +- Final kinetic authorization +- Real-time hard control loops +- Deterministic low-level signal classification (use classical/ML models) -Drones produce large telemetry streams: +## Good SLM Use Cases -- radar -- RF signatures -- flight patterns -- behavior anomalies +| Use Case | Description | Output | +| ---------------------- | --------------------------- | ---------------------------------------- | +| Alert Summaries | Format alerts for operators | "Drone approaching from NW at 35m" | +| Event Clustering | Group similar events | `{ "cluster": "loitering", "count": 3 }` | +| Post-Mission Narrative | Generate mission reports | Full structured report | +| SOP Lookup | Suggest procedures | `{ "sop": "perimeter breach" }` | +| Incident Drafting | Draft incident reports | Human-readable report | +| Telemetry Translation | Convert raw to text | "RF signature consistent with..." | -SLM interprets events: +## Example SLM Outputs + +### Alert Summary ```json { - "pattern": "loitering", + "summary": "Drone detected approaching perimeter at 35m altitude", "classification": "suspicious", - "confidence": 0.74 + "confidence": 0.74, + "relevant_sensors": ["radar", "rf"], + "operator_action": "monitor" } ``` -### 2. 
Threat Report Summarization - -Edge device converts raw telemetry into operator reports. - -**Example:** +### Post-Mission Narrative -Raw data → SLM summary: - -> Drone detected approaching perimeter at 35m altitude, RF signature consistent with consumer quadcopter. - -### 3. Mission Log Structuring - -SLM converts unstructured logs into structured intelligence records. +``` +Mission Summary: +- Duration: 45 minutes +- Events detected: 3 +- Threats: 1 (non-critical) +- Actions taken: Monitor mode + +Key Event: +14:32 - Drone detected approaching perimeter from NW +Classification: Consumer quadcopter (RF signature match) +Resolution: Left area at 14:38 +``` ## Implementation @@ -70,48 +88,65 @@ class EdgeProcessor: self.slm = load_local_slm() # Gemma or Phi-3 async def process_telemetry(self, raw_stream: bytes) -> ProcessedEvent: - # Parse telemetry - telemetry = self.parse(raw_stream) - - # SLM classification - classification = await self.slm.classify(telemetry) + # Core detection is NOT SLM - rules + signal models + detection = self.fusion_engine.process(raw_stream) - # Generate summary if threat detected - if classification.threat_level > THRESHOLD: - summary = await self.slm.summarize(telemetry) + if detection.threat_level > THRESHOLD: + # SLM only for human interpretation + summary = await self.slm.summarize(detection) return ProcessedEvent( - classification=classification, - summary=summary, + detection=detection, + summary=summary, # SLM output timestamp=datetime.utcnow() ) ``` -### Local Inference +### Alert Formatting ```python -# Run on edge device (Jetson Nano / edge GPU) -async def run_local_inference(telemetry_data): - # No cloud call - all local - model = SLMModel("phi-3-mini-4k") - - result = await model.run( - input=telemetry_data, - device="cuda", # or "cpu" for minimal hardware - batch_size=1 - ) - - return result +async def format_alert(detection: Detection) -> OperatorAlert: + prompt = f"""Format this detection for operator: + +Radar: 
{detection.radar_summary} +RF: {detection.rf_signature} +Flight: {detection.flight_pattern} + +Output: summary, classification, recommended_action""" + + return await slm_completion(prompt) +``` + +### Report Generation + +```python +async def generate_mission_report(events: list[Event]) -> MissionReport: + prompt = f"""Generate post-mission report: + +Events: {format_events(events)} +Duration: {mission.duration} + +Output: structured report with key findings""" + + return await slm_completion(prompt) ``` +## Tradeoffs + +| Pros | Cons | +| ----------------------------- | ------------------------------------------------------------ | +| Better operator comprehension | Hallucinated interpretations dangerous if presented as facts | +| Faster report generation | Must clearly separate inferred from sensor facts | +| Reduced cognitive load | Offline edge deployment constraints | + ## Key Concerns -| Concern | Strategy | -| -------------------- | --------------------------------------------- | -| Hardware constraints | Optimize SLM for edge (quantization, pruning) | -| Latency | Must process in <100ms | -| Reliability | Offline-first; queue for later sync | -| Security | No external connectivity required | +| Concern | Strategy | +| ------------------------- | -------------------------------------------- | +| Safety-critical decisions | Never use SLM for actuation | +| Hallucination | Clearly label SLM output as "interpretation" | +| Edge constraints | Optimize SLM for edge (quantization) | +| Offline operation | Full local inference capability | ## Hardware Options @@ -135,10 +170,11 @@ model = quantize( ) ``` -## Metrics +## Implementation Checklist -- Processing latency (target: <50ms p99) -- Classification accuracy vs cloud baseline -- Offline operation time -- Memory footprint -- Threat detection rate +- [ ] Separate SLM from core detection pipeline +- [ ] Implement alert summarization for operators +- [ ] Add post-mission narrative generation +- [ ] Clearly label 
SLM output vs sensor facts +- [ ] Optimize for edge deployment +- [ ] Test offline operation diff --git a/docs/architecture/slm-management-plan.md b/docs/architecture/slm-management-plan.md index 15054b9..7c116c5 100644 --- a/docs/architecture/slm-management-plan.md +++ b/docs/architecture/slm-management-plan.md @@ -55,7 +55,7 @@ Cost Control Layers └─────────────────────────────────────┘ ``` -### Metrics +### Targets | Metric | Target | | -------------------- | ------ | @@ -91,6 +91,20 @@ Cost Control Layers ## 4. Edge Deployment (PhoenixRooivalk) +### Critical: SLM is NOT Primary + +> **Never use SLM for safety-critical decisions.** + +SLM is only for: + +- Operator-facing summaries +- Report generation +- Post-mission narratives + +Core detection uses: + +- Rules + signal models + fusion engine + ### Strategy | Requirement | Solution | @@ -225,6 +239,7 @@ Discovery → Testing → Staging → Production → Deprecated → Retired ### PhoenixRooivalk +- **CRITICAL**: SLM NOT for safety decisions - Edge hardware diversity - Offline reliability - Minimal latency @@ -241,6 +256,11 @@ Discovery → Testing → Staging → Production → Deprecated → Retired - Context compression ratio - LLM call reduction +## Canonical Principle + +> **Use SLMs to decide, filter, classify, compress, and prepare.** +> **Use LLMs to reason, reconcile, synthesize, and communicate.** + ## Action Items 1. [ ] Establish model registry with tiered selection @@ -251,3 +271,4 @@ Discovery → Testing → Staging → Production → Deprecated → Retired 6. [ ] Define fallback hierarchies 7. [ ] Implement observability stack 8. [ ] Document model lifecycle process +9. [ ] **Add explicit safety boundary for PhoenixRooivalk** diff --git a/docs/guides/README.md b/docs/guides/README.md index 3fd3038..a6ea1d0 100644 --- a/docs/guides/README.md +++ b/docs/guides/README.md @@ -4,7 +4,17 @@ Implementation guides for various topics. 
## SLM Implementation
 
-- [README](README.md) - When to use SLMs vs LLMs, implementation patterns
+- [README](README.md) - Practical SLM implementation patterns, when to use SLM vs LLM
+
+## Architecture Reference
+
+See [docs/architecture](../architecture/) for detailed system documentation:
+
+- AI Gateway — SLM as admission control
+- Cognitive Mesh — Agent orchestration
+- PhoenixRooivalk — Edge AI (reports only)
+- CodeFlow Engine — CI/CD intelligence
+- AgentKit Forge — Agent building
 
 ## Coming Soon

From c7d997a262e5798cb02b5f627d66e18fe5773e42 Mon Sep 17 00:00:00 2001
From: JustAGhosT
Date: Sun, 15 Mar 2026 12:53:48 +0200
Subject: [PATCH 10/11] docs: reorganize architecture documentation into
 systems and reference folders

This PR makes significant improvements to the architecture documentation by:

1. Creating a better organizational structure with separate folders for:
   - `systems/` - Individual system documentation
   - `reference/` - Detailed implementation matrices and technical specifications
   - `reference/strategic/` - Strategic guidance documents

2. Adding comprehensive documentation about SLM (Small Language Model) implementation across all systems:
   - Detailed implementation matrices for each system
   - Confidence threshold guidelines
   - Telemetry schemas
   - Contract shapes and API endpoints

3. Adding new architectural diagrams:
   - Deployment architecture
   - Trust boundaries
   - Observability architecture
   - Cross-system integration patterns

4. Adding documentation for a new system called "Mystira" (interactive story generation system)

5. Providing strategic guidance on SLM implementation order and best practices

The PR significantly improves the documentation by providing practical implementation details while maintaining a consistent architectural vision across all systems. The reorganization makes the documentation more navigable and the new reference materials provide clear implementation guidance.
This PR should be approved as it greatly enhances the architecture documentation with practical implementation details. --- docs/architecture/01-system-context.md | 87 +++ .../architecture/02-container-architecture.md | 95 +++ .../03-deployment-trust-boundaries.md | 82 +++ .../04-observability-telemetry.md | 94 +++ docs/architecture/05-slm-llm-decision-flow.md | 60 ++ docs/architecture/06-shared-contracts.md | 57 ++ docs/architecture/07-repo-ownership-map.md | 28 + docs/architecture/README.md | 204 +++++- docs/architecture/cross-system.md | 153 ----- .../architecture/reference/c4-architecture.md | 332 ++++++++++ docs/architecture/reference/contracts.md | 117 ++++ docs/architecture/reference/cross-system.md | 503 +++++++++++++++ docs/architecture/reference/dashboards.md | 17 + .../reference/deployment-observability.md | 383 ++++++++++++ .../architecture/reference/matrix-agentkit.md | 110 ++++ .../architecture/reference/matrix-codeflow.md | 110 ++++ .../reference/matrix-cognitive-mesh.md | 112 ++++ docs/architecture/reference/matrix-gateway.md | 111 ++++ docs/architecture/reference/matrix-mystira.md | 137 ++++ .../architecture/reference/matrix-rooivalk.md | 120 ++++ .../reference/operations-patterns.md | 152 +++++ .../reference/slm-implementation-matrix.md | 260 ++++++++ .../{ => reference}/slm-management-plan.md | 0 .../reference/strategic/01-why-slms-matter.md | 86 +++ .../strategic/02-gateway-slm-use-cases.md | 90 +++ .../strategic/03-cognitive-mesh-use-cases.md | 95 +++ .../strategic/04-codeflow-use-cases.md | 87 +++ .../strategic/05-agentkit-use-cases.md | 85 +++ .../strategic/06-rooivalk-use-cases.md | 76 +++ .../strategic/07-deployment-model.md | 75 +++ .../strategic/08-implementation-order.md | 56 ++ .../reference/strategic/README.md | 28 + .../{ => systems}/agentkit-forge.md | 0 docs/architecture/{ => systems}/ai-gateway.md | 0 .../{ => systems}/codeflow-engine.md | 0 .../{ => systems}/cognitive-mesh.md | 0 docs/architecture/systems/mystira.md | 584 
++++++++++++++++++ .../{ => systems}/phoenix-rooivalk.md | 0 38 files changed, 4408 insertions(+), 178 deletions(-) create mode 100644 docs/architecture/01-system-context.md create mode 100644 docs/architecture/02-container-architecture.md create mode 100644 docs/architecture/03-deployment-trust-boundaries.md create mode 100644 docs/architecture/04-observability-telemetry.md create mode 100644 docs/architecture/05-slm-llm-decision-flow.md create mode 100644 docs/architecture/06-shared-contracts.md create mode 100644 docs/architecture/07-repo-ownership-map.md delete mode 100644 docs/architecture/cross-system.md create mode 100644 docs/architecture/reference/c4-architecture.md create mode 100644 docs/architecture/reference/contracts.md create mode 100644 docs/architecture/reference/cross-system.md create mode 100644 docs/architecture/reference/dashboards.md create mode 100644 docs/architecture/reference/deployment-observability.md create mode 100644 docs/architecture/reference/matrix-agentkit.md create mode 100644 docs/architecture/reference/matrix-codeflow.md create mode 100644 docs/architecture/reference/matrix-cognitive-mesh.md create mode 100644 docs/architecture/reference/matrix-gateway.md create mode 100644 docs/architecture/reference/matrix-mystira.md create mode 100644 docs/architecture/reference/matrix-rooivalk.md create mode 100644 docs/architecture/reference/operations-patterns.md create mode 100644 docs/architecture/reference/slm-implementation-matrix.md rename docs/architecture/{ => reference}/slm-management-plan.md (100%) create mode 100644 docs/architecture/reference/strategic/01-why-slms-matter.md create mode 100644 docs/architecture/reference/strategic/02-gateway-slm-use-cases.md create mode 100644 docs/architecture/reference/strategic/03-cognitive-mesh-use-cases.md create mode 100644 docs/architecture/reference/strategic/04-codeflow-use-cases.md create mode 100644 docs/architecture/reference/strategic/05-agentkit-use-cases.md create mode 100644 
docs/architecture/reference/strategic/06-rooivalk-use-cases.md create mode 100644 docs/architecture/reference/strategic/07-deployment-model.md create mode 100644 docs/architecture/reference/strategic/08-implementation-order.md create mode 100644 docs/architecture/reference/strategic/README.md rename docs/architecture/{ => systems}/agentkit-forge.md (100%) rename docs/architecture/{ => systems}/ai-gateway.md (100%) rename docs/architecture/{ => systems}/codeflow-engine.md (100%) rename docs/architecture/{ => systems}/cognitive-mesh.md (100%) create mode 100644 docs/architecture/systems/mystira.md rename docs/architecture/{ => systems}/phoenix-rooivalk.md (100%) diff --git a/docs/architecture/01-system-context.md b/docs/architecture/01-system-context.md new file mode 100644 index 0000000..9c20e95 --- /dev/null +++ b/docs/architecture/01-system-context.md @@ -0,0 +1,87 @@ +# System Context + +Status: Accepted +Date: 2026-03-15 +Owners: PhoenixVC Architecture Group + +## Context + +The PhoenixVC AI Platform integrates multiple intelligent systems designed to support: + +- AI request routing and governance +- Multi-agent orchestration +- Developer workflow intelligence +- Tool-driven agent execution +- Edge telemetry interpretation + +The platform consists of five major subsystems: + +1. AI Gateway +2. Cognitive Mesh +3. CodeFlow Engine +4. AgentKit Forge +5. PhoenixRooivalk + +These systems operate across both cloud infrastructure and edge deployments, and rely on a hybrid SLM + LLM architecture for performance, cost efficiency, and reasoning capability. 
+ +## Decision + +Adopt a layered architecture where: + +- AI Gateway acts as the control-plane entry point +- SLMs perform routing, triage, screening, and compression +- LLMs are used selectively for high-value reasoning +- Edge systems remain locally autonomous when necessary + +## System Context Diagram + +```mermaid +flowchart TB + User[Users / Operators / Developers] + Apps[Client Apps / APIs] + GitHub[GitHub / CI Events] + Sensors[PhoenixRooivalk Sensors] + Providers[Model Providers] + Tools[External Tools / APIs] + + subgraph Platform + AIG[AI Gateway] + CM[Cognitive Mesh] + CFE[CodeFlow Engine] + AKF[AgentKit Forge] + PR[PhoenixRooivalk] + end + + User --> AIG + Apps --> AIG + GitHub --> CFE + Sensors --> PR + + AIG --> CM + AIG --> CFE + AIG --> AKF + AIG --> PR + + CM --> Providers + AKF --> Providers + CFE --> Providers + + CM --> Tools + AKF --> Tools + CFE --> Tools +``` + +## Consequences + +### Advantages + +- centralized governance of AI usage +- consistent routing logic +- scalable orchestration +- edge autonomy + +### Tradeoffs + +- additional architectural complexity +- routing model calibration required +- shared telemetry contracts required diff --git a/docs/architecture/02-container-architecture.md b/docs/architecture/02-container-architecture.md new file mode 100644 index 0000000..2b66950 --- /dev/null +++ b/docs/architecture/02-container-architecture.md @@ -0,0 +1,95 @@ +# Container Architecture + +Status: Accepted +Date: 2026-03-15 + +## Context + +To support scalability and independent evolution of system capabilities, the platform is decomposed into containerized services. + +Each service is responsible for a clearly bounded domain. 
+ +## Container Diagram + +```mermaid +flowchart TB + subgraph Clients + C1[Chat UI] + C2[Internal Apps] + C3[GitHub Webhooks] + C4[Operator Console] + end + + subgraph Gateway + G1[Ingress API] + G2[SLM Classifier] + G3[Policy Scan] + G4[Budget Router] + G5[Semantic Cache] + G6[Escalation Judge] + end + + subgraph Mesh + M1[Specialist Router] + M2[Task Decomposer] + M3[State Manager] + M4[Synthesis Coordinator] + end + + subgraph Forge + F1[Tool Selector] + F2[Argument Extractor] + F3[Execution Loop] + F4[Result Compressor] + end + + subgraph CodeFlow + CF1[PR Classifier] + CF2[Risk Scorer] + CF3[CI Triage] + CF4[Review Engine] + end + + subgraph Models + SLM[SLM Pool] + LLM[LLM Pool] + end + + C1 --> G1 + C2 --> G1 + C4 --> G1 + + G1 --> G2 + G2 --> G3 + G3 --> G4 + G4 --> G5 + G5 --> G6 + + G6 --> M1 + G6 --> F1 + G6 --> CF1 + + M1 --> M2 + M2 --> M3 + M3 --> M4 + + F1 --> F2 + F2 --> F3 + F3 --> F4 + + CF1 --> CF2 + CF2 --> CF3 + CF3 --> CF4 +``` + +## Consequences + +### Benefits + +- service isolation +- independent scaling +- clearer ownership + +### Tradeoffs + +- increased service orchestration complexity diff --git a/docs/architecture/03-deployment-trust-boundaries.md b/docs/architecture/03-deployment-trust-boundaries.md new file mode 100644 index 0000000..18724c7 --- /dev/null +++ b/docs/architecture/03-deployment-trust-boundaries.md @@ -0,0 +1,82 @@ +# Deployment and Trust Boundaries + +Status: Accepted + +## Context + +The system interacts with external users, internal services, model providers, and edge devices. Clear trust boundaries must be established. 
+ +## Trust Boundary Diagram + +```mermaid +flowchart LR + subgraph Public + A[Users] + B[GitHub] + C[External Apps] + end + + subgraph Ingress + D[API Gateway / WAF] + E[AI Gateway] + end + + subgraph ControlPlane + F[Policy Engine] + G[Session Store] + H[Semantic Cache] + I[Observability] + end + + subgraph Execution + J[Cognitive Mesh] + K[AgentKit Forge] + L[CodeFlow Engine] + end + + subgraph Integration + M[Key Vault] + N[Azure APIs] + O[GitHub APIs] + end + + subgraph ExternalModels + P[LLM Providers] + end + + subgraph Edge + Q[PhoenixRooivalk Node] + R[Sensors] + end + + A --> D + B --> D + C --> D + D --> E + + E --> F + E --> G + E --> H + E --> I + + E --> J + E --> K + E --> L + + J --> N + K --> N + L --> O + + E --> M + E --> P + + R --> Q + Q --> E +``` + +## Security Principles + +- **Gateway is the only public AI ingress.** +- **Secrets only accessed through Key Vault.** +- **Tool access occurs through controlled brokers.** +- **Edge nodes operate under constrained trust.** diff --git a/docs/architecture/04-observability-telemetry.md b/docs/architecture/04-observability-telemetry.md new file mode 100644 index 0000000..3afe313 --- /dev/null +++ b/docs/architecture/04-observability-telemetry.md @@ -0,0 +1,94 @@ +# Observability and Telemetry + +Status: Accepted + +## Context + +Cross-system observability is required for: + +- cost visibility +- routing quality measurement +- policy enforcement evidence +- debugging and operational monitoring + +## Telemetry Architecture + +```mermaid +flowchart TB + subgraph Producers + P1[AI Gateway] + P2[Cognitive Mesh] + P3[AgentKit Forge] + P4[CodeFlow Engine] + P5[Rooivalk Edge] + end + + subgraph Signals + S1[Request Logs] + S2[Routing Decisions] + S3[Policy Events] + S4[Tool Calls] + S5[Model Usage] + S6[Edge Events] + end + + subgraph Ingest + I1[OpenTelemetry] + I2[Azure Monitor] + I3[Blob Export] + end + + subgraph Analytics + A1[Azure Data Explorer] + A2[Cost Aggregates] + A3[Quality Metrics] + end + + 
subgraph Visualization + V1[Grafana] + V2[Alerts] + end + + P1 --> S1 + P1 --> S2 + P1 --> S5 + P2 --> S2 + P3 --> S4 + P4 --> S1 + P5 --> S6 + + S1 --> I1 + S2 --> I1 + S4 --> I1 + S5 --> I2 + S6 --> I3 + + I1 --> A1 + I2 --> A1 + I3 --> A1 + + A1 --> V1 + V1 --> V2 +``` + +## Key Metrics + +### Gateway + +- routing decision distribution +- SLM vs LLM usage ratio +- cache hit rate + +### CodeFlow + +- PR classification accuracy +- CI triage distribution + +### AgentKit + +- tool selection success rate + +### Rooivalk + +- alert compression ratio +- edge escalation frequency diff --git a/docs/architecture/05-slm-llm-decision-flow.md b/docs/architecture/05-slm-llm-decision-flow.md new file mode 100644 index 0000000..124e27a --- /dev/null +++ b/docs/architecture/05-slm-llm-decision-flow.md @@ -0,0 +1,60 @@ +# SLM to LLM Decision Flow + +Status: Accepted + +## Context + +Small Language Models are used as the operational cognition layer, while Large Language Models perform high-value reasoning. 
+ +## Decision Flow + +```mermaid +flowchart TD + A[Incoming Request] + B[SLM Preprocess] + C[Intent Classification] + D[Policy Scan] + E[Tool Check] + F[Complexity Estimate] + G[Confidence Score] + + A --> B + B --> C + C --> D + D --> E + E --> F + F --> G + + G --> H{Policy violation?} + H -->|Yes| X[Block / Redact] + H -->|No| I{Simple task?} + + I -->|Yes| Y[Return SLM result] + I -->|No| J{Tool first?} + + J -->|Yes| K[Execute Tool] + K --> L[SLM Compress Result] + L --> M{Enough?} + + M -->|Yes| Y + M -->|No| N[Escalate] + + J -->|No| N + + N --> O[LLM Reasoning] + O --> P[Post-check] + P --> Q[Return Response] +``` + +## Consequences + +### Benefits + +- reduced inference cost +- lower latency +- improved throughput + +### Risks + +- incorrect routing +- model confidence calibration required diff --git a/docs/architecture/06-shared-contracts.md b/docs/architecture/06-shared-contracts.md new file mode 100644 index 0000000..4b19404 --- /dev/null +++ b/docs/architecture/06-shared-contracts.md @@ -0,0 +1,57 @@ +# Shared Contracts + +Status: Accepted + +## Routing Decision + +```json +{ + "intent": "string", + "complexity": "low|medium|high", + "risk_level": "low|medium|high|critical", + "policy_status": "allow|redact|deny|review", + "needs_tool": true, + "recommended_tier": "slm|llm", + "recommended_path": "direct|tool_first|mesh|escalate", + "confidence": 0.0 +} +``` + +## Model Usage Event + +```json +{ + "trace_id": "uuid", + "system": "ai-gateway", + "model_tier": "slm", + "model_name": "model-id", + "token_in": 320, + "token_out": 64, + "latency_ms": 41, + "estimated_cost": 0.0002 +} +``` + +## Tool Execution Event + +```json +{ + "trace_id": "uuid", + "tool_name": "azure_cli", + "action": "query_metrics", + "success": true, + "latency_ms": 820 +} +``` + +## Edge Escalation Packet + +```json +{ + "event_id": "uuid", + "site_id": "string", + "event_label": "rf_anomaly", + "summary": "Drone signature detected near perimeter", + "confidence": 0.78 +} +``` 
diff --git a/docs/architecture/07-repo-ownership-map.md b/docs/architecture/07-repo-ownership-map.md new file mode 100644 index 0000000..341323f --- /dev/null +++ b/docs/architecture/07-repo-ownership-map.md @@ -0,0 +1,28 @@ +# Repository Ownership Map + +Status: Accepted + +## Repository Map + +```mermaid +flowchart LR + R1[pvc-ai-gateway] --> S1[AI Gateway Service] + R2[cognitive-mesh] --> S2[Cognitive Mesh] + R3[codeflow-engine] --> S3[CodeFlow Engine] + R4[agentkit-forge] --> S4[AgentKit Forge] + R5[phoenixrooivalk] --> S5[Rooivalk Edge / Command] + R6[shared-contracts] --> S6[Shared Contracts] + R7[infra] --> S7[Infrastructure / Monitoring] +``` + +## Ownership + +| Repository | Owns | +| -------------------- | ------------------------------------------------------ | +| **AI Gateway** | request routing, policy enforcement, model abstraction | +| **Cognitive Mesh** | orchestration, multi-agent coordination | +| **CodeFlow Engine** | CI/CD intelligence, PR analysis | +| **AgentKit Forge** | tool-driven agents, execution runtime | +| **PhoenixRooivalk** | edge telemetry, operator alerts | +| **Shared Contracts** | telemetry schema, routing decisions, audit envelope | +| **Infrastructure** | Azure deployment, monitoring, networking | diff --git a/docs/architecture/README.md b/docs/architecture/README.md index f3032d3..fae13c6 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -14,34 +14,163 @@ The architecture follows a layered approach combining: > **Use SLMs to decide, filter, classify, compress, and prepare.** > **Use LLMs to reason, reconcile, synthesize, and communicate.** -## Documentation - -### Core Concepts - -- [README](README.md) - SLM fundamentals, characteristics, patterns -- [cross-system.md](cross-system.md) - How all systems integrate - -### Project-Specific - -- [ai-gateway.md](ai-gateway.md) - AI Gateway: SLM as admission control & routing -- [cognitive-mesh.md](cognitive-mesh.md) - Agent orchestration: routing, 
decomposition -- [phoenix-rooivalk.md](phoenix-rooivalk.md) - Edge AI: SLM for reports only (NOT control) -- [codeflow-engine.md](codeflow-engine.md) - CI/CD intelligence: PR triage, log analysis -- [agentkit-forge.md](agentkit-forge.md) - Agent building: tool selection, context compression - -### Planning - -- [slm-management-plan.md](slm-management-plan.md) - Cross-project SLM management +## Documentation Structure + +``` +docs/architecture/ +├── README.md # This file +├── 01-system-context.md # ADR: System Context +├── 02-container-architecture.md # ADR: Container Architecture +├── 03-deployment-trust-boundaries.md # ADR: Deployment & Trust Boundaries +├── 04-observability-telemetry.md # ADR: Observability & Telemetry +├── 05-slm-llm-decision-flow.md # ADR: SLM→LLM Decision Flow +├── 06-shared-contracts.md # ADR: Shared Contracts +├── 07-repo-ownership-map.md # ADR: Repository Ownership +├── systems/ # Individual system documentation +│ ├── ai-gateway.md +│ ├── cognitive-mesh.md +│ ├── codeflow-engine.md +│ ├── agentkit-forge.md +│ ├── phoenix-rooivalk.md +│ └── mystira.md +└── reference/ # Reference and planning docs + ├── cross-system.md + ├── c4-architecture.md + ├── deployment-observability.md + ├── contracts.md + ├── operations-patterns.md + ├── dashboards.md + ├── slm-implementation-matrix.md + ├── slm-management-plan.md + ├── matrix-gateway.md + ├── matrix-cognitive-mesh.md + ├── matrix-codeflow.md + ├── matrix-agentkit.md + ├── matrix-rooivalk.md + ├── matrix-mystira.md + └── strategic/ # Strategic guidance + ├── README.md + ├── 01-why-slms-matter.md + ├── 02-gateway-slm-use-cases.md + ├── 03-cognitive-mesh-use-cases.md + ├── 04-codeflow-use-cases.md + ├── 05-agentkit-use-cases.md + ├── 06-rooivalk-use-cases.md + ├── 07-deployment-model.md + └── 08-implementation-order.md +``` + +docs/architecture/ +├── README.md # This file +├── systems/ # Individual system documentation +│ ├── ai-gateway.md +│ ├── cognitive-mesh.md +│ ├── codeflow-engine.md +│ ├── 
agentkit-forge.md +│ ├── phoenix-rooivalk.md +│ └── mystira.md +└── reference/ # Reference and planning docs +├── cross-system.md +├── c4-architecture.md +├── deployment-observability.md +├── contracts.md +├── operations-patterns.md +├── dashboards.md +├── slm-implementation-matrix.md +├── slm-management-plan.md +├── matrix-gateway.md +├── matrix-cognitive-mesh.md +├── matrix-codeflow.md +├── matrix-agentkit.md +├── matrix-rooivalk.md +├── matrix-mystira.md +└── strategic/ # Strategic guidance +├── README.md +├── 01-why-slms-matter.md +├── 02-gateway-slm_use-cases.md +├── 03-cognitive-mesh-use-cases.md +├── 04-codeflow-use-cases.md +├── 05-agentkit-use-cases.md +├── 06-rooivalk-use-cases.md +├── 07-deployment-model.md +└── 08-implementation-order.md + +``` + +docs/architecture/ +├── README.md # This file +├── systems/ # Individual system documentation +│ ├── ai-gateway.md +│ ├── cognitive-mesh.md +│ ├── codeflow-engine.md +│ ├── agentkit-forge.md +│ ├── phoenix-rooivalk.md +│ └── mystira.md +└── reference/ # Reference and planning docs +├── cross-system.md +├── slm-implementation-matrix.md +├── slm-management-plan.md +├── matrix-gateway.md +├── matrix-cognitive-mesh.md +├── matrix-codeflow.md +├── matrix-agentkit.md +├── matrix-rooivalk.md +├── matrix-mystira.md +└── strategic/ # Strategic guidance +├── README.md +├── 01-why-slms-matter.md +├── 02-gateway-slm-use-cases.md +├── 03-cognitive-mesh-use-cases.md +├── 04-codeflow-use-cases.md +├── 05-agentkit-use-cases.md +├── 06-rooivalk-use-cases.md +├── 07-deployment-model.md +└── 08-implementation-order.md + +``` + +### Systems + +- [systems/ai-gateway.md](systems/ai-gateway.md) - AI Gateway: SLM as admission control & routing +- [systems/cognitive-mesh.md](systems/cognitive-mesh.md) - Agent orchestration: routing, decomposition +- [systems/codeflow-engine.md](systems/codeflow-engine.md) - CI/CD intelligence: PR triage, log analysis +- [systems/agentkit-forge.md](systems/agentkit-forge.md) - Agent building: tool 
selection, context compression +- [systems/phoenix-rooivalk.md](systems/phoenix-rooivalk.md) - Edge AI: SLM for reports only (NOT control) +- [systems/mystira.md](systems/mystira.md) - Story generation: SLM as moderation, age-fit, continuity layer + +### Reference + +- [reference/cross-system.md](reference/cross-system.md) - How all systems integrate +- [reference/c4-architecture.md](reference/c4-architecture.md) - C4-style diagrams (context, containers, sequences) +- [reference/deployment-observability.md](reference/deployment-observability.md) - Deployment, trust boundaries, observability +- [reference/contracts.md](reference/contracts.md) - Shared JSON schemas for telemetry and routing +- [reference/operations-patterns.md](reference/operations-patterns.md) - SLM→LLM decision flows, ownership, implementation +- [reference/dashboards.md](reference/dashboards.md) - Recommended Grafana/ADX dashboards +- [reference/slm-implementation-matrix.md](reference/slm-implementation-matrix.md) - Overview with threshold summary +- [reference/slm-management-plan.md](reference/slm-management-plan.md) - Cross-project SLM management + +### Strategic Guidance + +- [reference/strategic/README.md](reference/strategic/README.md) - Strategic SLM guidance index +- [reference/strategic/01-why-slms-matter.md](reference/strategic/01-why-slms-matter.md) - Executive summary +- [reference/strategic/02-gateway-slm-use-cases.md](reference/strategic/02-gateway-slm-use-cases.md) - AI Gateway use cases +- [reference/strategic/03-cognitive-mesh-use-cases.md](reference/strategic/03-cognitive-mesh-use-cases.md) - Cognitive Mesh use cases +- [reference/strategic/04-codeflow-use-cases.md](reference/strategic/04-codeflow-use-cases.md) - CodeFlow Engine use cases +- [reference/strategic/05-agentkit-use-cases.md](reference/strategic/05-agentkit-use-cases.md) - AgentKit Forge use cases +- [reference/strategic/06-rooivalk-use-cases.md](reference/strategic/06-rooivalk-use-cases.md) - PhoenixRooivalk use cases 
+- [reference/strategic/07-deployment-model.md](reference/strategic/07-deployment-model.md) - Deployment model +- [reference/strategic/08-implementation-order.md](reference/strategic/08-implementation-order.md) - Implementation order ## Quick Reference -| System | SLM Role | Key Document | -| --------------- | --------------------------------------- | ------------------- | -| AI Gateway | routing, policy checks, cost prediction | ai-gateway.md | -| Cognitive Mesh | agent routing, task decomposition | cognitive-mesh.md | -| PhoenixRooivalk | **operator summaries only** | phoenix-rooivalk.md | -| CodeFlow Engine | CI intelligence, log analysis | codeflow-engine.md | -| AgentKit Forge | tool selection, context compression | agentkit-forge.md | +| System | SLM Role | Key Document | +| --------------- | ----------------------------------------- | ---------------------------------------------------------- | +| AI Gateway | routing, policy checks, cost prediction | [systems/ai-gateway.md](systems/ai-gateway.md) | +| Cognitive Mesh | agent routing, task decomposition | [systems/cognitive-mesh.md](systems/cognitive-mesh.md) | +| PhoenixRooivalk | **operator summaries only** | [systems/phoenix-rooivalk.md](systems/phoenix-rooivalk.md) | +| CodeFlow Engine | CI intelligence, log analysis | [systems/codeflow-engine.md](systems/codeflow-engine.md) | +| AgentKit Forge | tool selection, context compression | [systems/agentkit-forge.md](systems/agentkit-forge.md) | +| Mystira | story classification, moderation, age-fit | [systems/mystira.md](systems/mystira.md) | ## Implementation Order @@ -50,6 +179,7 @@ The architecture follows a layered approach combining: 3. **Cognitive Mesh decomposer/router** — Strong leverage once taxonomy stabilizes 4. **AgentKit Forge tool selector** — Useful once tool inventory is mature 5. **PhoenixRooivalk operator interpreter** — Valuable, keep isolated from critical control +6. 
**Mystira story control layer** — For child-safe story generation with SLM-based moderation ## Tiered Model Strategy @@ -58,3 +188,24 @@ The architecture follows a layered approach combining: | Tier 0 | deterministic/non-LLM | regex, schemas, policies | | Tier 1 | SLM | classification, decomposition, tool selection | | Tier 2 | LLM | synthesis, complex reasoning | + +## Diagram Tools + +This documentation uses **Mermaid** for inline diagrams (rendered in VS Code, GitHub, etc.). + +For high-quality published diagrams, consider: + +- **Figma MCP** - AI-powered Figma integration via VS Code extension +- **Mermaid Live Editor** - Online Mermaid diagram editing +- **Draw.io** - Traditional diagram editor + +### Using Figma MCP for Architecture Diagrams + +The [MCP Figma VS Code extension](https://github.com/sethdford/mcp-figma) enables AI-assisted diagram creation: + +1. Install the extension in VS Code +2. Configure MCP server for your AI assistant +3. Use AI to generate and edit architecture diagrams in Figma + +This is useful for creating polished, branded diagrams for presentations and documentation. + diff --git a/docs/architecture/cross-system.md b/docs/architecture/cross-system.md deleted file mode 100644 index 9062f74..0000000 --- a/docs/architecture/cross-system.md +++ /dev/null @@ -1,153 +0,0 @@ -# Cross-System Architecture - -These systems together form a layered architecture. 
- -## Combined Architecture - -``` - User / Operator - │ - ▼ - AI Gateway - │ - (SLM Routing Layer) - intent, risk, - complexity, tools - │ - ┌───────────────┼────────────────┐ - │ │ │ - ▼ ▼ ▼ - Cognitive Mesh CodeFlow Engine AgentKit Forge - │ │ │ - (agent routing) (CI triage) (tool selection) - │ │ │ - └───────────────┼────────────────┘ - │ - ▼ - Large Model Layer - (reasoning, synthesis) - │ - ▼ - PhoenixRooivalk Edge - (SLM Only) - operator summaries, - reports (NOT control) -``` - -## Layer Responsibilities - -| Layer | Primary | SLM Role | LLM Role | -| ------------- | ------------------------------ | ----------------- | -------------------- | -| Edge | PhoenixRooivalk | Reports only | None | -| Gateway | AI Gateway | Routing, security | Complex reasoning | -| Orchestration | Cognitive Mesh, AgentKit Forge | Routing, tools | Synthesis | -| Intelligence | CodeFlow Engine | Triage | None | -| Synthesis | LLM Layer | None | Reasoning, synthesis | - -## SLM Role by Platform - -| Platform | Best SLM Role | Should SLM be Primary? 
| Escalate to LLM When | -| --------------- | ----------------------------------------- | ---------------------- | --------------------------------- | -| AI Gateway | routing, safety, cost control | **yes** | ambiguity, complex reasoning | -| Cognitive Mesh | agent routing, decomposition, compression | **yes** | cross-agent synthesis needed | -| CodeFlow Engine | PR/CI triage, failure summaries | **yes** | root cause requires deep analysis | -| AgentKit Forge | tool selection, memory shaping | **yes** | planning becomes ambiguous | -| PhoenixRooivalk | operator summaries, reports | **no** | strategic analysis or long-form | - -## Data Flow - -``` -Edge Event (Rooivalk) - │ - ▼ Classify locally (rules + signal ML) -Report - │ - ▼ Route via Gateway -AI Gateway - │ - ├─→ Route to Cognitive Mesh (agent task) - ├─→ Route to CodeFlow (CI task) - └─→ Route to AgentKit (tool task) - │ - ▼ - SLM Selection - │ - ┌────┼────┐ - │ │ │ - ▼ ▼ ▼ - Tool LLM Cache - │ - ▼ - Result + Telemetry - │ - ▼ - Cost Attribution -``` - -## Why SLMs Matter - -Across all five platforms, SLMs provide: - -| Benefit | Description | -| ---------------------- | --------------------------------------- | -| Cost Control | Large models invoked only when required | -| Latency Reduction | Routing decisions in milliseconds | -| Edge Deployment | PhoenixRooivalk inference locally | -| Deterministic Behavior | Easier to constrain and audit | - -## Practical Deployment Pattern - -### Tiered Model Strategy - -| Tier | Use For | Examples | -| ------ | --------------------- | ---------------------------------------------------------- | -| Tier 0 | deterministic/non-LLM | regex, schemas, policies, hard routing | -| Tier 1 | SLM | classification, decomposition, compression, tool selection | -| Tier 2 | LLM | synthesis, complex reasoning, ambiguous requests | - -### Operating Pattern - -``` -Tier 0 (Rules) - │ - ├─→ Direct pass/fail - │ - ▼ (pass) -Tier 1 (SLM) - │ - ├─→ Classification/compression - │ - ▼ (needs 
more) -Tier 2 (LLM) - │ - ├─→ Reasoning/synthesis - │ - ▼ -Response + Telemetry -``` - -## Implementation Order - -1. **AI Gateway SLM router** — Highest immediate cost-leverage -2. **CodeFlow Engine CI/PR classifier** — Fastest operational value -3. **Cognitive Mesh decomposer/router** — Strong leverage once taxonomy stabilizes -4. **AgentKit Forge tool selector** — Useful once tool inventory is mature -5. **PhoenixRooivalk operator interpreter** — Valuable, keep isolated from critical control - -## Summary - -| System | SLM Role | -| --------------- | ------------------------------------------- | -| AI Gateway | routing, policy checks, cost prediction | -| Cognitive Mesh | agent routing, task decomposition | -| PhoenixRooivalk | edge telemetry interpretation (NOT control) | -| CodeFlow Engine | CI intelligence, log analysis | -| AgentKit Forge | tool selection, context compression | - -## Key Principles - -1. **SLMs decide, LLMs reason** — SLM for routing/classification, LLM for synthesis -2. **Schema-bound outputs** — Always use structured output schemas -3. **Confidence cascades** — Low confidence → escalate to next tier -4. **Safety boundaries** — Never use SLM for safety-critical decisions (Rooivalk) -5. **Cost controls** — Budget caps, monitoring, alerts diff --git a/docs/architecture/reference/c4-architecture.md b/docs/architecture/reference/c4-architecture.md new file mode 100644 index 0000000..0ce3c98 --- /dev/null +++ b/docs/architecture/reference/c4-architecture.md @@ -0,0 +1,332 @@ +# C4-Style Architecture + +This section provides C4-style diagrams showing system context, containers, and key sequences. + +## 1. System Context + +This shows the major external actors and the five core systems. 
+ +```mermaid +flowchart TB + User[Users / Operators / Developers] + Apps[Client Apps / Internal Portals / APIs] + GitHub[GitHub / CI Events / PRs / Issues] + Sensors[PhoenixRooivalk Sensors / RF / EO / Radar / Telemetry] + Providers[Model Providers / Hosted Models] + Tools[Azure / Terraform / Kusto / GitHub APIs / Internal Tools] + + subgraph Platform["PhoenixVC AI Platform"] + AIG[AI Gateway] + CM[Cognitive Mesh] + CFE[CodeFlow Engine] + AKF[AgentKit Forge] + PR[PhoenixRooivalk Edge + Command Layer] + end + + User --> AIG + Apps --> AIG + GitHub --> CFE + Sensors --> PR + + AIG --> CM + AIG --> CFE + AIG --> AKF + AIG --> PR + + CM --> Providers + CFE --> Providers + AKF --> Providers + AIG --> Providers + + CM --> Tools + CFE --> Tools + AKF --> Tools + PR --> AIG +``` + +### External Actors + +| Actor | Role | +| ------------------------------ | ------------------------------------------ | +| Users / Operators / Developers | Initiate requests, reviews, investigations | +| Apps / APIs | Consume AI control plane programmatically | +| GitHub | Triggers software delivery workflows | +| Sensors | Produce edge telemetry | +| Model Providers | Serve LLM/SLM inference | +| Tools | Execution surfaces, enterprise integration | + +### System Roles + +| System | Role | +| --------------- | -------------------------------------------- | +| AI Gateway | Front door, routing, policy, budget, caching | +| Cognitive Mesh | Multi-agent coordination and synthesis | +| CodeFlow Engine | SDLC/CI intelligence | +| AgentKit Forge | Tool-driven agent execution | +| PhoenixRooivalk | Edge detection interpretation | + +--- + +## 2. 
Container Diagram + +```mermaid +flowchart TB + subgraph Clients["Clients / Event Sources"] + C1[Web UI / Chat UI] + C2[Internal Apps / APIs] + C3[GitHub Webhooks] + C4[Operator Console] + end + + subgraph Gateway["AI Gateway"] + G1[Ingress API] + G2[SLM Classifier] + G3[Policy Scan] + G4[Budget Router] + G5[Semantic Cache] + G6[Escalation Judge] + end + + subgraph Mesh["Cognitive Mesh"] + M1[Specialist Router] + M2[Task Decomposer] + M3[State Manager] + M4[Synthesis Coordinator] + end + + subgraph Forge["AgentKit Forge"] + F1[Tool Selector] + F2[Argument Extractor] + F3[Execution Loop] + F4[Result Compressor] + end + + subgraph CodeFlow["CodeFlow Engine"] + CF1[PR / Diff Classifier] + CF2[Risk Scorer] + CF3[CI Failure Triage] + CF4[Review / Action Engine] + end + + subgraph Shared["Shared Platform Services"] + S1[Policy Engine] + S2[Observability] + S3[State Store] + S4[Vector Store] + S5[Tool Broker] + end + + subgraph Models["Model Tier"] + ML1[SLM Pool] + ML2[LLM Pool] + end + + subgraph Edge["PhoenixRooivalk Edge"] + E1[Detection Pipeline] + E2[Edge SLM Event Labeler] + E3[Edge SLM Summarizer] + E4[Edge Escalation Filter] + end + + C1 --> G1 + C2 --> G1 + C3 --> CF1 + C4 --> G1 + + G1 --> G2 + G2 --> G3 + G3 --> G4 + G4 --> G5 + G5 --> G6 + + G6 --> M1 + G6 --> F1 + G6 --> CF1 + G6 --> ML2 + + M1 --> M2 + M2 --> M3 + M3 --> M4 + + F1 --> F2 + F2 --> F3 + F3 --> F4 + + CF1 --> CF2 + CF2 --> CF3 + CF3 --> CF4 + + G3 --> S1 + G6 --> S2 + M3 --> S3 + G5 --> S3 + G5 --> S4 + F3 --> S5 + CF4 --> S5 + + E1 --> E2 + E2 --> E3 + E3 --> E4 + E4 --> G1 +``` + +### Container Responsibilities + +#### AI Gateway + +| Container | Responsibility | +| ---------------- | -------------------------------- | +| Ingress API | Entry point | +| SLM Classifier | Intent/complexity classification | +| Policy Scan | Safety/compliance gate | +| Budget Router | Tier selection | +| Semantic Cache | Avoid redundant inference | +| Escalation Judge | Small-vs-large decision | + +#### Cognitive 
Mesh + +| Container | Responsibility | +| --------------------- | ---------------- | +| Specialist Router | Picks agent(s) | +| Task Decomposer | Splits work | +| State Manager | Compressed state | +| Synthesis Coordinator | Merge + escalate | + +#### AgentKit Forge + +| Container | Responsibility | +| ------------------ | ------------------ | +| Tool Selector | Chooses tool | +| Argument Extractor | Structured inputs | +| Execution Loop | Run/retry/fallback | +| Result Compressor | Distills output | + +#### CodeFlow Engine + +| Container | Responsibility | +| -------------------- | ------------------- | +| PR/Diff Classifier | File classification | +| Risk Scorer | Risk assessment | +| CI Failure Triage | Failure bucketing | +| Review/Action Engine | Routing/actions | + +#### PhoenixRooivalk Edge + +| Container | Responsibility | +| ---------------------- | ------------------ | +| Detection Pipeline | Signal processing | +| Edge Event Labeler | Labels events | +| Edge Summarizer | Operator summaries | +| Edge Escalation Filter | Cloud escalation | + +--- + +## 3. CodeFlow Sequence + +```mermaid +sequenceDiagram + participant GH as GitHub + participant CF as CodeFlow + participant SLM as SLM Tier + participant TO as CI / Tool Broker + participant GW as AI Gateway + participant LLM as LLM Tier + + GH->>CF: PR opened / updated + CF->>SLM: classify files + intent + SLM-->>CF: infra-change, high risk + + CF->>TO: trigger CI / contract checks + TO-->>CF: logs, results + + CF->>SLM: triage failures + SLM-->>CF: breaking change detected + + CF->>GW: request remediation + GW->>LLM: analyze + explain + LLM-->>GW: remediation steps + GW-->>CF: response + + CF-->>GH: PR comment with findings +``` + +### SLM Handles + +- File classification +- Risk scoring +- Log bucketing +- Cause identification + +### LLM Handles + +- Remediation proposals +- Tradeoff explanation +- Evidence synthesis + +--- + +## 4. 
PhoenixRooivalk Sequence + +```mermaid +sequenceDiagram + participant Sensors + participant DP as Detection Pipeline + participant ESLM as Edge SLM + participant OC as Operator Console + participant GW as AI Gateway + participant CM as Cognitive Mesh + participant LLM as Cloud LLM + + Sensors->>DP: raw detections + DP->>ESLM: normalized event + ESLM-->>DP: label + summary + confidence + + DP->>OC: local alert + + alt Below threshold + DP->>OC: local record + else Above threshold + DP->>GW: compressed bundle + GW->>CM: route to workflow + CM->>LLM: deep analysis + LLM-->>CM: interpretation + CM-->>GW: response + GW-->>OC: escalated advisory + end +``` + +### Design Intent + +- Label events +- Summarize meaning +- Suppress noise +- Conserve bandwidth +- Escalate only when justified + +--- + +## 5. C4 Narrative + +### System Context + +The platform provides a unified AI control plane for developer workflows, agent orchestration, and edge intelligence. + +### Container View + +| Layer | Description | +| --------------- | ---------------------------------------- | +| Control-plane | Classification, policy, routing, caching | +| Execution | Orchestration, tools, CI, edge | +| Shared services | Policy, retrieval, memory, telemetry | +| Model | SLM and LLM workloads | +| Edge | Local interpretation + escalation | + +### Dynamic Patterns + +| Pattern | System | Description | +| -------------- | --------------- | -------------------- | +| Gateway triage | AI Gateway | Selective escalation | +| Repo triage | CodeFlow | Remediation | +| Multi-agent | Cognitive Mesh | State compression | +| Tool loops | AgentKit Forge | Result distillation | +| Edge-first | PhoenixRooivalk | Threshold escalation | diff --git a/docs/architecture/reference/contracts.md b/docs/architecture/reference/contracts.md new file mode 100644 index 0000000..7379030 --- /dev/null +++ b/docs/architecture/reference/contracts.md @@ -0,0 +1,117 @@ +# Shared Contracts + +Standardized JSON schemas used across all 
systems for consistent telemetry, routing, and event handling. + +--- + +## RoutingDecision + +Emitted for every routing decision in the gateway. + +```json +{ + "intent": "string", + "complexity": "low|medium|high", + "risk_level": "low|medium|high|critical", + "policy_status": "allow|redact|deny|review", + "needs_tool": true, + "recommended_tier": "slm|llm", + "recommended_path": "direct|tool_first|mesh|escalate", + "confidence": 0.0 +} +``` + +| Field | Type | Description | +| ---------------- | ------- | --------------------------------------------- | +| intent | string | Classified intent (e.g., "ci_failure_triage") | +| complexity | enum | Estimated task complexity | +| risk_level | enum | Risk assessment | +| policy_status | enum | Policy engine result | +| needs_tool | boolean | Whether tool invocation is required | +| recommended_tier | enum | SLM or LLM recommendation | +| recommended_path | enum | Execution path recommendation | +| confidence | float | 0.0-1.0 confidence score | + +--- + +## ModelUsageEvent + +Emitted for every model invocation for cost tracking and quality analysis. + +```json +{ + "trace_id": "uuid", + "system": "ai-gateway", + "model_tier": "slm", + "model_name": "phi-4-mini", + "token_in": 320, + "token_out": 64, + "latency_ms": 41, + "estimated_cost": 0.0002 +} +``` + +| Field | Type | Description | +| -------------- | ------ | ---------------------------- | +| trace_id | uuid | Distributed trace identifier | +| system | string | Originating system | +| model_tier | enum | slm or llm | +| model_name | string | Specific model used | +| token_in | int | Input tokens | +| token_out | int | Output tokens | +| latency_ms | int | Response time | +| estimated_cost | float | Estimated cost in USD | + +--- + +## ToolExecutionEvent + +Emitted for every tool invocation through the Tool Broker. 
+ +```json +{ + "trace_id": "uuid", + "tool_name": "azure_cli", + "action": "monitor_query", + "success": true, + "latency_ms": 820, + "retry_count": 1 +} +``` + +| Field | Type | Description | +| ----------- | ------- | ---------------------------- | +| trace_id | uuid | Distributed trace identifier | +| tool_name | string | Tool identifier | +| action | string | Action performed | +| success | boolean | Execution outcome | +| latency_ms | int | Execution time | +| retry_count | int | Number of retries | + +--- + +## EdgeEscalationPacket + +Compressed escalation from PhoenixRooivalk edge nodes. + +```json +{ + "event_id": "uuid", + "site_id": "string", + "event_label": "rf_anomaly", + "summary": "Consumer quadcopter signature near perimeter", + "confidence": 0.77, + "telemetry_refs": ["blob://..."], + "requires_cloud_analysis": true +} +``` + +| Field | Type | Description | +| ----------------------- | ------- | --------------------------------- | +| event_id | uuid | Unique event identifier | +| site_id | string | Edge site identifier | +| event_label | string | Classified event type | +| summary | string | Compressed human-readable summary | +| confidence | float | 0.0-1.0 confidence score | +| telemetry_refs | array | Blob references for raw telemetry | +| requires_cloud_analysis | boolean | Needs LLM-level analysis | diff --git a/docs/architecture/reference/cross-system.md b/docs/architecture/reference/cross-system.md new file mode 100644 index 0000000..a02ba19 --- /dev/null +++ b/docs/architecture/reference/cross-system.md @@ -0,0 +1,503 @@ +# Cross-System Architecture + +This document describes the unified production architecture that separates: + +- Control plane vs execution plane +- SLM tier vs LLM tier +- Cloud vs edge +- Policy, observability, cache, and cost controls + +## Unified Production Architecture + +```mermaid +flowchart TB + subgraph Clients["Ingress Sources"] + U1[Users] + U2[Developers / PR Events] + U3[Apps / APIs] + U4[Operators / Mission 
Console] + U5[Sensors / Telemetry] + end + + subgraph Cloud["Cloud Control Plane"] + GW[AI Gateway] + + subgraph SLMCP["SLM Control Tier"] + S1[Intent + Complexity Classifier] + S2[Policy / PII / Secret / Injection Scan] + S3[Cost + Latency Router] + S4[Semantic Cache Admission / Reuse] + S5[Context Compressor] + S6[Escalation Judge] + end + + subgraph Orchestration["Orchestration Services"] + CM[Cognitive Mesh] + AF[AgentKit Forge] + CF[CodeFlow Engine] + end + + subgraph SharedServices["Shared Platform Services"] + POL[Policy Engine] + OBS[Observability / Telemetry / Audit] + BUD[Budget + Rate Controls] + MEM[State Store / Memory / Session Context] + VC[Vector Store / Retrieval] + TOOLS[Tools / APIs / CLI / GitHub / Azure / Kusto / Terraform] + end + + subgraph LLMZone["Deep Reasoning Tier"] + L1[Reasoning LLM] + L2[Code / Analysis LLM] + L3[Research / Synthesis LLM] + end + + subgraph Providers["Provider Layer"] + P1[OpenAI / Azure OpenAI] + P2[Other Model Providers] + P3[Local Hosted Models] + end + end + + subgraph Edge["PhoenixRooivalk Edge Plane"] + RP[Signal / Detection Pipeline] + ER1[Edge SLM: Event Labeler] + ER2[Edge SLM: Threat Summarizer] + ER3[Edge SLM: Alert Composer] + ER4[Edge SLM: Escalation Filter] + OC[Operator Console] + end + + U1 --> GW + U2 --> GW + U3 --> GW + U4 --> GW + U5 --> RP + + GW --> S1 + S1 --> S2 + S2 --> S3 + S3 --> S4 + S4 --> S5 + S5 --> S6 + + S2 --> POL + S3 --> BUD + S4 --> MEM + S5 --> VC + S6 --> OBS + + S6 --> CM + S6 --> AF + S6 --> CF + S6 --> L1 + S6 --> L2 + S6 --> L3 + + CM --> MEM + CM --> TOOLS + CM --> L1 + + AF --> MEM + AF --> TOOLS + AF --> L2 + + CF --> MEM + CF --> TOOLS + CF --> L2 + + L1 --> P1 + L2 --> P1 + L3 --> P2 + L2 --> P3 + + RP --> ER1 + ER1 --> ER2 + ER2 --> ER3 + ER3 --> OC + ER2 --> ER4 + ER4 --> GW +``` + +## System Responsibilities + +### AI Gateway + +The front door that owns: + +- Request intake +- Classification +- Safety checks +- Budget-aware routing +- Cache decisions +- Escalation 
decisions + +### Cognitive Mesh + +The orchestration brain for multi-agent work: + +- Specialist routing +- Decomposition +- Shared state coordination + +### AgentKit Forge + +The tool execution runtime: + +- Tool selection +- Parameter extraction +- Execution loops + +### CodeFlow Engine + +The CI/CD intelligence plane: + +- PR/diff triage +- CI failure bucketing +- Contract breakage interpretation + +### PhoenixRooivalk + +The edge interpretation plane: + +- Event labeling +- Operator alert generation +- Low-bandwidth summaries + +--- + +## Control Plane vs Execution Plane + +```mermaid +flowchart LR + subgraph CP["Control Plane"] + A[AI Gateway] + B[SLM Routing] + C[Policy Engine] + D[Budget Controls] + E[Observability] + F[State / Memory] + end + + subgraph EP["Execution Plane"] + G[Cognitive Mesh] + H[AgentKit Forge] + I[CodeFlow Engine] + J[LLM Providers] + K[Tools / APIs] + L[PhoenixRooivalk Edge] + end + + A --> B + B --> G + B --> H + B --> I + B --> J + G --> K + H --> K + I --> K + L --> A + C --> A + D --> A + E --> A + F --> G + F --> H + F --> I +``` + +--- + +## SLM Tier vs LLM Tier + +```mermaid +flowchart TD + IN[Request / Event / Telemetry] --> SLM[SLM Tier] + + subgraph SLMOps["SLM Responsibilities"] + S1[Classify] + S2[Screen] + S3[Route] + S4[Compress] + S5[Validate] + S6[Triage] + end + + SLM --> S1 + SLM --> S2 + SLM --> S3 + SLM --> S4 + SLM --> S5 + SLM --> S6 + + S3 --> D{Escalate?} + D -->|No| OUT1[Fast / Cheap Response] + D -->|Yes| LLM[LLM Tier] + + subgraph LLMOps["LLM Responsibilities"] + L1[Deep reasoning] + L2[Complex synthesis] + L3[Ambiguous tradeoffs] + L4[Novel plan generation] + end + + LLM --> L1 + LLM --> L2 + LLM --> L3 + LLM --> L4 + LLM --> OUT2[High-value response] +``` + +--- + +## Practical Request Path (AI Gateway) + +```mermaid +sequenceDiagram + participant C as Client + participant G as AI Gateway + participant S as SLM Layer + participant T as Tools + participant M as Mesh + participant L as LLM + participant O as 
Observability + + C->>G: Request + G->>S: classify + scan + estimate complexity + S-->>G: route decision + confidence + G->>O: log request metadata + + alt Simple + G-->>C: direct low-cost response + else Tool-first + G->>M: dispatch task + M->>T: execute tools + T-->>M: tool results + M->>S: compress results + S-->>M: compact state + M-->>C: response + else Complex + G->>L: escalate with compact context + L-->>G: deep reasoning output + G-->>C: final response + end +``` + +--- + +## CodeFlow Engine CI Path + +```mermaid +flowchart TD + PR[PR / Push / Issue Event] --> C1[SLM Diff Classifier] + C1 --> C2[SLM Risk Scorer] + C2 --> C3[SLM Test Impact Predictor] + + C3 --> D{Path} + D -->|low risk| F1[Fast checks] + D -->|high risk| F2[Full CI / security / contract tests] + D -->|uncertain| F3[LLM or human review gate] + + F1 --> L[CI Logs] + F2 --> L + F3 --> L + + L --> T1[SLM Failure Triage] + T1 --> T2[SLM Comment Draft / Routing] + T2 --> T3[Action: retry / assign / block / suggest fix] +``` + +--- + +## AgentKit Forge Tool Loop + +```mermaid +flowchart LR + A[Task] --> B[SLM Tool Selector] + B --> C[Select Tool + Args] + + C --> D1[GitHub] + C --> D2[Azure] + C --> D3[Terraform] + C --> D4[Kusto] + C --> D5[Docs / Files] + + D1 --> E[SLM Result Compressor] + D2 --> E + D3 --> E + D4 --> E + D5 --> E + + E --> F{Enough?} + F -->|yes| G[Return answer] + F -->|no| H[Escalate to LLM / Mesh] +``` + +--- + +## PhoenixRooivalk Edge Path + +```mermaid +sequenceDiagram + participant S as Sensors + participant P as Detection Pipeline + participant E as Edge SLM + participant O as Operator Console + participant C as Cloud Gateway + + S->>P: RF / EO / radar / telemetry + P->>E: normalized event packet + E-->>P: label + summary + confidence + P->>O: operator alert + + alt threshold exceeded + P->>C: send compressed evidence bundle + else local-only event + P->>O: keep local record + end +``` + +--- + +## Layer Responsibilities + +| Layer | Primary | SLM Role | LLM Role | +| 
------------- | ------------------------------ | ----------------------- | -------------------- | +| Edge | PhoenixRooivalk | Reports only | None | +| Gateway | AI Gateway | Routing, security, cost | Complex reasoning | +| Orchestration | Cognitive Mesh, AgentKit Forge | Routing, tools | Synthesis | +| Intelligence | CodeFlow Engine | Triage | None | +| Synthesis | LLM Layer | None | Reasoning, synthesis | + +--- + +## Ownership Boundaries + +### AI Gateway owns + +- Ingress control +- Policy enforcement +- Routing +- Cost governance +- Model/provider abstraction +- Shared telemetry + +### Cognitive Mesh owns + +- Multi-agent coordination +- Task decomposition +- State fusion +- Escalation into deep synthesis + +### AgentKit Forge owns + +- Tool loops +- Action execution +- Extraction +- Retry/fallback behavior + +### CodeFlow Engine owns + +- Software delivery intelligence +- Repo event interpretation +- CI analysis +- Developer feedback automation + +### PhoenixRooivalk owns + +- Edge summarization +- Local alerting +- Compressed event escalation + +--- + +## Implementation Phases + +### Phase 1 — Gateway-first + +Build SLM control plane: intent classifier, policy scanner, budget router, cache gate, escalation judge + +### Phase 2 — CodeFlow Engine + +Add SLMs: diff classifier, PR risk scorer, CI failure bucketer + +### Phase 3 — AgentKit Forge + +Optimize tool loops: tool selector, arg extractor, result compressor + +### Phase 4 — Cognitive Mesh + +Add: specialist router, decomposer, state manager + +### Phase 5 — PhoenixRooivalk + +Deploy edge SLMs: event label, alert text, escalation filter + +--- + +## Shared Telemetry Schema + +```json +{ + "trace_id": "uuid", + "system": "ai-gateway|cognitive-mesh|codeflow-engine|agentkit-forge|phoenixrooivalk", + "stage": "classify|route|tool_call|llm_escalation|edge_alert", + "model_tier": "slm|llm", + "model_name": "example-model", + "decision": "allow|block|tool_first|escalate|local_only", + "confidence": 0.92, + 
"latency_ms": 83, + "token_in": 540, + "token_out": 96, + "estimated_cost": 0.0014, + "policy_flags": ["pii:none", "secret:none"], + "outcome": "success" +} +``` + +--- + +## Production Rules + +### Escalate to LLM when: + +- Confidence below threshold +- Ambiguity above threshold +- Multiple specialists disagree +- Tool results conflict +- Output is user-facing and high-stakes +- Architecture/tradeoff reasoning required + +### Stay in SLM path when: + +- Task is classification +- Task is screening +- Task is extraction +- Task is summarization +- Task is repetitive CI triage +- Task is edge-local operator support + +--- + +## C4-Style Architecture + +For detailed C4-style diagrams including: + +- System Context diagram +- Container diagram +- CodeFlow sequence +- PhoenixRooivalk edge-to-cloud sequence + +See [c4-architecture.md](c4-architecture.md) + +--- + +## Bottom Line + +The most practical target architecture: + +- **AI Gateway** as the centralized SLM control plane +- **Cognitive Mesh / AgentKit Forge / CodeFlow Engine** as execution systems +- **PhoenixRooivalk** as edge plane with local SLM autonomy +- **LLMs** reserved for synthesis, ambiguity, and hard reasoning + +> Gateway governs. SLMs triage and steer. Specialist systems execute. LLMs arbitrate the hard cases. Edge stays local unless escalation is justified. diff --git a/docs/architecture/reference/dashboards.md b/docs/architecture/reference/dashboards.md new file mode 100644 index 0000000..9e70f36 --- /dev/null +++ b/docs/architecture/reference/dashboards.md @@ -0,0 +1,17 @@ +# Recommended Dashboards + +Grafana/ADX dashboard recommendations for operational visibility. 
+ +--- + +## Dashboard Pack + +Split Grafana/ADX dashboards into these boards: + +| Dashboard | Metrics | +| -------------------- | ------------------------------------------------------------------------------------- | +| **Executive / Cost** | Total requests, SLM vs LLM ratio, cost by route, cost per outcome, escalation rate | +| **Reliability** | Error rate, tool failure rate, retry hotspots, provider latency, queue backlog | +| **Governance** | Policy blocks, redaction counts, provider data-boundary usage, audit completeness | +| **CodeFlow** | PR risk distribution, CI triage buckets, contract-break suspects, feedback usefulness | +| **Rooivalk** | Detections vs alerts, local vs escalated, site alert volume, edge latency | diff --git a/docs/architecture/reference/deployment-observability.md b/docs/architecture/reference/deployment-observability.md new file mode 100644 index 0000000..f237e21 --- /dev/null +++ b/docs/architecture/reference/deployment-observability.md @@ -0,0 +1,383 @@ +# Deployment, Trust Boundaries & Observability + +This set extends the C4 view into operational architecture including deployment, security boundaries, and telemetry. + +--- + +## 1. Deployment Diagram + +This is the practical cloud/edge deployment shape for your stack. 
+ +```mermaid +flowchart TB + subgraph Internet["Public / External"] + U1[Users / Browsers / Chat Clients] + U2[GitHub Webhooks] + U3[External APIs / Apps] + MP[Model Providers] + end + + subgraph Azure["Azure Subscription"] + DNS[Azure DNS / Front Door / App Gateway] + KV[Key Vault] + LAW[Log Analytics] + ADX[Azure Data Explorer / Kusto] + BLOB[Blob Storage] + REDIS[Redis / Cache] + DB[Postgres / Cosmos / State DB] + AISEARCH[Vector Store / AI Search] + GRAF[Grafana] + BUS[Service Bus / Queue] + MON[Azure Monitor / App Insights] + + subgraph Runtime["Runtime Plane"] + GW[AI Gateway] + CM[Cognitive Mesh] + AKF[AgentKit Forge] + CFE[CodeFlow Engine] + TB[Tool Broker] + OPA[Policy Engine] + end + + subgraph Workers["Background / Event Workers"] + W1[PR / CI Worker] + W2[Agent Task Worker] + W3[Telemetry Ingest Worker] + W4[Cost / Audit Aggregator] + end + + subgraph Models["Hosted Model Zone"] + SLM[SLM Serving Pool] + LLM[LLM Adapter / Provider Proxy] + end + end + + subgraph Edge["PhoenixRooivalk Edge Sites"] + SENS[RF / EO / Radar / Telemetry Sensors] + EDGEPIPE[Detection Pipeline] + E1[Edge SLM Event Labeler] + E2[Edge SLM Summarizer] + E3[Edge Escalation Filter] + OPC[Operator Console] + SYNC[Secure Sync Agent] + end + + U1 --> DNS + U2 --> DNS + U3 --> DNS + DNS --> GW + + GW --> REDIS + GW --> DB + GW --> AISEARCH + GW --> KV + GW --> OPA + GW --> TB + GW --> SLM + GW --> LLM + GW --> MON + + CM --> DB + CM --> BUS + CM --> AISEARCH + CM --> TB + CM --> SLM + CM --> LLM + CM --> MON + + AKF --> DB + AKF --> BUS + AKF --> TB + AKF --> SLM + AKF --> LLM + AKF --> MON + + CFE --> DB + CFE --> BUS + CFE --> TB + CFE --> SLM + CFE --> LLM + CFE --> MON + + W1 --> CFE + W2 --> AKF + W3 --> GW + W4 --> ADX + + MON --> LAW + LAW --> ADX + BLOB --> ADX + ADX --> GRAF + + MP --> LLM + + SENS --> EDGEPIPE + EDGEPIPE --> E1 + E1 --> E2 + E2 --> E3 + E2 --> OPC + E3 --> SYNC + SYNC --> GW +``` + +### Practical Reading of Deployment + +| Zone | Components | Purpose | +| 
-------------------- | --------------------------------------------------------- | ----------------------- | +| **Front door** | Azure DNS / Front Door / App Gateway | Ingress and routing | +| **Shared backing** | Key Vault, Redis, Postgres/Cosmos, AI Search, Service Bus | State, caching, secrets | +| **Runtime services** | AI Gateway, Cognitive Mesh, AgentKit Forge, CodeFlow | Core execution | +| **Workers** | PR/CI, Agent Task, Telemetry, Cost Aggregators | Background processing | +| **Model zone** | SLM Pool, LLM Adapter | AI inference | +| **Edge** | Detection Pipeline, Edge SLMs, Operator Console | Local operation | + +--- + +## 2. Trust Boundary Diagram + +This is the security-relevant segmentation. + +```mermaid +flowchart LR + subgraph TB1["Boundary 1: Public / Untrusted"] + A[Users / Browsers] + B[GitHub Webhooks] + C[External Apps] + D[Internet Traffic] + end + + subgraph TB2["Boundary 2: Controlled Ingress"] + E[Front Door / API Gateway / WAF] + F[AI Gateway] + end + + subgraph TB3["Boundary 3: Internal Control Plane"] + G[Policy Engine] + H[Budget / Rate Controls] + I[Session / State Store] + J[Semantic Cache] + K[Observability / Audit] + end + + subgraph TB4["Boundary 4: Internal Execution Plane"] + L[Cognitive Mesh] + M[AgentKit Forge] + N[CodeFlow Engine] + O[Tool Broker] + end + + subgraph TB5["Boundary 5: Sensitive Integration Zone"] + P[Key Vault] + Q[Azure APIs] + R[GitHub APIs] + S[Kusto / Terraform / Internal Tools] + end + + subgraph TB6["Boundary 6: External Model Providers"] + T[LLM Providers] + U[Hosted / External SLM Providers] + end + + subgraph TB7["Boundary 7: Edge / Field Environment"] + V[PhoenixRooivalk Edge Node] + W[Sensors] + X[Operator Console] + end + + A --> E + B --> E + C --> E + D --> E + E --> F + + F --> G + F --> H + F --> I + F --> J + F --> K + + F --> L + F --> M + F --> N + L --> O + M --> O + N --> O + + O --> Q + O --> R + O --> S + F --> P + L --> P + M --> P + N --> P + + F --> T + F --> U + L --> T + M --> T + N 
--> T + + W --> V + V --> X + V --> F +``` + +### Security Interpretation + +| Boundary | Description | +| --------- | ----------------------------------------------------------------------------------------- | +| **1 → 2** | Treat all inbound as hostile until authenticated, rate-limited, schema-validated, logged | +| **2 → 3** | AI Gateway is the only entry into internal AI control plane | +| **3 → 4** | Control-plane services decide policy, routing, cost, escalation | +| **4 → 5** | Sensitive zone: credentials, infra mutation, production APIs, write actions | +| **6** | External providers are semi-trusted - apply output scanning and redaction | +| **7** | Edge nodes are partially disconnected - need signed software, local audit, encrypted sync | + +--- + +## 3. Observability Architecture + +This is the unified telemetry design across all systems. + +```mermaid +flowchart TB + subgraph Producers["Telemetry Producers"] + P1[AI Gateway] + P2[Cognitive Mesh] + P3[AgentKit Forge] + P4[CodeFlow Engine] + P5[PhoenixRooivalk Edge] + P6[Tool Broker] + P7[Policy Engine] + end + + subgraph Signals["Signal Types"] + S1[Request / Response Logs] + S2[Routing Decisions] + S3[Policy Events] + S4[Tool Calls] + S5[Model Usage] + S6[CI / PR Events] + S7[Edge Detection Events] + S8[Cost / Token Metrics] + S9[Audit Trail] + end + + subgraph Ingest["Ingestion"] + I1[OpenTelemetry Collectors] + I2[Azure Monitor / App Insights] + I3[Blob Export] + I4[Log Analytics] + end + + subgraph Analytics["Analytics / Query"] + A1[Azure Data Explorer / Kusto] + A2[Cost Aggregates] + A3[Decision Quality Metrics] + A4[Security / Audit Views] + end + + subgraph Viz["Visualization / Alerting"] + V1[Grafana Dashboards] + V2[Alerts / On-call] + V3[Ops Runbooks] + V4[Executive Cost Views] + end + + P1 --> S1 + P1 --> S2 + P1 --> S5 + P1 --> S8 + P1 --> S9 + + P2 --> S2 + P2 --> S4 + P2 --> S5 + P2 --> S9 + + P3 --> S4 + P3 --> S5 + P3 --> S9 + + P4 --> S6 + P4 --> S2 + P4 --> S5 + P4 --> S9 + + P5 --> S7 
+ P5 --> S2 + P5 --> S9 + + P6 --> S4 + P7 --> S3 + + S1 --> I1 + S2 --> I1 + S3 --> I1 + S4 --> I1 + S5 --> I2 + S6 --> I2 + S7 --> I3 + S8 --> I2 + S9 --> I4 + + I1 --> A1 + I2 --> A1 + I3 --> A1 + I4 --> A1 + + A1 --> A2 + A1 --> A3 + A1 --> A4 + + A2 --> V1 + A3 --> V1 + A4 --> V1 + V1 --> V2 + V1 --> V3 + V1 --> V4 +``` + +### What to Measure + +#### Gateway metrics + +- Requests by route +- SLM vs LLM escalation rate +- Confidence distribution +- Token in/out averages +- Semantic cache hit rate +- Refusal/block counts +- Provider latency/error rate + +#### Cognitive Mesh metrics + +- Route-to-specialist distribution +- Decomposition count per task +- Summary compression ratio +- Multi-agent disagreement rate +- Escalation rate to LLM synthesis + +#### AgentKit Forge metrics + +- Tool selection accuracy +- Retry counts +- Fallback frequency +- Avg tool-loop depth +- Tool output compression ratio + +#### CodeFlow Engine metrics + +- PR classification distribution +- False positive/negative on risk tier +- CI failure bucket frequency +- Contract-break detection precision +- Comment usefulness feedback + +#### PhoenixRooivalk metrics + +- Local-only vs escalated events +- Edge summary latency +- Alert volume per session +- Signal-to-alert compression ratio +- Dropped/deferred syncs diff --git a/docs/architecture/reference/matrix-agentkit.md b/docs/architecture/reference/matrix-agentkit.md new file mode 100644 index 0000000..ef3cd79 --- /dev/null +++ b/docs/architecture/reference/matrix-agentkit.md @@ -0,0 +1,110 @@ +# AgentKit Forge SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ----------------------- | ------ | ------------------------------------------------- | +| `/slm/select-tool` | POST | Maps request to GitHub/Azure/Terraform/Kusto/docs | +| `/slm/filter-context` | POST | Selects only relevant memory/state | +| `/slm/estimate-budget` | POST | Predicts steps, token tier, tool-first viability | +| `/slm/check-escalation` | POST 
| Decides whether LLM planning is needed            |

## Service Boundaries

```mermaid
flowchart TD
    A[Agent Runtime] --> B[Task Intake]
    B --> C[SLM Tool Selector]
    C --> D{Tool / Reason / Direct}
    D --> E[Tool Adapter Layer]
    D --> F[Direct Response]
    D --> G[LLM Planner]
    E --> H[State Store]
    G --> H
    H --> I[SLM Context Filter]
    I --> J[Next Action]
```

## Example Responses

**select-tool:**

```json
{
  "action_mode": "tool",
  "tool": "azure_cli",
  "operation_family": "cost_management",
  "arguments_hint": { "service": "foundry", "time_window": "last_30_days" },
  "confidence": 0.89
}
```

**estimate-budget:**

```json
{
  "predicted_steps": 4,
  "token_cost_tier": "medium",
  "tool_first_recommended": true,
  "llm_needed": false,
  "confidence": 0.81
}
```

## Contract Shapes

```typescript
interface SelectToolOutput {
  action_mode: "tool" | "reason" | "direct";
  tool: "github" | "azure_cli" | "terraform" | "kusto" | "docs_search";
  operation_family: string;
  arguments_hint: Record<string, unknown>;
  confidence: number;
}

interface EstimateBudgetOutput {
  predicted_steps: number;
  token_cost_tier: "low" | "medium" | "high";
  tool_first_recommended: boolean;
  llm_needed: boolean;
  confidence: number;
}
```

## Telemetry Fields

| Field               | Type    | Description        |
| ------------------- | ------- | ------------------ |
| `agent_run_id`      | uuid    | Unique run ID      |
| `selected_tool`     | string  | Tool selected      |
| `action_mode`       | string  | tool/reason/direct |
| `budget_tier`       | string  | Cost tier          |
| `predicted_steps`   | number  | Steps predicted    |
| `escalated_to_llm`  | boolean | LLM invoked        |
| `compression_ratio` | number  | Context reduced    |

## Fallback Rules

| Condition                     | Action                      |
| ----------------------------- | --------------------------- |
| No tool confidence >= 0.80    | Don't execute automatically |
| Context filter low            | Preserve more context       |
| Budget low but ambiguity high | Escalate to planner         |
| Tool failure                  | 
Classify before retry | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + tool_selection: { direct_execute: 0.85, require_confirm: 0.7 }, + context_filtering: { aggressive: 0.85, conservative: 0.78 }, + escalation_check: { continue_tools: 0.8, llm_planning: 0.65 }, + budget_estimate: { reliable: 0.75, uncertain: 0.6 }, +}; +``` + +| Threshold | Action | +| --------- | -------------------- | +| >= 0.85 | Direct execution | +| 0.70-0.84 | Require confirmation | +| < 0.70 | Decline / clarify | diff --git a/docs/architecture/reference/matrix-codeflow.md b/docs/architecture/reference/matrix-codeflow.md new file mode 100644 index 0000000..90d2fe7 --- /dev/null +++ b/docs/architecture/reference/matrix-codeflow.md @@ -0,0 +1,110 @@ +# CodeFlow Engine SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ---------------------------- | ------ | --------------------------------------------------------------- | +| `/slm/classify-change` | POST | Determines: docs/code/config/infra/security, risk, blast radius | +| `/slm/suggest-pipeline` | POST | Fast path vs full path | +| `/slm/summarize-failure` | POST | Turns CI output into actionable summary | +| `/slm/release-note-fragment` | POST | Generates structured change summary | + +## Service Boundaries + +```mermaid +flowchart TD + A[GitHub Event] --> B[Diff / Metadata Collector] + B --> C[SLM Change Classifier] + C --> D[Pipeline Policy Engine] + D --> E[CI Path Selection] + E --> F[Workflow Execution] + F --> G[SLM Failure Summarizer] + G --> H[PR Comment / Status] +``` + +## Example Responses + +**classify-change:** + +```json +{ + "change_type": "infra", + "risk": "high", + "blast_radius": "shared_environment", + "requires_contract_validation": false, + "requires_security_scan": true, + "recommended_pipeline": "full", + "confidence": 0.91 +} +``` + +**summarize-failure:** + +```json +{ + "failure_type": "test_failure", + "subtype": "integration_environment", + "retryable": 
true, + "summary": "Integration tests failed due to unreachable dependent service.", + "recommended_next_action": "retry once and verify service container health", + "confidence": 0.83 +} +``` + +## Contract Shapes + +```typescript +interface ClassifyChangeOutput { + change_type: "docs" | "code" | "config" | "schema" | "infra" | "security"; + risk: "low" | "medium" | "high" | "critical"; + blast_radius: "local_only" | "shared_environment" | "production"; + requires_security_scan: boolean; + recommended_pipeline: "fast" | "full"; + confidence: number; +} + +interface SummarizeFailureOutput { + failure_type: string; + retryable: boolean; + summary: string; + recommended_next_action: string; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| ------------------------------- | ------ | ------------------- | +| `repo` | string | Repository | +| `pr_number` | number | PR number | +| `change_type` | string | Classified type | +| `risk` | string | Risk level | +| `pipeline_selected` | string | Path chosen | +| `slm_classification_latency_ms` | number | Classification time | +| `workflow_duration_ms` | number | Total duration | + +## Fallback Rules + +| Condition | Action | +| ----------------------------------- | ------------------------------------ | +| Never skip mandatory tests from SLM | Hard policy enforcement | +| High-risk + low confidence | Choose stricter pipeline | +| Classifier unavailable | Default conservative path | +| Failure uncertain | No destructive reruns without policy | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + change_classification: { direct_use: 0.88, manual_review: 0.75 }, + pipeline_suggestion: { direct_path: 0.85, force_full_path: 0.7 }, + failure_summary: { direct_use: 0.8, require_human: 0.65 }, +}; +``` + +| Threshold | Action | +| --------- | ----------------- | +| >= 0.88 | Direct use | +| 0.75-0.87 | Verify with rules | +| < 0.75 | Manual review | diff --git 
a/docs/architecture/reference/matrix-cognitive-mesh.md b/docs/architecture/reference/matrix-cognitive-mesh.md new file mode 100644 index 0000000..5d11f33 --- /dev/null +++ b/docs/architecture/reference/matrix-cognitive-mesh.md @@ -0,0 +1,112 @@ +# Cognitive Mesh SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ------------------------ | ------ | ------------------------------------------- | +| `/slm/decompose-task` | POST | Break complex request into agent tasks | +| `/slm/route-agent` | POST | Route task to appropriate specialist agent | +| `/slm/compress-context` | POST | Compress long context for agent consumption | +| `/slm/validate-response` | POST | Validate agent response coherence | + +## Service Boundaries + +```mermaid +flowchart TD + A[Mesh Entry] --> B[SLM Router] + B --> C{Single or Multi-Agent?} + C -->|Single| D[Specialist Agent] + C -->|Multi| E[SLM Decomposer] + E --> F[Task Graph] + F --> G[Specialist Agents] + D --> H[Evidence Store] + G --> H + H --> I[SLM Compressor] + I --> J[LLM Synthesizer] +``` + +## Example Responses + +**route-agent:** + +```json +{ + "mode": "multi_agent", + "agents": ["infra_agent", "cost_agent", "security_agent"], + "priority": "normal", + "reason_codes": ["azure", "cost", "security_terms"], + "confidence": 0.87 +} +``` + +**decompose-task:** + +```json +{ + "subtasks": [ + { "id": "t1", "agent": "infra_agent", "goal": "inventory deployed Azure resources" }, + { "id": "t2", "agent": "cost_agent", "goal": "identify cost spikes" }, + { "id": "t3", "agent": "security_agent", "goal": "check for unauthorized usage" } + ], + "confidence": 0.82 +} +``` + +## Contract Shapes + +```typescript +interface RouteAgentOutput { + target_agent: string; + mode: "single_agent" | "parallel_agents" | "sequential"; + escalation_required: boolean; + fallback_agent?: string; + confidence: number; +} + +interface DecomposeTaskOutput { + tasks: { + id: string; + description: string; + agent_type: string; + 
dependencies: string[]; + }[]; + estimated_complexity: "low" | "medium" | "high"; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| --------------------- | -------- | ------------------- | +| `mesh_run_id` | uuid | Unique execution ID | +| `route_mode` | string | single/multi agent | +| `selected_agents` | string[] | Agents selected | +| `decomposition_count` | number | Subtasks created | +| `compression_ratio` | number | Tokens reduced | +| `escalated_to_llm` | boolean | LLM used | + +## Fallback Rules + +| Condition | Action | +| -------------------- | ------------------------- | +| Route confidence low | Send to orchestration LLM | +| Decomposition low | Single-agent fallback | +| Compression low | Pass fuller context | +| No agent matches | Default to "research" | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + agent_routing: { direct_route: 0.85, verify_with_rules: 0.7 }, + task_decomposition: { direct_decompose: 0.8, single_agent_fallback: 0.65 }, + context_compression: { aggressive: 0.85, conservative: 0.78 }, +}; +``` + +| Threshold | Action | +| --------- | ----------------- | +| >= 0.85 | Direct routing | +| 0.70-0.84 | Verify with rules | +| < 0.70 | Escalate to LLM | diff --git a/docs/architecture/reference/matrix-gateway.md b/docs/architecture/reference/matrix-gateway.md new file mode 100644 index 0000000..4551887 --- /dev/null +++ b/docs/architecture/reference/matrix-gateway.md @@ -0,0 +1,111 @@ +# AI Gateway SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ------------------------ | ------ | ----------------------------------------------------- | +| `/slm/classify-request` | POST | Infer intent, estimate complexity, detect toolability | +| `/slm/policy-screen` | POST | PII/secrets/prompt injection scan, tenant policy fit | +| `/slm/post-tag-response` | POST | Normalize telemetry tags, classify business category | + +## Service Boundaries + +```mermaid 
+flowchart TD + A[Gateway API] --> B[Policy Engine] + B --> C[SLM Router Service] + C --> D[Model Selection Engine] + D --> E[Provider Adapter] + E --> F[LLM / SLM / Tool] + F --> G[Response Validator] + G --> H[Telemetry + Billing] +``` + +## Example Request/Response + +**Request:** + +```json +{ + "tenant_id": "phoenixvc-prod", + "user_input": "Review this PR and tell me if the API contract changed.", + "context": { + "channel": "web", + "has_files": true, + "history_len": 7 + } +} +``` + +**Response:** + +```json +{ + "intent": "code_review", + "complexity": "medium", + "tool_candidate": true, + "recommended_target": "codeflow-engine", + "recommended_model_tier": "small", + "escalation_required": false, + "confidence": 0.93 +} +``` + +## Contract Shapes + +```typescript +interface ClassifyRequestOutput { + request_id: string; + label: "code_review" | "chat" | "analysis" | "tool_invocation" | "embedding"; + confidence: number; + complexity: "low" | "medium" | "high"; + tool_candidate: boolean; + recommended_tier: "slm" | "small" | "large"; + cacheable: boolean; +} + +interface PolicyScreenOutput { + allowed: boolean; + risk_level: "low" | "medium" | "high" | "critical"; + risk_categories: string[]; + action: "allow" | "rewrite" | "block" | "escalate"; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| ------------------- | ------- | ------------------- | +| `tenant_id` | string | Tenant identifier | +| `slm_latency_ms` | number | SLM processing time | +| `intent` | string | Classified intent | +| `complexity` | string | Complexity level | +| `risk_level` | string | Risk assessment | +| `tool_candidate` | boolean | Tool recommendation | +| `escalated_to_llm` | boolean | Whether escalated | +| `cost_estimate_usd` | number | Estimated cost | + +## Fallback Rules + +| Condition | Action | +| -------------------------------- | ---------------------- | +| `policy-screen.allowed == false` | Block or redact | +| `confidence < 0.75` 
| Escalate to LLM | +| Tool suggested but no mapping | Send to general LLM | +| Tagging fails | Mark telemetry partial | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + intent_classification: { direct_route: 0.9, verify_with_rules: 0.75 }, + policy: { block_immediately: ["critical_secrets"], escalate_to_review: 0.6 }, +}; +``` + +| Threshold | Action | +| --------- | ----------------- | +| >= 0.90 | Direct routing | +| 0.75-0.89 | Verify with rules | +| < 0.75 | Escalate to LLM | diff --git a/docs/architecture/reference/matrix-mystira.md b/docs/architecture/reference/matrix-mystira.md new file mode 100644 index 0000000..ae81461 --- /dev/null +++ b/docs/architecture/reference/matrix-mystira.md @@ -0,0 +1,137 @@ +# Mystira SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ---------------------------- | ------ | ---------------------------------------------------------------- | +| `/slm/classify-session` | POST | Determines: bedtime/educational/adventure/branching/continuation | +| `/slm/check-safety-agefit` | POST | Ensures age appropriateness, tone, blocked content | +| `/slm/check-continuity` | POST | Maintains character consistency, world rules | +| `/slm/shape-image-prompt` | POST | Converts story scene to safe, style-consistent prompt | +| `/slm/compress-story-memory` | POST | Keeps only relevant story state | + +## Service Boundaries + +```mermaid +flowchart TD + A[User / Parent / Educator Input] --> B[Story Session Manager] + B --> C[SLM Session Classifier] + C --> D[Safety + Age Fit] + D --> E{Simple or Creative} + E -->|Simple| F[Template / Guided Story Engine] + E -->|Creative| G[LLM Narrative Engine] + G --> H[SLM Continuity + Reading Level Pass] + F --> H + H --> I[Story Output] + H --> J[Image Prompt Shaper] +``` + +## Example Responses + +**check-safety-agefit:** + +```json +{ + "allowed": true, + "age_band": "8-10", + "tone": "gentle_adventure", + "rewrite_needed": false, + "blocked_categories": 
[], + "confidence": 0.94 +} +``` + +**check-continuity:** + +```json +{ + "consistent": true, + "issues": [], + "retained_story_facts": [ + "main character is Luma", + "forest companion is a silver fox", + "setting is moonlit valley" + ], + "confidence": 0.86 +} +``` + +**shape-image-prompt:** + +```json +{ + "prompt": "A child-safe illustrated moonlit valley scene with Luma and a silver fox, soft wonder, readable composition, no frightening imagery.", + "safety_checked": true, + "style_profile": "mystira_storybook_v1", + "confidence": 0.9 +} +``` + +## Contract Shapes + +```typescript +interface ClassifySessionOutput { + story_type: "bedtime" | "educational" | "adventure" | "branching" | "continuation"; + age_band: string; + is_interactive: boolean; + needs_images: boolean; + curriculum_tags: string[]; + confidence: number; +} + +interface CheckSafetyAgefitOutput { + allowed: boolean; + age_band: string; + tone: string; + rewrite_needed: boolean; + blocked_categories: string[]; + confidence: number; +} + +interface ShapeImagePromptOutput { + prompt: string; + safety_checked: boolean; + style_profile: string; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| ----------------------- | ------- | ---------------- | +| `session_id` | uuid | Session ID | +| `story_mode` | string | Classification | +| `age_band` | string | Target age | +| `safety_action` | string | Action taken | +| `rewrite_applied` | boolean | Rewritten | +| `continuity_check_used` | boolean | Validated | +| `image_prompt_shaped` | boolean | Prompt generated | +| `slm_cost` | number | SLM cost | +| `llm_cost` | number | LLM cost | + +## Fallback Rules + +| Condition | Action | +| ------------------ | ------------------------ | +| Safety uncertainty | Safe rewrite or refuse | +| Continuity low | Pass more history to LLM | +| Image shaping low | Conservative template | +| Age-fit uncertain | Default younger-safe | + +## Configurable Thresholds + +```typescript 
+const DEFAULT_THRESHOLDS = { + session_classification: { direct_use: 0.88, require_review: 0.75 }, + safety_agefit: { direct_allow: 0.92, require_rewrite: 0.8, block: 0.8 }, + continuity: { direct_use: 0.82, pass_to_llm: 0.7 }, + image_prompt: { direct_use: 0.88, conservative: 0.75 }, +}; +``` + +| Threshold | Action | +| --------- | ------------- | +| >= 0.92 | Direct allow | +| 0.80-0.91 | Rewrite/adapt | +| < 0.80 | Block content | diff --git a/docs/architecture/reference/matrix-rooivalk.md b/docs/architecture/reference/matrix-rooivalk.md new file mode 100644 index 0000000..29f20da --- /dev/null +++ b/docs/architecture/reference/matrix-rooivalk.md @@ -0,0 +1,120 @@ +# PhoenixRooivalk SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ------------------------------- | ------ | ---------------------------------------------------- | +| `/slm/interpret-event` | POST | Turns fused detection into operator-readable summary | +| `/slm/suggest-sop` | POST | Maps event type to likely SOP references | +| `/slm/condense-mission-log` | POST | Produces incident record | +| `/slm/classify-incident-report` | POST | Creates structured post-event label set | + +## Service Boundaries + +```mermaid +flowchart TD + A[RF / Radar / EO / IR / Rules] --> B[Fusion + Threat Scoring] + B --> C[Hard Decision Layer] + B --> D[SLM Interpretation Layer] + D --> E[Operator Console Summary] + D --> F[SOP Suggestions] + D --> G[Mission Narrative] + C --> H[Manual Review / Control Path] +``` + +## CRITICAL: SLM is for Reporting Only + +``` +┌─────────────────────────────────────────────────────────┐ +│ IMPORTANT - SAFETY BOUNDARY │ +├─────────────────────────────────────────────────────────┤ +│ Hard Decision Layer must NOT depend on free-form SLM │ +│ │ +│ SLM output is for OBSERVATION and REPORTING only: │ +│ • Operator summaries │ +│ • SOP suggestions (non-binding) │ +│ • Mission log condensation │ +│ │ +│ SLM must NEVER be used for: │ +│ • Autonomous threat response │ 
+│ • Access control decisions │ +│ • Resource isolation actions │ +│ • Any kinetic or hard control actions │ +└─────────────────────────────────────────────────────────┘ +``` + +## Example Responses + +**interpret-event:** + +```json +{ + "title": "Low-altitude inbound contact", + "facts": ["sector north-east", "altitude 35m", "consumer quadcopter RF profile"], + "inferences": ["possible perimeter reconnaissance"], + "operator_summary": "Inbound low-altitude contact detected from north-east sector.", + "confidence": 0.77 +} +``` + +**suggest-sop:** + +```json +{ + "recommended_sops": ["SOP-12 Verify EO feed", "SOP-21 Raise perimeter alert state"], + "confidence": 0.74 +} +``` + +## Contract Shapes + +```typescript +interface InterpretEventOutput { + title: string; + facts: string[]; + inferences: string[]; + operator_summary: string; + confidence: number; +} + +interface SuggestSopOutput { + recommended_sops: string[]; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| ------------------------- | -------- | ------------------ | +| `incident_id` | uuid | Incident ID | +| `sensor_fusion_version` | string | Fusion version | +| `threat_score` | number | Calculated score | +| `slm_interpretation_used` | boolean | SLM invoked | +| `sop_suggestions` | string[] | SOPs suggested | +| `human_acknowledged` | boolean | Human acknowledged | +| `offline_mode` | boolean | Offline mode | + +## Fallback Rules + +| Condition | Action | +| ----------------------------- | ---------------------------- | +| Interpretation low confidence | Show facts only | +| SOP low confidence | "Manual SOP lookup required" | +| Edge model unavailable | Use non-LLM summaries | +| SOP generated | NEVER pass to control path | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + operator_summary: { direct_use: 0.8, facts_only: 0.65 }, + sop_suggestion: { direct_suggest: 0.78, manual_lookup: 0.65 }, +}; +``` + +| Threshold | Action | +| 
--------- | ---------------------------- | +| >= 0.80 | Full summary with inferences | +| 0.65-0.79 | Facts only, no inferences | +| < 0.65 | Human analysis | diff --git a/docs/architecture/reference/operations-patterns.md b/docs/architecture/reference/operations-patterns.md new file mode 100644 index 0000000..01e9b84 --- /dev/null +++ b/docs/architecture/reference/operations-patterns.md @@ -0,0 +1,152 @@ +# Operations Patterns + +Operational patterns including SLM→LLM decision flows, ownership maps, and implementation guidance. + +--- + +## SLM → LLM Decision Flow + +Production handoff logic for routing between SLM and LLM tiers. + +```mermaid +flowchart TD + A[Incoming task / event / request] --> B[SLM preprocess] + + B --> C[Intent classification] + C --> D[Policy / risk scan] + D --> E[Tool-needed check] + E --> F[Complexity estimate] + F --> G[Confidence score] + + G --> H{Blocked by policy?} + H -->|Yes| X[Refuse / redact / quarantine] + H -->|No| I{Simple and high confidence?} + + I -->|Yes| Y[Return SLM path result] + I -->|No| J{Tool first?} + + J -->|Yes| K[Run tool / workflow] + K --> L[SLM compress + validate tool output] + L --> M{Enough to answer?} + M -->|Yes| Y + M -->|No| N[Escalate] + + J -->|No| N[Escalate] + + N --> O[Prepare compact escalation context] + O --> P[LLM reasoning / synthesis] + P --> Q[Post-LLM policy / quality check] + Q --> R[Return final response] +``` + +### Threshold Guidelines + +Use configurable thresholds, not hardcoded logic. 
+ +| Stay in SLM Path | Escalate to LLM | +| ----------------------------------- | -------------------------- | +| High confidence | Confidence below threshold | +| Classification/extraction/screening | Policy ambiguity exists | +| Short, bounded output | Tool outputs conflict | +| Unambiguous tool result | Multi-agent disagreement | +| Low risk | User-facing, high impact | + +### Decision Schema + +```json +{ + "intent": "ci_failure_triage", + "risk_level": "medium", + "needs_tool": true, + "complexity": "medium", + "confidence": 0.81, + "policy_status": "allow", + "recommended_path": "tool_first", + "escalate": false +} +``` + +--- + +## Repo-to-Service Ownership Map + +Maps conceptual stack into likely repo/service boundaries. + +```mermaid +flowchart LR + R1[pvc-ai-gateway repo] --> S1[AI Gateway Service] + R2[cognitive-mesh repo] --> S2[Cognitive Mesh Service] + R3[codeflow-engine repo] --> S3[CodeFlow Engine Service] + R4[agentkit-forge repo] --> S4[AgentKit Forge Service] + R5[phoenixrooivalk-* repos] --> S5[PhoenixRooivalk Edge + Command Services] + R6[shared-platform / contracts / schemas repo] --> S6[Shared Contracts / Telemetry / Policy / SDKs] + R7[infra repo] --> S7[Azure Infra / Monitoring / Deployment Pipelines] +``` + +### Ownership Summary + +| Repo | Owns | +| ---------------------- | ------------------------------------------------------------------------------------------------------- | +| **pvc-ai-gateway** | Ingress API, routing contracts, escalation policy, provider abstraction, semantic cache, audit envelope | +| **cognitive-mesh** | Specialist routing, task decomposition, agent state model, synthesis orchestration, disagreement logic | +| **codeflow-engine** | PR event models, diff classification, CI log triage, contract break workflows, comment generation | +| **agentkit-forge** | Tool registry, tool selection schemas, arg extraction, execution-loop state, retry/fallback logic | +| **phoenixrooivalk-\*** | Edge event schema, local 
alerting, escalation packet format, command-layer integration | +| **shared-platform** | Telemetry envelope, routing decision schema, model usage schema, audit/trace IDs, reusable schemas | +| **infra** | Azure deployment, Grafana/ADX dashboards, Key Vault wiring, service identities, networking | + +--- + +## Implementation Order + +### First + +Define shared contracts: + +- Routing decision schema +- Model usage event +- Tool execution event +- Audit envelope +- Edge escalation packet + +### Second + +Implement telemetry in the gateway: + +- Trace ID propagation +- Decision logs +- Provider usage events +- Cost estimation fields + +### Third + +Bring CodeFlow and AgentKit onto same telemetry envelope. + +### Fourth + +Add Cognitive Mesh orchestration and disagreement telemetry. + +### Fifth + +Add Rooivalk edge packet telemetry and sync audit. + +--- + +## Architectural Recommendation + +For your environment, the strongest production stance is: + +1. **AI Gateway is the only public AI ingress** +2. **All routing decisions emit one shared RoutingDecision contract** +3. **All model calls emit one shared ModelUsageEvent** +4. **All tool invocations flow through a broker or shared event schema** +5. **All edge escalations use compact evidence packets** +6. 
**ADX/Kusto + Grafana becomes the operational truth layer** + +This gives you: + +- Cost visibility +- Quality visibility +- Compliance evidence +- Easier A/B testing of SLM routing +- Cleaner failure diagnosis diff --git a/docs/architecture/reference/slm-implementation-matrix.md b/docs/architecture/reference/slm-implementation-matrix.md new file mode 100644 index 0000000..eb1fc6e --- /dev/null +++ b/docs/architecture/reference/slm-implementation-matrix.md @@ -0,0 +1,260 @@ +# SLM Implementation Matrix + +This document provides a repo-by-repo implementation matrix showing SLM endpoints, contract shapes, telemetry fields, fallback rules, confidence thresholds, and practical service boundaries across all six platforms. + +## Quick Reference + +| Platform | SLM Role | Key Endpoints | +| --------------- | --------------------------------------- | ----------------------------------------- | +| AI Gateway | routing, policy, cost control | /classify-request, /policy-screen | +| Cognitive Mesh | agent routing, decomposition | /route-agent, /decompose-task | +| CodeFlow Engine | PR triage, failure analysis | /classify-change, /summarize-failure | +| AgentKit Forge | tool selection, context shaping | /select-tool, /estimate-budget | +| PhoenixRooivalk | event interpretation, SOP suggestions | /interpret-event, /suggest-sop | +| Mystira | story safety, continuity, image prompts | /check-safety-agefit, /shape-image-prompt | + +## Documentation Structure + +``` +reference/ +├── slm-implementation-matrix.md # This file +├── matrix-gateway.md # AI Gateway details +├── matrix-cognitive-mesh.md # Cognitive Mesh details +├── matrix-codeflow.md # CodeFlow Engine details +├── matrix-agentkit.md # AgentKit Forge details +├── matrix-rooivalk.md # PhoenixRooivalk details +└── matrix-mystira.md # Mystira details +``` + +--- + +## 1. 
Cross-Stack Operating Model + +Use the same control pattern everywhere: + +```mermaid +flowchart LR + A[Input / Event / Request] --> B[Deterministic Guards] + B --> C[SLM Control Layer] + C --> D{Confidence + Policy} + D -->|high confidence| E[Direct Action / Route / Summarize] + D -->|medium confidence| F[Tool Path / Restricted Flow] + D -->|low confidence| G[LLM Escalation] + G --> H[Post-Validation] + E --> I[Telemetry + Audit] + F --> I + H --> I +``` + +--- + +## 2. Canonical SLM Service Interfaces + +These are the reusable interface families standardized across the stack. + +### A. Classification Contract + +```json +{ + "request_id": "uuid", + "label": "code_review", + "confidence": 0.91, + "secondary_labels": ["security_review"], + "reason_codes": ["contains_diff", "contains_code_terms"], + "recommended_action": "route_security_agent" +} +``` + +### B. Routing Contract + +```json +{ + "request_id": "uuid", + "target": "infra_agent", + "mode": "single_agent", + "escalation_required": false, + "tool_candidate": true, + "cost_tier": "low", + "confidence": 0.88 +} +``` + +### C. Compression Contract + +```json +{ + "request_id": "uuid", + "summary": "User wants Azure cost anomaly investigation for Foundry usage.", + "retained_facts": [ + "resource deleted on 2026-03-05", + "billing visible from 2026-03-03", + "suspected partner local usage" + ], + "dropped_categories": ["small talk", "repeated screenshots"], + "confidence": 0.84 +} +``` + +### D. Safety / Moderation Contract + +```json +{ + "request_id": "uuid", + "allowed": true, + "risk_level": "low", + "risk_categories": [], + "action": "allow", + "confidence": 0.96 +} +``` + +### E. 
Summarization / Operator Brief Contract + +```json +{ + "request_id": "uuid", + "title": "Possible perimeter drone approach", + "summary": "Low-altitude approach detected from north-east sector.", + "facts": ["altitude 35m", "entry vector north-east", "rf profile matched consumer quadcopter"], + "inferences": ["possible surveillance behavior"], + "recommended_next_step": "verify EO feed and initiate SOP-12", + "confidence": 0.79 +} +``` + +--- + +## 3. Cross-Platform Confidence Policy + +A unified confidence policy across all platforms: + +| Confidence | Meaning | Action | +| ---------- | ----------------- | ----------------------------------------- | +| 0.90-1.00 | Strong confidence | Direct automated route/action | +| 0.80-0.89 | Acceptable | Automate with validation | +| 0.70-0.79 | Uncertain | Restricted automation or human/LLM assist | +| < 0.70 | Weak | Escalate or safe fallback | + +```mermaid +flowchart TD + A[SLM Result] --> B{Confidence >= 0.90?} + B -->|Yes| C[Direct Action] + B -->|No| D{Confidence >= 0.80?} + D -->|Yes| E[Validate & Proceed] + D -->|No| F{Confidence >= 0.70?} + F -->|Yes| G[Restricted Automation] + F -->|No| H[Escalate / Fallback] + E --> I[Execute] + G --> I + H --> I +``` + +--- + +## 4. 
Cross-Platform Telemetry Schema + +Use a common event envelope across all repos: + +```json +{ + "event_id": "uuid", + "timestamp_utc": "2026-03-15T10:00:00Z", + "platform": "codeflow-engine", + "component": "slm-change-classifier", + "model": "phi-3-mini", + "operation": "classify-change", + "latency_ms": 42, + "input_tokens": 612, + "output_tokens": 87, + "confidence": 0.91, + "action_taken": "full_pipeline", + "escalated": false, + "cost_estimate_usd": 0.0004, + "trace_id": "trace-123" +} +``` + +### Recommended Common Fields + +| Field | Type | Description | +| ------------------- | ------- | ------------------------ | +| `event_id` | uuid | Unique event identifier | +| `trace_id` | uuid | Distributed trace ID | +| `platform` | string | System name | +| `component` | string | Specific component | +| `operation` | string | Operation performed | +| `model` | string | Model used | +| `model_version` | string | Model version | +| `latency_ms` | number | Processing time | +| `input_tokens` | number | Input token count | +| `output_tokens` | number | Output token count | +| `confidence` | number | Model confidence | +| `action_taken` | string | Action taken | +| `escalated` | boolean | Whether escalated to LLM | +| `fallback_reason` | string | Fallback reason | +| `cost_estimate_usd` | number | Estimated cost | +| `tenant_or_project` | string | Tenant identifier | +| `environment` | string | Environment | + +--- + +## 5. Recommended Model-Role Mapping + +This is a practical role map, not a vendor mandate. 
+ +| Role | Recommended Model Profile | +| ----------------------- | ------------------------------------- | +| Classification | Very small, fast instruct model | +| Routing | Small instruct model with strict JSON | +| Safety Prefilter | Small model + deterministic rules | +| Compression | Small/medium model with schema output | +| Failure Summarization | Small instruct model | +| Creative Storytelling | Larger narrative-capable model | +| Deep Synthesis | Larger reasoning model | +| Edge Operator Summaries | Compact on-device model | + +--- + +## 6. Implementation Order + +### Phase 1: Foundation + +- AI Gateway request classifier +- CodeFlow change classifier +- AgentKit tool selector + +### Phase 2: Expansion + +- Cognitive Mesh router + decomposer +- Mystira safety/continuity layer + +### Phase 3: Maturation + +- PhoenixRooivalk operator interpreter +- Shared telemetry normalization +- Confidence calibration dashboards + +--- + +## 7. Cross-System Summary + +### Confidence Threshold Summary + +| System | High (direct) | Medium (verify) | Low (escalate) | +| --------------- | ------------- | --------------- | -------------- | +| AI Gateway | >= 0.90 | 0.75-0.89 | < 0.75 | +| Cognitive Mesh | >= 0.85 | 0.70-0.84 | < 0.70 | +| CodeFlow | >= 0.88 | 0.75-0.87 | < 0.75 | +| AgentKit Forge | >= 0.85 | 0.70-0.84 | < 0.70 | +| PhoenixRooivalk | >= 0.80 | 0.65-0.79 | < 0.65 | +| Mystira | >= 0.92 | 0.80-0.91 | < 0.80 | + +### Standard Fallback Pattern + +``` +1. SLM timeout → Deterministic rules +2. Low confidence → LLM escalation +3. Safety critical → Block immediately +4. Unknown classification → Safe default +5. 
All failures → Log + alert + human review +``` diff --git a/docs/architecture/slm-management-plan.md b/docs/architecture/reference/slm-management-plan.md similarity index 100% rename from docs/architecture/slm-management-plan.md rename to docs/architecture/reference/slm-management-plan.md diff --git a/docs/architecture/reference/strategic/01-why-slms-matter.md b/docs/architecture/reference/strategic/01-why-slms-matter.md new file mode 100644 index 0000000..653e273 --- /dev/null +++ b/docs/architecture/reference/strategic/01-why-slms-matter.md @@ -0,0 +1,86 @@ +# Why SLMs Matter in These Systems + +This document explains the strategic value of Small Language Models (SLMs) across the ecosystem. + +## Executive Summary + +Across all six platforms, SLMs provide: + +| Benefit | Description | +| -------------------------- | ------------------------------------------- | +| **Cost Control** | Large models are invoked only when required | +| **Latency Reduction** | Routing decisions happen in milliseconds | +| **Edge Deployment** | PhoenixRooivalk can run inference locally | +| **Deterministic Behavior** | SLMs are easier to constrain and audit | + +## Summary Table + +| System | SLM Role | +| --------------- | --------------------------------------- | +| AI Gateway | routing, policy checks, cost prediction | +| Cognitive Mesh | agent routing, task decomposition | +| PhoenixRooivalk | edge telemetry analysis | +| CodeFlow Engine | CI intelligence, log analysis | +| AgentKit Forge | tool selection, context compression | +| Mystira | story safety, continuity, age-fit | + +--- + +## Design Principle + +The best use of SLMs is not "replace the big model." 
It is: + +```mermaid +flowchart LR + S[Screen First] --> R[Route Cheap] + R --> E[Escalate Selectively] + E --> C[Compress Context Aggressively] + C --> L[Keep Edge Decisions Local] +``` + +| Principle | Description | +| ------------------------ | -------------------------------------------------------------- | +| **Screen First** | SLMs handle initial classification before expensive operations | +| **Route Cheap** | Direct simple requests to SLMs or small models | +| **Escalate Selectively** | Only invoke LLMs for complex, ambiguous tasks | +| **Compress Context** | SLMs reduce token volume before LLM processing | +| **Keep Edge Local** | PhoenixRooivalk operates without cloud dependency | + +--- + +## Reference Architecture + +```mermaid +flowchart TD + U[Users / Operators / CI Events / Sensor Feeds] + U --> G[AI Gateway] + G --> G1[SLM: intent classification] + G --> G2[SLM: safety / policy scan] + G --> G3[SLM: cost routing] + G --> G4[Cache / provider selection] + G4 --> CM[Cognitive Mesh] + G4 --> CF[CodeFlow Engine] + G4 --> AF[AgentKit Forge] + G4 --> PR[PhoenixRooivalk] + G4 --> MY[Mystira] + CM --> L1[LLM: deep reasoning] + CF --> L2[LLM: remediation] + AF --> L3[LLM: synthesis] + MY --> L4[LLM: narrative] +``` + +--- + +## Strategic Recommendation + +SLMs should be treated as: + +- **Control-plane intelligence**: Routing, classification, decision-making +- **Cheap operational cognition**: Fast, repetitive tasks +- **First-pass classifiers**: Initial triage before expensive operations +- **Context reducers**: Compressing data for efficient processing +- **Edge interpreters**: Local processing without cloud dependency + +**Not** as replacements for the reasoning tier. + +> **SLMs run the flow. 
LLMs solve the hard parts.** diff --git a/docs/architecture/reference/strategic/02-gateway-slm-use-cases.md b/docs/architecture/reference/strategic/02-gateway-slm-use-cases.md new file mode 100644 index 0000000..1d62542 --- /dev/null +++ b/docs/architecture/reference/strategic/02-gateway-slm-use-cases.md @@ -0,0 +1,90 @@ +# AI Gateway — Practical SLM Use Cases + +AI Gateway is the highest-leverage place to put SLMs because every request passes through it. + +## Best-Fit SLM Tasks + +### A. Intent and Complexity Classification + +The SLM predicts: + +- Request type +- Risk level +- Likely tool need +- Token size estimate +- Recommended model tier + +```json +{ + "intent": "repo_analysis", + "complexity": "medium", + "tool_required": true, + "security_risk": "low", + "recommended_tier": "mid" +} +``` + +### B. Safety and Data-Boundary Screening + +Before a request hits an expensive model: + +- Secret leakage scan +- PII detection +- Jailbreak/prompt-injection detection +- Tenant/policy checks +- Outbound data classification + +### C. 
Budget-Aware Routing + +Use the SLM to decide: + +- Direct answer with small model +- Call tool first +- Escalate to reasoning model +- Deny or redact +- Cache hit / semantic cache reuse + +## Practical Gateway Flow + +```mermaid +flowchart LR + A[Client Request] --> B[Gateway Ingress] + B --> C[SLM Classifier] + C --> D[SLM Policy Scan] + D --> E[Budget / Latency Rules] + E --> F{Decision} + F -->|simple| G[Small Model] + F -->|tool-first| H[Tool Execution] + F -->|complex| I[Large Model] + F -->|blocked| J[Policy Refusal] + H --> K[Post-tool SLM summarizer] + K --> I +``` + +## Why It Fits AI Gateway + +| Benefits | Tradeoffs | +| ----------------------------- | -------------------------------------- | +| Major cost reduction | Misrouting risk if classifier is weak | +| Faster median latency | Extra hop in pipeline | +| Consistent policy enforcement | Need calibration, thresholds, fallback | +| Cleaner observability | | + +## Where It Breaks Down + +- Vague prompts +- Multi-domain prompts +- Hidden tool requirements +- Requests where complexity is not obvious + +## Recommended Pattern + +Use the SLM as a **triage layer, not the final authority**. If confidence is low, escalate automatically. + +### Threshold Guide + +| Confidence | Action | +| ---------- | ----------------- | +| >= 0.90 | Direct routing | +| 0.75-0.89 | Verify with rules | +| < 0.75 | Escalate to LLM | diff --git a/docs/architecture/reference/strategic/03-cognitive-mesh-use-cases.md b/docs/architecture/reference/strategic/03-cognitive-mesh-use-cases.md new file mode 100644 index 0000000..4255e87 --- /dev/null +++ b/docs/architecture/reference/strategic/03-cognitive-mesh-use-cases.md @@ -0,0 +1,95 @@ +# Cognitive Mesh — Practical SLM Use Cases + +Cognitive Mesh is where SLMs become orchestration primitives. + +## Best-Fit SLM Tasks + +### A. 
Specialist Routing + +The SLM decides which node gets the task: + +- infra +- code +- security +- research +- finance +- documentation +- architecture + +### B. Task Decomposition + +Before invoking expensive reasoning, the SLM splits tasks into atomic units. + +Example: "Review this repo and propose a deploy plan" becomes: + +1. Detect stack +2. Detect infra +3. Detect secrets/compliance issues +4. Map CI/CD +5. Draft deploy sequence + +### C. State Summarization + +Multi-agent systems accumulate long histories. An SLM maintains: + +- Current objective +- Known constraints +- Prior decisions +- Unresolved blockers +- Tool outputs summary + +### D. Agent Health and Loop Detection + +The SLM can classify: + +- Repeated retries +- Tool thrashing +- No-progress loops +- Conflicting agent outputs + +## Practical Cognitive Mesh Flow + +```mermaid +flowchart TD + U[User] --> R[SLM Router] + R --> A1[Architect Agent] + R --> A2[Coder Agent] + R --> A3[Security Agent] + R --> A4[Infra Agent] + R --> A5[Research Agent] + A1 --> S[SLM State Manager] + A2 --> S + A3 --> S + A4 --> S + A5 --> S + S --> X{Need deep reasoning?} + X -->|No| O[Assemble Response] + X -->|Yes| L[LLM Synthesis] + L --> O +``` + +## Why It Fits Cognitive Mesh + +| Benefits | Tradeoffs | +| --------------------- | ----------------------------- | +| Cheaper orchestration | Decomposition quality matters | +| Faster routing | Errors compound downstream | +| Smaller context | Summaries can lose nuance | +| Better determinism | | + +## Best Operational Pattern + +| Use SLMs For | Use LLMs For | +| ------------------------ | ----------------------- | +| "Who should do this?" | Final synthesis | +| "What is the next step?" | Architecture evaluation | +| "What matters here?" | Novel reasoning | +| "Are we stuck?" 
| | + +## Threshold Guide + +| Confidence | Action | +| ---------- | ----------------- | +| >= 0.85 | Direct routing | +| 0.70-0.84 | Verify with rules | +| < 0.70 | Escalate to LLM | diff --git a/docs/architecture/reference/strategic/04-codeflow-use-cases.md b/docs/architecture/reference/strategic/04-codeflow-use-cases.md new file mode 100644 index 0000000..f1024fe --- /dev/null +++ b/docs/architecture/reference/strategic/04-codeflow-use-cases.md @@ -0,0 +1,87 @@ +# CodeFlow Engine — Practical SLM Use Cases + +CodeFlow Engine is one of the strongest SLM domains because CI/CD workloads are repetitive, structured, and high-volume. + +## Best-Fit SLM Tasks + +### A. PR Classification + +Classify a PR as: + +- docs-only +- low-risk refactor +- dependency update +- infra change +- security-sensitive +- contract-breaking +- test-only +- release-impacting + +### B. Diff Summarization + +Generate short structured summaries from git diff and changed files. + +### C. CI Failure Triage + +Classify failures into: + +- test regression +- flaky test +- dependency resolution +- auth/secret issue +- infra provisioning error +- timeout/resource exhaustion +- lint/type failure + +### D. Review Routing + +Decide which reviewers or agent flows should be triggered. + +### E. Release-Note Extraction + +Extract user-facing change notes without using a full LLM. 
+ +## Practical CodeFlow Pipeline + +```mermaid +flowchart LR + GP[Git Push / PR] --> IN[Ingest] + IN --> S1[SLM: diff classifier] + IN --> S2[SLM: risk scorer] + S1 --> D{Decision} + S2 --> D + D -->|low-risk| Q[Fast CI] + D -->|high-risk| F[Full CI] + D -->|unclear| L[LLM Review] + F --> T[CI Logs] + Q --> T + T --> C[SLM: triage] + C --> R[Action] +``` + +## Why It Fits CodeFlow Engine + +| Benefits | Tradeoffs | +| -------------------------- | --------------------------- | +| Huge cost savings at scale | False negatives possible | +| Strong consistency | Requires designed schemas | +| Better PR throughput | Model drift affects quality | +| Repetitive workload fit | | + +## Strongest SLM Opportunities + +Given emphasis on contract diffs, OpenAPI breakage, schema validation, CI gates: + +- Change intent detection +- Docs generation hints +- Issue bucketing +- Runbook suggestion +- Log compression before escalation + +## Threshold Guide + +| Confidence | Action | +| ---------- | ----------------- | +| >= 0.88 | Direct use | +| 0.75-0.87 | Verify with rules | +| < 0.75 | Manual review | diff --git a/docs/architecture/reference/strategic/05-agentkit-use-cases.md b/docs/architecture/reference/strategic/05-agentkit-use-cases.md new file mode 100644 index 0000000..ebd5d4d --- /dev/null +++ b/docs/architecture/reference/strategic/05-agentkit-use-cases.md @@ -0,0 +1,85 @@ +# AgentKit Forge — Practical SLM Use Cases + +AgentKit Forge is ideal for SLMs because tool-heavy agents don't need a large model for every micro-decision. + +## Best-Fit SLM Tasks + +### A. Tool Selection + +Choose among: + +- GitHub API +- Azure CLI +- Terraform +- Kusto +- File retrieval +- Documentation lookup +- Shell command +- Search + +### B. Parameter Extraction + +Pull structured arguments out of the request before calling the tool. + +### C. Context Compression + +Convert long tool traces into compact operational state. + +### D. 
Step Validation + +Check whether a step result is sufficient before moving to next step. + +### E. Retry / Fallback Logic + +Classify whether an error merits: + +- Retry +- Alternate tool +- Human intervention +- Escalation to larger model + +## Practical AgentKit Flow + +```mermaid +flowchart TD + T[Agent Task] --> P[SLM Planner] + P --> TS[SLM Tool Selector] + TS --> G1[GitHub] + TS --> G2[Azure] + TS --> G3[Terraform] + TS --> G4[Kusto] + G1 --> M[SLM Compressor] + G2 --> M + G3 --> M + G4 --> M + M --> V{Enough?} + V -->|Yes| A[Response] + V -->|No| L[Escalate LLM] + L --> A +``` + +## Why It Fits AgentKit Forge + +| Benefits | Tradeoffs | +| -------------------- | --------------------------- | +| Lower token burn | Brittle if schemas weak | +| Faster tool loops | Poor extraction = bad calls | +| Improved determinism | Compression can lose detail | +| Cleaner contracts | | + +## Design Rule + +| Let SLMs Own | Let LLMs Own | +| ------------ | -------------------- | +| Selection | Synthesis | +| Extraction | Ambiguity resolution | +| Compression | Multi-tool planning | +| Validation | | + +## Threshold Guide + +| Confidence | Action | +| ---------- | -------------------- | +| >= 0.85 | Direct execution | +| 0.70-0.84 | Require confirmation | +| < 0.70 | Decline / clarify | diff --git a/docs/architecture/reference/strategic/06-rooivalk-use-cases.md b/docs/architecture/reference/strategic/06-rooivalk-use-cases.md new file mode 100644 index 0000000..6bdb6dc --- /dev/null +++ b/docs/architecture/reference/strategic/06-rooivalk-use-cases.md @@ -0,0 +1,76 @@ +# PhoenixRooivalk — Practical SLM Use Cases + +PhoenixRooivalk is different because the core advantage is locality, latency, and resilience—not just cost. + +## Best-Fit SLM Tasks + +### A. Edge Event Labeling + +Convert telemetry into categories: + +- loitering +- fast ingress +- signal loss +- RF anomaly +- perimeter breach candidate +- operator attention required + +### B. 
Operator-Facing Summary + +Turn noisy sensor events into concise, human-readable alerts. + +### C. Log-to-Report Conversion + +Mission logs, detections, and post-event evidence can be summarized locally. + +### D. Escalation Gating + +Only send selected events to cloud when: + +- Confidence above threshold +- Event duration exceeds threshold +- Evidence bundle sufficient +- Bandwidth available + +## Practical Edge Flow + +```mermaid +flowchart LR + S[RF / EO / Radar / Telemetry] --> N[Detection Pipeline] + N --> E[Edge SLM] + E --> L1[Event Label] + E --> L2[Threat Summary] + E --> L3[Alert Text] + E --> L4[Escalation] + L4 -->|local| O[Console] + L4 -->|upstream| C[Cloud] +``` + +## Why It Fits PhoenixRooivalk + +| Benefits | Tradeoffs | +| ------------------------ | ------------------------- | +| Low latency | Limited reasoning depth | +| Offline capability | Edge hardware constraints | +| Bandwidth savings | Must handle noisy inputs | +| Privacy / sovereignty | Needs tight prompt design | +| Constrained hardware fit | | + +## CRITICAL: Important Boundary + +Do NOT let SLM become sole authority for kinetic or high-stakes decisions. + +| Use SLM For | NOT For | +| ---------------- | ---------------------------- | +| Interpretation | Critical threat adjudication | +| Summarization | Response triggering | +| Prioritization | Access control | +| Operator support | Resource isolation | + +## Threshold Guide + +| Confidence | Action | +| ---------- | -------------- | +| >= 0.80 | Full summary | +| 0.65-0.79 | Facts only | +| < 0.65 | Human analysis | diff --git a/docs/architecture/reference/strategic/07-deployment-model.md b/docs/architecture/reference/strategic/07-deployment-model.md new file mode 100644 index 0000000..e4e6f8d --- /dev/null +++ b/docs/architecture/reference/strategic/07-deployment-model.md @@ -0,0 +1,75 @@ +# Practical Deployment Model + +This is the recommended stack for the ecosystem. 
+ +## Full Stack Architecture + +```mermaid +flowchart TD + A[Ingress] --> B[AI Gateway SLM] + B --> C1[Fast-path] + B --> C2[Tool-first] + B --> C3[Escalation] + C2 --> D1[AgentKit] + C2 --> D2[CodeFlow] + C2 --> D3[Cognitive Mesh] + D1 --> E1[SLM tool loops] + D2 --> E2[SLM CI triage] + D3 --> E3[SLM orchestration] + E1 --> F[LLM Pool] + E2 --> F + E3 --> F + G[Rooivalk Edge] --> H[Local SLM] + H --> I[Local / Cloud] + F --> J[Observability] +``` + +## Decision Matrix + +| System | Best SLM Jobs | Less Suitable | +| --------------- | -------------------------- | ------------------------------ | +| AI Gateway | routing, screening, cost | Nuanced synthesis | +| Cognitive Mesh | routing, decomposition | Final judgment | +| CodeFlow | PR triage, log analysis | Root cause across dependencies | +| AgentKit | tool selection, extraction | Multi-step planning | +| PhoenixRooivalk | summaries, alerts | Sole threat authority | +| Mystira | safety, continuity | Rich narrative | + +## Practical Gateway Flow + +```mermaid +flowchart LR + A[Request] --> B[Classifier] + B --> C[Policy Scan] + C --> D[Budget Rules] + D --> E{Decision} + E -->|simple| F[Small Model] + E -->|tool| G[Tools] + E -->|complex| H[LLM] + E -->|blocked| I[Refusal] + G --> J[Post-tool Summarizer] + J --> H +``` + +## End-to-End Example + +A developer opens a PR that changes Terraform, GitHub Actions, and an OpenAPI spec: + +```mermaid +sequenceDiagram + Dev->>GH: Open PR + GH->>CF: Event + CF->>SLM: Classify + risk + SLM-->>CF: infra-change, high risk + CF->>GH: Full CI + contract checks + GH-->>CF: Results + CF->>SLM: Triage logs + SLM-->>CF: Breaking change detected + CF->>AG: Escalate + AG->>LLM: Reasoning + LLM-->>AG: Advice + AG-->>CF: Response + CF-->>GH: Comment +``` + +SLMs handle repetitive triage; LLMs solve the hard part. 
diff --git a/docs/architecture/reference/strategic/08-implementation-order.md b/docs/architecture/reference/strategic/08-implementation-order.md new file mode 100644 index 0000000..fde2524 --- /dev/null +++ b/docs/architecture/reference/strategic/08-implementation-order.md @@ -0,0 +1,56 @@ +# Recommended Implementation Order + +For the stack, the highest ROI sequence is: + +## Phase 1: Gateway Foundation + +- AI Gateway intent classifier +- AI Gateway policy scan +- Route-to-tier decision +- Semantic cache admission + +**Value**: Highest immediate cost-leverage + +## Phase 2: CI Intelligence + +- CodeFlow Engine PR risk classifier +- CodeFlow Engine CI failure bucketing +- CodeFlow Engine release-note summarizer + +**Value**: Fastest operational value + +## Phase 3: Agent Runtime + +- AgentKit Forge tool selector +- AgentKit Forge parameter extractor +- AgentKit Forge context compressor + +**Value**: Lower token burn, faster tool loops + +## Phase 4: Orchestration + +- Cognitive Mesh specialist router +- Cognitive Mesh decomposition engine +- Cognitive Mesh state manager + +**Value**: Strong leverage once taxonomy stabilizes + +## Phase 5: Edge + +- PhoenixRooivalk edge event summarizer +- PhoenixRooivalk operator alert composer +- PhoenixRooivalk escalation filter + +**Value**: Keep isolated from critical control + +## Summary + +| Phase | System | Priority | +| ----- | --------------- | -------- | +| 1 | AI Gateway | Highest | +| 2 | CodeFlow | High | +| 3 | AgentKit Forge | Medium | +| 4 | Cognitive Mesh | Medium | +| 5 | PhoenixRooivalk | Lower | + +That order gives fastest operational value with lowest implementation risk. 
diff --git a/docs/architecture/reference/strategic/README.md b/docs/architecture/reference/strategic/README.md new file mode 100644 index 0000000..0644088 --- /dev/null +++ b/docs/architecture/reference/strategic/README.md @@ -0,0 +1,28 @@ +# Strategic SLM Guidance + +This folder contains strategic guidance on why SLMs matter and how to deploy them across the ecosystem. + +## Documents + +- [01-why-slms-matter.md](01-why-slms-matter.md) - Executive summary and core principles +- [02-gateway-slm-use-cases.md](02-gateway-slm-use-cases.md) - AI Gateway practical use cases +- [03-cognitive-mesh-use-cases.md](03-cognitive-mesh-use-cases.md) - Cognitive Mesh practical use cases +- [04-codeflow-use-cases.md](04-codeflow-use-cases.md) - CodeFlow Engine practical use cases +- [05-agentkit-use-cases.md](05-agentkit-use-cases.md) - AgentKit Forge practical use cases +- [06-rooivalk-use-cases.md](06-rooivalk-use-cases.md) - PhoenixRooivalk practical use cases +- [07-deployment-model.md](07-deployment-model.md) - Practical deployment model and decision matrix +- [08-implementation-order.md](08-implementation-order.md) - Recommended implementation sequence + +## Quick Navigation + +| Phase | System | Document | +| --------------- | --------------- | ---------------------------------------------------------------- | +| Foundation | AI Gateway | [02-gateway-slm-use-cases.md](02-gateway-slm-use-cases.md) | +| CI Intelligence | CodeFlow Engine | [04-codeflow-use-cases.md](04-codeflow-use-cases.md) | +| Agent Runtime | AgentKit Forge | [05-agentkit-use-cases.md](05-agentkit-use-cases.md) | +| Orchestration | Cognitive Mesh | [03-cognitive-mesh-use-cases.md](03-cognitive-mesh-use-cases.md) | +| Edge | PhoenixRooivalk | [06-rooivalk-use-cases.md](06-rooivalk-use-cases.md) | + +## Core Principle + +> **SLMs run the flow. 
LLMs solve the hard parts.** diff --git a/docs/architecture/agentkit-forge.md b/docs/architecture/systems/agentkit-forge.md similarity index 100% rename from docs/architecture/agentkit-forge.md rename to docs/architecture/systems/agentkit-forge.md diff --git a/docs/architecture/ai-gateway.md b/docs/architecture/systems/ai-gateway.md similarity index 100% rename from docs/architecture/ai-gateway.md rename to docs/architecture/systems/ai-gateway.md diff --git a/docs/architecture/codeflow-engine.md b/docs/architecture/systems/codeflow-engine.md similarity index 100% rename from docs/architecture/codeflow-engine.md rename to docs/architecture/systems/codeflow-engine.md diff --git a/docs/architecture/cognitive-mesh.md b/docs/architecture/systems/cognitive-mesh.md similarity index 100% rename from docs/architecture/cognitive-mesh.md rename to docs/architecture/systems/cognitive-mesh.md diff --git a/docs/architecture/systems/mystira.md b/docs/architecture/systems/mystira.md new file mode 100644 index 0000000..17079e7 --- /dev/null +++ b/docs/architecture/systems/mystira.md @@ -0,0 +1,584 @@ +# Mystira + +Mystira is an interactive story generation system for children. The SLM serves as a **content-shaping, moderation, personalization, and cost-control layer** around story generation and interactive experience flows. 
+ +## Architecture Overview + +```mermaid +flowchart TB + subgraph User["User Layer"] + U[Child/Parent Input] + P[Parent Controls] + end + + subgraph SLML["SLM Experience Control Layer"] + C[Story Classifier] + A[Age-Tone Controller] + M[Moderation Filter] + W[World Consistency] + end + + subgraph State["State Management"] + S[Story State] + Pr[Child Profile] + Ch[Character Registry] + end + + subgraph Content["Content Pipeline"] + PC[Prompt Constructor] + LLM[Story LLM] + IMG[Illustration Generator] + end + + subgraph Output["Output Processing"] + SA[Safety Audit] + RL[Reading Level Check] + IP[Image Prompt Shaper] + end + + U --> C + P --> Pr + C --> A + A --> M + M --> W + W --> S + S --> Pr + Pr --> PC + PC --> LLM + LLM --> SA + SA --> RL + RL --> IMG + IMG --> IP + PC --> Ch + Ch --> W +``` + +## Detailed Data Flow + +```mermaid +sequenceDiagram + participant U as User Input + participant SLM as SLM Control Layer + participant SS as Story State + participant LLM as Story LLM + participant IMG as Image Service + participant MOD as Moderation + + U->>SLM: Story request + SLM->>SLM: Classify request type + SLM->>SLM: Check age appropriateness + SLM->>SLM: Validate parental controls + SLM->>SS: Update session context + SLM->>SS: Compress memory if needed + + alt Simple continuation + SLM->>SLM: Generate simple continuation + SLM->>MOD: Quick safety check + SLM->>U: Return response + else Full story generation + SLM->>LLM: Prepare enriched prompt + LLM->>SLM: Generated story + SLM->>SLM: Validate continuity + SLM->>MOD: Full moderation check + SLM->>SLM: Adapt reading level + SLM->>IMG: Shape image prompts + IMG->>SLM: Generated illustrations + SLM->>U: Return enriched story + end +``` + +## SLM as Experience Orchestrator + +The SLM sits between: + +1. **User input** — Classification, safety pre-check, parental control validation +2. **Story state / profile state** — Memory compression, continuity tracking +3. 
**Generation pipeline** — Prompt enrichment, context window management +4. **Illustration / asset prompts** — Style consistency, character adherence +5. **Moderation / age-appropriateness checks** — Multi-layer safety filtering + +```mermaid +flowchart LR + subgraph Input["Input Processing"] + I1[User Request] + I2[Parent Settings] + I3[Session History] + end + + subgraph SLM["SLM Decision Points"] + S1[Request Classification] + S2[Age-Tone Mapping] + S3[Safety Filter] + S4[Memory Compression] + end + + subgraph Decision["Routing Decision"] + D1{Complexity?} + D1 -->|Simple| R1[SLM Direct] + D1 -->|Complex| D2{Age Appropriate?} + D2 -->|Yes| R2[LLM Generation] + D2 -->|No| R3[Safe Rewrite] + end + + subgraph Output["Output Processing"] + O1[Continuity Check] + O2[Image Prompt] + O3[Reading Level] + O4[Final Safety] + end + + I1 --> S1 + I2 --> S2 + I3 --> S4 + S1 --> D1 + S2 --> D1 + S3 --> Decision + Decision --> Output +``` + +## Best SLM Use Cases + +### 1. Story Request Classification + +Determine request type: + +```json +{ + "story_type": "bedtime|learning|adventure|interactive|customization|continuation|image", + "age_range": "3-5|5-8|8-10|10-12", + "is_interactive": true, + "continuation": true, + "needs_images": true, + "curriculum_tags": ["kindness", "sharing", "animals"], + "estimated_complexity": "low|medium|high" +} +``` + +### 2. Age and Tone Control + +Enforce cheaply: + +```json +{ + "reading_level": "easy|moderate|advanced", + "sentence_length": "short|medium|long", + "emotional_tone": "calm|exciting|gentle|funny", + "safe_themes": true, + "lesson_alignment": ["kindness", "courage"], + "content_rating": "G|PG", + "prohibited_elements": [] +} +``` + +### 3. 
Moderation and Safe Rewriting + +Catch or rewrite: + +- Frightening content +- Inappropriate content +- Emotionally unsuitable scenes +- Unsafe user prompts +- Age-inappropriate vocabulary + +```json +{ + "flagged": false, + "rewritten": null, + "content_rating": "safe|caution|blocked", + "age_appropriate": true, + "concerns": [], + "rewrite_suggestions": [] +} +``` + +### 4. Session Memory Compression + +Keep only essential state: + +```json +{ + "session_id": "abc123", + "active_characters": ["Luna", "Bear"], + "current_quest": "find_moon", + "tone_constraints": "gentle_adventure", + "age_band": "3-5", + "plot_anchors": ["discovered_moon_stone", "met_starlight_friend"], + "character_states": { + "Luna": { "mood": "curious", "location": "forest_edge" }, + "Bear": { "mood": "helpful", "location": "forest_edge" } + }, + "reader_preferences": { "likes": ["animals", "stars"], "dislikes": ["scary"] } +} +``` + +### 5. Character and World Consistency + +Validate: + +- Names remain consistent +- World rules not violated +- Prior events respected +- Visual prompts align with canon + +```json +{ + "valid": true, + "inconsistencies": [], + "suggested_corrections": [], + "world_rules_violated": [], + "character_continuity_ok": true +} +``` + +### 6. 
Illustration Prompt Shaping + +Convert story scene to constrained image prompts: + +```json +{ + "prompt": "Luna the fox and Bear walking through moonlit forest, children's book style, soft colors, no scary elements", + "style": "children_book", + "style_params": { + "illustration_type": "watercolor", + "color_palette": "warm", + "lighting": "soft_moonlight" + }, + "character_refs": ["luna", "bear"], + "safety_check": "passed", + "age_appropriate": true, + "brand_compliant": true +} +``` + +## Implementation + +### Story Classification + +```python +async def classify_story_request( + user_input: str, + session: Session, + profile: ChildProfile +) -> StoryClassification: + prompt = f"""Classify this story request: + +User input: {user_input} +Session history: {session.summary} +Child age band: {profile.age_band} +Parent settings: {profile.parent_controls} + +Output as JSON with fields: +- story_type: bedtime|learning|adventure|interactive|customization|continuation|image +- age_range: target age range +- is_interactive: boolean +- needs_images: boolean +- curriculum_tags: array of educational tags +- complexity: low|medium|high""" + + return await slm_completion(prompt, schema=StoryClassification) +``` + +### Age and Tone Control + +```python +async def enforce_age_tone( + content: str, + profile: ChildProfile +) -> ControlledContent: + prompt = f"""Adapt content for age group: + +Content: {content[:1000]} +Age band: {profile.age_band} +Profile preferences: {profile.preferences} +Parent tone settings: {profile.parent_tone_settings} + +Output as JSON: +- adapted_content: rewritten content +- reading_level: easy|moderate|advanced +- safety_flag: boolean +- concerns: array of any issues""" + + return await slm_completion(prompt, schema=ControlledContent) +``` + +### Safe Rewriting + +```python +async def safe_rewrite(content: str, age_band: str) -> RewriteResult: + prompt = f"""Rewrite for safety: + +Content: {content[:2000]} +Age band: {age_band} + +If content is 
safe: return unchanged with "safe" status. +If content needs rewriting: return rewritten version with reason. +If content is unsafe: return blocked with specific reason. + +Output as JSON: +- status: safe|rewritten|blocked +- original: original content +- result: content after rewrite (if applicable) +- reason: explanation""" + + return await slm_completion(prompt, schema=RewriteResult) +``` + +### Memory Compression + +```python +async def compress_session(session: Session) -> CompressedSession: + prompt = f"""Compress session memory for story continuity: + +Current session messages: {session.messages[-20:]} +Active characters: {session.characters} +Current plot state: {session.plot_state} + +Output as JSON: +- summary: 2-3 sentence story summary +- active_characters: array of character names with key traits +- current_quest: current story goal or "none" +- plot_anchors: array of key events that must be remembered +- tone_constraints: current tone settings +- age_band: current age target""" + + return await slm_completion(prompt, schema=CompressedSession) +``` + +### Illustration Prompt Shaping + +```python +async def shape_image_prompt( + scene: StoryScene, + characters: list[Character], + brand_guidelines: BrandGuidelines +) -> ImagePrompt: + prompt = f"""Create child-safe, brand-aligned image prompt: + +Scene: {scene.description} +Characters: {format_characters(characters)} +Story tone: {scene.tone} +Brand style: {brand_guidelines.style} + +Output as JSON: +- prompt: complete image generation prompt +- style: illustration style +- style_params: detailed style parameters +- character_refs: references to character assets +- safety_check: passed|needs_review|failed +- age_appropriate: boolean +- brand_compliant: boolean""" + + return await slm_completion(prompt, schema=ImagePrompt) +``` + +## Implementation Matrix + +### SLM Endpoints + +| Function | Endpoint | Model | Latency Target | +| -------------------- | --------------- | ---------- | -------------- | +| 
Story Classification | `/classify` | Phi-3 Mini | <100ms | +| Age-Tone Control | `/age-tone` | Phi-3 Mini | <100ms | +| Safe Rewrite | `/safewrite` | Llama 3 8B | <200ms | +| Memory Compression | `/compress` | Phi-3 Mini | <100ms | +| Consistency Check | `/validate` | Phi-3 Mini | <100ms | +| Image Prompt | `/image-prompt` | Phi-3 Mini | <100ms | + +### Contract Shapes + +```typescript +interface StoryClassification { + story_type: StoryType; + age_range: AgeRange; + is_interactive: boolean; + continuation: boolean; + needs_images: boolean; + curriculum_tags: string[]; + complexity: Complexity; + confidence: number; +} + +interface ControlledContent { + adapted_content: string; + reading_level: ReadingLevel; + safety_flag: boolean; + concerns: string[]; + confidence: number; +} + +interface CompressedSession { + summary: string; + active_characters: CharacterSummary[]; + current_quest: string | null; + plot_anchors: string[]; + tone_constraints: ToneConstraints; + age_band: AgeBand; +} + +interface ImagePrompt { + prompt: string; + style: IllustrationStyle; + style_params: StyleParams; + character_refs: string[]; + safety_check: SafetyStatus; + age_appropriate: boolean; + brand_compliant: boolean; +} +``` + +### Telemetry Fields + +| Field | Type | Description | +| ------------------- | ------- | ------------------------------ | +| `request_id` | string | Unique request identifier | +| `session_id` | string | Story session identifier | +| `timestamp` | ISO8601 | Request timestamp | +| `slm_model` | string | SLM model used | +| `function` | string | Classification function called | +| `latency_ms` | number | SLM processing time | +| `confidence` | number | Model confidence score | +| `routed_to_llm` | boolean | Whether LLM was invoked | +| `age_band` | string | Target age range | +| `story_type` | string | Classified story type | +| `safety_flagged` | boolean | Content was flagged | +| `content_rewritten` | boolean | Content was rewritten | +| `tokens_used` | number 
| Total tokens consumed | +| `cost_usd` | number | Estimated cost | + +### Fallback Rules + +| Condition | Action | +| --------------------------- | ---------------------------------- | +| SLM confidence < 0.7 | Escalate to LLM for classification | +| SLM timeout | Use deterministic rules fallback | +| Moderation flag = "blocked" | Return safe error to user | +| Age band mismatch | Enforce age-appropriate rewrite | +| Consistency check fails | Notify, allow LLM override | +| Image prompt fails safety | Use default safe prompt | + +### Confidence Thresholds Flowchart + +```mermaid +flowchart TD + A[Classification Result] --> B{Confidence >= 0.9?} + B -->|Yes| C[Use SLM Result] + B -->|No| D{Confidence >= 0.7?} + D -->|Yes| E{LLM Verification} + E -->|Agree| C + E -->|Disagree| F[Use LLM Result] + D -->|No| F + F --> G[Log Discrepancy] +``` + +## Tradeoffs + +| Pros | Cons | +| ---------------------------------------- | ----------------------------------------------------- | +| Lowers cost for interactive sessions | SLMs are weaker for rich narrative creativity | +| Improves safety and consistency | Overuse can make stories feel templated | +| Helps maintain story canon | Compression may lose subtle emotional continuity | +| Enables scalable personalization | Moderation can become too restrictive if tuned poorly | +| Reduces unnecessary LLM for simple steps | Image prompts may lack artistic nuance | + +## Correct Role + +| Use SLM For | Use LLM For | +| --------------- | --------------------------- | +| Preparation | Rich storytelling | +| Guardrails | Emotionally nuanced scenes | +| Continuity | Narrative synthesis | +| Personalization | Creative expansions | +| Prompt shaping | Final polished storytelling | + +## Combined Cross-System Architecture + +```mermaid +flowchart TB + U[Users / Apps / Operators / Dev Events] + + U --> G[AI Gateway] + G --> G1[SLM: intent + safety + routing] + G1 --> G2{Path} + + G2 -->|agentic work| M[Cognitive Mesh] + G2 -->|repo / CI 
work| C[CodeFlow Engine] + G2 -->|tooling / automation| A[AgentKit Forge] + G2 -->|creative storytelling| Y[Mystira] + G2 -->|simple response| S[Small / Mid Model] + G2 -->|deep reasoning| L[Large Model] + + M --> M1[SLM: specialist router] + M1 --> M2[Architecture Agent] + M1 --> M3[Infra Agent] + M1 --> M4[Security Agent] + M1 --> M5[Research / Cost Agent] + M2 --> X[Shared State / Evidence] + M3 --> X + M4 --> X + M5 --> X + X --> M6[SLM: context compressor] + M6 --> L + + C --> C1[SLM: PR / CI classifier] + C1 --> C2[Pipeline policy] + C2 --> C3[Fast path / full path / contract checks] + + A --> A1[SLM: tool selector] + A1 --> A2[GitHub] + A1 --> A3[Azure] + A1 --> A4[Terraform] + A1 --> A5[Kusto / Docs] + + Y --> Y1[SLM: age-fit + moderation + continuity] + Y1 --> Y2[Story Model / Creative LLM] + Y2 --> Y3[SLM: consistency + reading level + image prompt shaping] + + R[PhoenixRooivalk Edge] --> R1[Fusion / Threat Scoring] + R1 --> R2[SLM: operator interpretation] + R2 --> R3[Console / Incident Reports] + + L --> Z[Final synthesis / high-complexity outputs] +``` + +## Platform Comparison + +| Platform | Best SLM Role | Should SLM be Primary? 
| Escalate to LLM When | +| --------------- | ----------------------------------------------- | ------------------------- | ------------------------------------------------ | +| AI Gateway | routing, safety, cost control | **yes** | ambiguity, complex reasoning | +| Cognitive Mesh | agent routing, decomposition, compression | **yes** | cross-agent synthesis needed | +| CodeFlow Engine | PR/CI triage, failure summaries | **yes** | root cause requires deep analysis | +| AgentKit Forge | tool selection, memory shaping | **yes** | planning becomes ambiguous or multi-step | +| PhoenixRooivalk | operator summaries, reports | **no** | strategic analysis or long-form reporting | +| **Mystira** | moderation, age-fit, continuity, prompt shaping | **yes** for control layer | rich storytelling, emotionally nuanced narrative | + +## Key Concerns + +| Concern | Strategy | +| ------------------- | ------------------------------------------------------------ | +| Safety | SLM pre-filter + LLM post-filter + deterministic rules | +| Age-appropriateness | Hard rules for age bands + SLM adaptation | +| Story continuity | SLM validates consistency with plot anchors | +| Cost | Route simple steps through SLM; LLM only for rich generation | +| Creativity | Reserve LLM for emotionally nuanced storytelling | +| Parental controls | Deterministic rules + SLM suggestion refinement | +| Brand consistency | SLM enforces brand guidelines in image prompts | + +## Canonical Principle for Mystira + +> **Use SLMs to make stories safe, consistent, and affordable.** +> **Use LLMs to make them magical.** + +## Implementation Checklist + +- [ ] Add story request classification endpoint +- [ ] Implement age and tone control pipeline +- [ ] Add moderation and safe rewriting +- [ ] Implement session memory compression +- [ ] Add character/world consistency validation +- [ ] Implement illustration prompt shaping +- [ ] Set up cost tracking per session type +- [ ] Configure confidence threshold cascades +- [ 
] Add parental controls integration +- [ ] Implement brand guidelines enforcement +- [ ] Add telemetry and observability +- [ ] Set up fallback deterministic rules diff --git a/docs/architecture/phoenix-rooivalk.md b/docs/architecture/systems/phoenix-rooivalk.md similarity index 100% rename from docs/architecture/phoenix-rooivalk.md rename to docs/architecture/systems/phoenix-rooivalk.md From 4c5e50686d1b05f856841197128c76814f1039aa Mon Sep 17 00:00:00 2001 From: JustAGhosT Date: Sun, 15 Mar 2026 13:20:25 +0200 Subject: [PATCH 11/11] ci: add reusable deploy-environment workflow Refactor deployment process by extracting environment-specific deployment logic into a reusable workflow. This reduces code duplication across dev/staging/prod environments and centralizes deployment configuration parameters. --- .github/workflows/deploy-environment.yaml | 232 +++++++ .github/workflows/deploy.yaml | 572 ++---------------- docs/planning/request_to_token_attribution.md | 8 +- 3 files changed, 286 insertions(+), 526 deletions(-) create mode 100644 .github/workflows/deploy-environment.yaml diff --git a/.github/workflows/deploy-environment.yaml b/.github/workflows/deploy-environment.yaml new file mode 100644 index 0000000..1485e2e --- /dev/null +++ b/.github/workflows/deploy-environment.yaml @@ -0,0 +1,232 @@ +name: Deploy Environment + +on: + workflow_call: + inputs: + env_name: + required: true + type: string + description: Environment name (dev/staging/prod) + tf_state_key: + required: true + type: string + description: Terraform state key (e.g., dev.terraform.tfstate) + codex_model: + required: true + type: string + description: Codex model deployment name + codex_api_version: + required: true + type: string + description: Codex API version + terraform_working_directory: + required: true + type: string + description: Terraform working directory (e.g., infra/env/dev) + smoke_retry_sleep: + required: false + type: string + default: "10" + description: Retry sleep for smoke 
tests + smoke_models_wait_sleep: + required: false + type: string + default: "15" + description: Wait sleep for model registration + include_aoai_host_check: + required: false + type: boolean + default: false + description: Include AOAI endpoint host validation + secrets: + AZURE_OPENAI_ENDPOINT: + required: true + AZURE_OPENAI_API_KEY: + required: true + AZURE_OPENAI_EMBEDDING_ENDPOINT: + required: true + AZURE_OPENAI_EMBEDDING_API_KEY: + required: true + AIGATEWAY_KEY: + required: true + +env: + TF_VAR_env: ${{ inputs.env_name }} + TF_VAR_projname: "aigateway" + TF_VAR_location: "southafricanorth" + TF_VAR_location_short: "san" + TF_VAR_azure_openai_endpoint: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + TF_VAR_azure_openai_api_key: ${{ secrets.AZURE_OPENAI_API_KEY }} + TF_VAR_azure_openai_embedding_endpoint: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} + TF_VAR_azure_openai_embedding_api_key: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} + TF_VAR_gateway_key: ${{ secrets.AIGATEWAY_KEY }} + TF_VAR_codex_model: ${{ inputs.codex_model }} + TF_VAR_codex_api_version: ${{ inputs.codex_api_version }} + TF_VAR_embedding_deployment: "text-embedding-3-large" + TF_VAR_embeddings_api_version: "2024-02-01" + +jobs: + deploy: + runs-on: ubuntu-latest + defaults: + run: + working-directory: ${{ inputs.terraform_working_directory }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Quickcheck required secrets and config + shell: bash + run: | + set -euo pipefail + missing=0 + required=( + AZURE_CLIENT_ID + AZURE_TENANT_ID + AZURE_SUBSCRIPTION_ID + TF_BACKEND_RG + TF_BACKEND_SA + TF_BACKEND_CONTAINER + TF_VAR_azure_openai_endpoint + TF_VAR_azure_openai_api_key + TF_VAR_gateway_key + ) + for v in "${required[@]}"; do + if [ -z "${!v:-}" ]; then + echo "::error::Missing required value: ${v}" + missing=1 + else + echo "${v}=SET" + fi + done + echo "TF_VAR_env=${TF_VAR_env:-unset}" + echo "TF_VAR_embedding_deployment=${TF_VAR_embedding_deployment:-unset}" + echo 
"TF_VAR_codex_model=${TF_VAR_codex_model:-unset}" + if [ -n "${TF_VAR_azure_openai_endpoint:-}" ]; then + echo "Azure OpenAI endpoint=${TF_VAR_azure_openai_endpoint}" + endpoint_host=$(echo "${TF_VAR_azure_openai_endpoint}" | sed -E 's#^https?://([^/]+)/?.*$#\1#') + echo "Azure OpenAI endpoint host=${endpoint_host}" + if [ "${{ inputs.include_aoai_host_check }}" = "true" ] && [ -n "${EXPECTED_AOAI_ENDPOINT_HOST:-}" ] && [ "${endpoint_host}" != "${EXPECTED_AOAI_ENDPOINT_HOST}" ]; then + echo "::error::AOAI endpoint host mismatch. Expected '${EXPECTED_AOAI_ENDPOINT_HOST}', got '${endpoint_host}'. Check environment secret AZURE_OPENAI_ENDPOINT." + missing=1 + fi + fi + if [ "${missing}" -ne 0 ]; then + exit 1 + fi + + - name: Azure Login + uses: azure/login@v2 + with: + client-id: ${{ env.AZURE_CLIENT_ID }} # NOTE(review): a caller's workflow-level env is NOT inherited by reusable workflows — confirm AZURE_CLIENT_ID/AZURE_TENANT_ID/AZURE_SUBSCRIPTION_ID and TF_BACKEND_* are defined in this workflow (e.g. via vars or environment) + tenant-id: ${{ env.AZURE_TENANT_ID }} + subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }} + + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.14.6 + + - name: Terraform Init + run: | + terraform init \ + -backend-config="resource_group_name=${TF_BACKEND_RG}" \ + -backend-config="storage_account_name=${TF_BACKEND_SA}" \ + -backend-config="container_name=${TF_BACKEND_CONTAINER}" \ + -backend-config="key=${{ inputs.tf_state_key }}" + + - name: Import existing Container App into Terraform state + uses: ./.github/actions/import-container-app + with: + projname: ${{ env.TF_VAR_projname }} + env: ${{ env.TF_VAR_env }} + location_short: ${{ env.TF_VAR_location_short }} + subscription_id: ${{ env.AZURE_SUBSCRIPTION_ID }} + terraform_working_directory: ${{ inputs.terraform_working_directory }} + + - name: Terraform Plan + run: | + terraform plan -out=tfplan + + - name: Terraform Apply + run: | + terraform apply -auto-approve tfplan + + - name: Get gateway URL + id: gw + run: echo "url=$(terraform output -raw gateway_url)" >> $GITHUB_OUTPUT + + - name: Get dashboard URL + id: db + run: echo "url=$(terraform output -raw
dashboard_url 2>/dev/null || true)" >> $GITHUB_OUTPUT + + - name: Runtime diagnostics (Container App config) + shell: bash + run: | + set -euo pipefail + RG_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-rg-${TF_VAR_location_short}" + CA_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-ca-${TF_VAR_location_short}" + echo "Resource Group: ${RG_NAME}" + echo "Container App: ${CA_NAME}" + echo "Gateway URL (terraform output): ${{ steps.gw.outputs.url }}" + echo "Latest revision:" + az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.latestRevisionName" -o tsv + echo "Active revisions (name, active, created):" + az containerapp revision list -g "${RG_NAME}" -n "${CA_NAME}" --query "[].{name:name,active:properties.active,created:properties.createdTime}" -o table + echo "Configured env vars for LiteLLM secret refs:" + az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_AZURE_OPENAI_API_KEY' || name=='LITELLM_GATEWAY_KEY']" -o json + echo "Configured secret sources (names + key vault URLs):" + az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.configuration.secrets[].{name:name,keyVaultUrl:keyVaultUrl}" -o table + echo "LITELLM_CONFIG_CONTENT excerpt (first 2000 chars):" + az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_CONFIG_CONTENT'].value | [0]" -o tsv | head -c 2000 || true + echo + + - name: Integration test (Azure OpenAI backend) + shell: bash + env: + AZURE_OPENAI_ENDPOINT: ${{ env.TF_VAR_azure_openai_endpoint }} + AZURE_OPENAI_API_KEY: ${{ env.TF_VAR_azure_openai_api_key }} + AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ env.TF_VAR_azure_openai_embedding_endpoint }} + AZURE_OPENAI_EMBEDDING_API_KEY: ${{ env.TF_VAR_azure_openai_embedding_api_key }} + AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ env.TF_VAR_embedding_deployment }} + AZURE_OPENAI_API_VERSION: ${{ env.TF_VAR_embeddings_api_version }} + 
AZURE_OPENAI_CHAT_DEPLOYMENT: "gpt-4.1" + AZURE_OPENAI_CHAT_API_VERSION: ${{ env.TF_VAR_codex_api_version }} + AZURE_OPENAI_CODEX_MODEL: ${{ env.TF_VAR_codex_model }} + working-directory: ${{ github.workspace }} + run: python3 scripts/integration_test.py + + - name: Smoke test gateway (embeddings + responses) + uses: ./.github/actions/smoke-test-gateway + with: + gateway_url: ${{ steps.gw.outputs.url }} + gateway_key: ${{ secrets.AIGATEWAY_KEY }} + embedding_model: ${{ env.TF_VAR_embedding_deployment }} + codex_model: ${{ env.TF_VAR_codex_model }} + aoai_endpoint: ${{ env.TF_VAR_azure_openai_endpoint }} + aoai_api_key: ${{ env.TF_VAR_azure_openai_api_key }} + max_attempts: "3" + retry_sleep: ${{ inputs.smoke_retry_sleep }} + models_wait_attempts: ${{ inputs.env_name == 'prod' && '3' || '1' }} + models_wait_sleep: ${{ inputs.smoke_models_wait_sleep }} + + - name: Smoke test shared state API (dashboard proxy) + if: env.TF_VAR_state_service_container_image != '' # NOTE(review): this env var is never defined in this workflow's env block, so the step is always skipped — confirm intended wiring + shell: bash + run: | + set -euo pipefail + DASHBOARD_URL="${{ steps.db.outputs.url }}" + TEST_USER="ci-smoke-${TF_VAR_env}" + + curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/catalog" > /tmp/catalog.json + + curl -fsS --connect-timeout 5 --max-time 15 -X PUT "${DASHBOARD_URL}/api/state/selection" \ + -H "Content-Type: application/json" \ + -H "X-User-Id: ${TEST_USER}" \ + -d '{"enabled":true,"selected_model":"'"${TF_VAR_codex_model}"'"}' > /tmp/selection-put.json + + curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/selection" \ + -H "X-User-Id: ${TEST_USER}" > /tmp/selection-get.json + + jq -e '.enabled == true' /tmp/selection-get.json > /dev/null diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 2a0e590..59ece73 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -149,539 +149,67 @@ jobs: deploy-dev: name: Deploy dev needs: plan - runs-on: ubuntu-latest if: github.event_name == 'pull_request' &&
github.event.pull_request.base.ref == 'dev' environment: dev - defaults: - run: - working-directory: infra/env/dev - - env: - TF_VAR_env: "dev" - TF_VAR_projname: "aigateway" - TF_VAR_location: "southafricanorth" - TF_VAR_location_short: "san" - TF_VAR_azure_openai_endpoint: ${{ secrets.AZURE_OPENAI_ENDPOINT }} - TF_VAR_azure_openai_api_key: ${{ secrets.AZURE_OPENAI_API_KEY }} - TF_VAR_azure_openai_embedding_endpoint: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} - TF_VAR_azure_openai_embedding_api_key: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} - TF_VAR_gateway_key: ${{ secrets.AIGATEWAY_KEY }} - TF_VAR_codex_model: "gpt-5.3-codex" - TF_VAR_codex_api_version: "2025-04-01-preview" - TF_VAR_embedding_deployment: "text-embedding-3-large" - TF_VAR_embeddings_api_version: "2024-02-01" - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Quickcheck required secrets and config - shell: bash - run: | - set -euo pipefail - missing=0 - required=( - AZURE_CLIENT_ID - AZURE_TENANT_ID - AZURE_SUBSCRIPTION_ID - TF_BACKEND_RG - TF_BACKEND_SA - TF_BACKEND_CONTAINER - TF_VAR_azure_openai_endpoint - TF_VAR_azure_openai_api_key - TF_VAR_gateway_key - ) - for v in "${required[@]}"; do - if [ -z "${!v:-}" ]; then - echo "::error::Missing required value: ${v}" - missing=1 - else - echo "${v}=SET" - fi - done - echo "TF_VAR_env=${TF_VAR_env:-unset}" - echo "TF_VAR_embedding_deployment=${TF_VAR_embedding_deployment:-unset}" - echo "TF_VAR_codex_model=${TF_VAR_codex_model:-unset}" - if [ -n "${TF_VAR_azure_openai_endpoint:-}" ]; then - echo "Azure OpenAI endpoint=${TF_VAR_azure_openai_endpoint}" - endpoint_host=$(echo "${TF_VAR_azure_openai_endpoint}" | sed -E 's#^https?://([^/]+)/?.*$#\1#') - echo "Azure OpenAI endpoint host=${endpoint_host}" - fi - if [ "${missing}" -ne 0 ]; then - exit 1 - fi - - - name: Azure Login - uses: azure/login@v2 - with: - client-id: ${{ env.AZURE_CLIENT_ID }} - tenant-id: ${{ env.AZURE_TENANT_ID }} - subscription-id: ${{ 
env.AZURE_SUBSCRIPTION_ID }} - - - name: Setup Terraform - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: 1.14.6 - - - name: Terraform Init - run: | - terraform init \ - -backend-config="resource_group_name=${TF_BACKEND_RG}" \ - -backend-config="storage_account_name=${TF_BACKEND_SA}" \ - -backend-config="container_name=${TF_BACKEND_CONTAINER}" \ - -backend-config="key=dev.terraform.tfstate" - - - name: Import existing Container App into Terraform state - uses: ./.github/actions/import-container-app - with: - projname: ${{ env.TF_VAR_projname }} - env: ${{ env.TF_VAR_env }} - location_short: ${{ env.TF_VAR_location_short }} - subscription_id: ${{ env.AZURE_SUBSCRIPTION_ID }} - terraform_working_directory: infra/env/dev - - - name: Terraform Plan - run: | - terraform plan -out=tfplan - - - name: Terraform Apply - run: | - terraform apply -auto-approve tfplan - - - name: Get gateway URL - id: gw - run: echo "url=$(terraform output -raw gateway_url)" >> $GITHUB_OUTPUT - - - name: Get dashboard URL - id: db - run: echo "url=$(terraform output -raw dashboard_url 2>/dev/null || true)" >> $GITHUB_OUTPUT - - - name: Runtime diagnostics (Container App config) - shell: bash - run: | - set -euo pipefail - RG_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-rg-${TF_VAR_location_short}" - CA_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-ca-${TF_VAR_location_short}" - echo "Resource Group: ${RG_NAME}" - echo "Container App: ${CA_NAME}" - echo "Gateway URL (terraform output): ${{ steps.gw.outputs.url }}" - echo "Latest revision:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.latestRevisionName" -o tsv - echo "Active revisions (name, active, created):" - az containerapp revision list -g "${RG_NAME}" -n "${CA_NAME}" --query "[].{name:name,active:properties.active,created:properties.createdTime}" -o table - echo "Configured env vars for LiteLLM secret refs:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query 
"properties.template.containers[0].env[?name=='LITELLM_AZURE_OPENAI_API_KEY' || name=='LITELLM_GATEWAY_KEY']" -o json - echo "Configured secret sources (names + key vault URLs):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.configuration.secrets[].{name:name,keyVaultUrl:keyVaultUrl}" -o table - echo "LITELLM_CONFIG_CONTENT excerpt (first 2000 chars):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_CONFIG_CONTENT'].value | [0]" -o tsv | head -c 2000 || true - echo - - - name: Integration test (Azure OpenAI backend) - shell: bash - env: - AZURE_OPENAI_ENDPOINT: ${{ env.TF_VAR_azure_openai_endpoint }} - AZURE_OPENAI_API_KEY: ${{ env.TF_VAR_azure_openai_api_key }} - AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ env.TF_VAR_azure_openai_embedding_endpoint }} - AZURE_OPENAI_EMBEDDING_API_KEY: ${{ env.TF_VAR_azure_openai_embedding_api_key }} - AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ env.TF_VAR_embedding_deployment }} - AZURE_OPENAI_API_VERSION: ${{ env.TF_VAR_embeddings_api_version }} - AZURE_OPENAI_CHAT_DEPLOYMENT: "gpt-4.1" - AZURE_OPENAI_CHAT_API_VERSION: ${{ env.TF_VAR_codex_api_version }} - AZURE_OPENAI_CODEX_MODEL: ${{ env.TF_VAR_codex_model }} - working-directory: ${{ github.workspace }} - run: python3 scripts/integration_test.py - - - name: Smoke test gateway (embeddings + responses) - uses: ./.github/actions/smoke-test-gateway - with: - gateway_url: ${{ steps.gw.outputs.url }} - gateway_key: ${{ secrets.AIGATEWAY_KEY }} - embedding_model: ${{ env.TF_VAR_embedding_deployment }} - codex_model: ${{ env.TF_VAR_codex_model }} - aoai_endpoint: ${{ env.TF_VAR_azure_openai_endpoint }} - aoai_api_key: ${{ env.TF_VAR_azure_openai_api_key }} - max_attempts: "3" - retry_sleep: "10" - - - name: Smoke test shared state API (dashboard proxy) - if: env.TF_VAR_state_service_container_image != '' - shell: bash - run: | - set -euo pipefail - DASHBOARD_URL="${{ steps.db.outputs.url }}" - 
TEST_USER="ci-smoke-${TF_VAR_env}" - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/catalog" > /tmp/catalog.json - - curl -fsS --connect-timeout 5 --max-time 15 -X PUT "${DASHBOARD_URL}/api/state/selection" \ - -H "Content-Type: application/json" \ - -H "X-User-Id: ${TEST_USER}" \ - -d '{"enabled":true,"selected_model":"'"${TF_VAR_codex_model}"'"}' > /tmp/selection-put.json - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/selection" \ - -H "X-User-Id: ${TEST_USER}" > /tmp/selection-get.json - - jq -e '.enabled == true' /tmp/selection-get.json > /dev/null + uses: ./.github/workflows/deploy-environment.yaml + with: + env_name: dev + tf_state_key: dev.terraform.tfstate + codex_model: gpt-5.3-codex + codex_api_version: 2025-04-01-preview + terraform_working_directory: infra/env/dev + smoke_retry_sleep: "10" + smoke_models_wait_sleep: "15" + include_aoai_host_check: false + secrets: + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} + AZURE_OPENAI_EMBEDDING_API_KEY: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} + AIGATEWAY_KEY: ${{ secrets.AIGATEWAY_KEY }} deploy-staging: name: Deploy staging needs: plan - runs-on: ubuntu-latest if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-staging') environment: staging - defaults: - run: - working-directory: infra/env/staging - - env: - TF_VAR_env: "staging" - TF_VAR_projname: "aigateway" - TF_VAR_location: "southafricanorth" - TF_VAR_location_short: "san" - TF_VAR_azure_openai_endpoint: ${{ secrets.AZURE_OPENAI_ENDPOINT }} - TF_VAR_azure_openai_api_key: ${{ secrets.AZURE_OPENAI_API_KEY }} - TF_VAR_azure_openai_embedding_endpoint: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} - TF_VAR_azure_openai_embedding_api_key: ${{ 
secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} - TF_VAR_gateway_key: ${{ secrets.AIGATEWAY_KEY }} - TF_VAR_codex_model: "gpt-5.3-codex" - TF_VAR_codex_api_version: "2025-04-01-preview" - TF_VAR_embedding_deployment: "text-embedding-3-large" - TF_VAR_embeddings_api_version: "2024-02-01" - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Quickcheck required secrets and config - shell: bash - run: | - set -euo pipefail - missing=0 - required=( - AZURE_CLIENT_ID - AZURE_TENANT_ID - AZURE_SUBSCRIPTION_ID - TF_BACKEND_RG - TF_BACKEND_SA - TF_BACKEND_CONTAINER - TF_VAR_azure_openai_endpoint - TF_VAR_azure_openai_api_key - TF_VAR_gateway_key - ) - for v in "${required[@]}"; do - if [ -z "${!v:-}" ]; then - echo "::error::Missing required value: ${v}" - missing=1 - else - echo "${v}=SET" - fi - done - echo "TF_VAR_env=${TF_VAR_env:-unset}" - echo "TF_VAR_embedding_deployment=${TF_VAR_embedding_deployment:-unset}" - echo "TF_VAR_codex_model=${TF_VAR_codex_model:-unset}" - if [ -n "${TF_VAR_azure_openai_endpoint:-}" ]; then - echo "Azure OpenAI endpoint=${TF_VAR_azure_openai_endpoint}" - endpoint_host=$(echo "${TF_VAR_azure_openai_endpoint}" | sed -E 's#^https?://([^/]+)/?.*$#\1#') - echo "Azure OpenAI endpoint host=${endpoint_host}" - fi - if [ "${missing}" -ne 0 ]; then - exit 1 - fi - - - name: Azure Login - uses: azure/login@v2 - with: - client-id: ${{ env.AZURE_CLIENT_ID }} - tenant-id: ${{ env.AZURE_TENANT_ID }} - subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }} - - - name: Setup Terraform - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: 1.14.6 - - - name: Terraform Init - run: | - terraform init \ - -backend-config="resource_group_name=${TF_BACKEND_RG}" \ - -backend-config="storage_account_name=${TF_BACKEND_SA}" \ - -backend-config="container_name=${TF_BACKEND_CONTAINER}" \ - -backend-config="key=staging.terraform.tfstate" - - - name: Import existing Container App into Terraform state - uses: ./.github/actions/import-container-app - 
with: - projname: ${{ env.TF_VAR_projname }} - env: ${{ env.TF_VAR_env }} - location_short: ${{ env.TF_VAR_location_short }} - subscription_id: ${{ env.AZURE_SUBSCRIPTION_ID }} - terraform_working_directory: infra/env/staging - - - name: Terraform Plan - run: | - terraform plan -out=tfplan - - - name: Terraform Apply - run: | - terraform apply -auto-approve tfplan - - - name: Get gateway URL - id: gw - run: echo "url=$(terraform output -raw gateway_url)" >> $GITHUB_OUTPUT - - - name: Get dashboard URL - id: db - run: echo "url=$(terraform output -raw dashboard_url 2>/dev/null || true)" >> $GITHUB_OUTPUT - - - name: Runtime diagnostics (Container App config) - shell: bash - run: | - set -euo pipefail - RG_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-rg-${TF_VAR_location_short}" - CA_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-ca-${TF_VAR_location_short}" - echo "Resource Group: ${RG_NAME}" - echo "Container App: ${CA_NAME}" - echo "Gateway URL (terraform output): ${{ steps.gw.outputs.url }}" - echo "Latest revision:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.latestRevisionName" -o tsv - echo "Active revisions (name, active, created):" - az containerapp revision list -g "${RG_NAME}" -n "${CA_NAME}" --query "[].{name:name,active:properties.active,created:properties.createdTime}" -o table - echo "Configured env vars for LiteLLM secret refs:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_AZURE_OPENAI_API_KEY' || name=='LITELLM_GATEWAY_KEY']" -o json - echo "Configured secret sources (names + key vault URLs):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.configuration.secrets[].{name:name,keyVaultUrl:keyVaultUrl}" -o table - echo "LITELLM_CONFIG_CONTENT excerpt (first 2000 chars):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_CONFIG_CONTENT'].value | [0]" -o tsv | head 
-c 2000 || true - echo - - - name: Integration test (Azure OpenAI backend) - shell: bash - env: - AZURE_OPENAI_ENDPOINT: ${{ env.TF_VAR_azure_openai_endpoint }} - AZURE_OPENAI_API_KEY: ${{ env.TF_VAR_azure_openai_api_key }} - AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ env.TF_VAR_azure_openai_embedding_endpoint }} - AZURE_OPENAI_EMBEDDING_API_KEY: ${{ env.TF_VAR_azure_openai_embedding_api_key }} - AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ env.TF_VAR_embedding_deployment }} - AZURE_OPENAI_API_VERSION: ${{ env.TF_VAR_embeddings_api_version }} - AZURE_OPENAI_CHAT_DEPLOYMENT: "gpt-4.1" - AZURE_OPENAI_CHAT_API_VERSION: ${{ env.TF_VAR_codex_api_version }} - AZURE_OPENAI_CODEX_MODEL: ${{ env.TF_VAR_codex_model }} - working-directory: ${{ github.workspace }} - run: python3 scripts/integration_test.py - - - name: Smoke test gateway (embeddings + responses) - uses: ./.github/actions/smoke-test-gateway - with: - gateway_url: ${{ steps.gw.outputs.url }} - gateway_key: ${{ secrets.AIGATEWAY_KEY }} - embedding_model: ${{ env.TF_VAR_embedding_deployment }} - codex_model: ${{ env.TF_VAR_codex_model }} - aoai_endpoint: ${{ env.TF_VAR_azure_openai_endpoint }} - aoai_api_key: ${{ env.TF_VAR_azure_openai_api_key }} - max_attempts: "3" - retry_sleep: "10" - - - name: Smoke test shared state API (dashboard proxy) - if: env.TF_VAR_state_service_container_image != '' - shell: bash - run: | - set -euo pipefail - DASHBOARD_URL="${{ steps.db.outputs.url }}" - TEST_USER="ci-smoke-${TF_VAR_env}" - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/catalog" > /tmp/catalog.json - - curl -fsS --connect-timeout 5 --max-time 15 -X PUT "${DASHBOARD_URL}/api/state/selection" \ - -H "Content-Type: application/json" \ - -H "X-User-Id: ${TEST_USER}" \ - -d '{"enabled":true,"selected_model":"'"${TF_VAR_codex_model}"'"}' > /tmp/selection-put.json - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/selection" \ - -H "X-User-Id: ${TEST_USER}" > /tmp/selection-get.json - 
- jq -e '.enabled == true' /tmp/selection-get.json > /dev/null + uses: ./.github/workflows/deploy-environment.yaml + with: + env_name: staging + tf_state_key: staging.terraform.tfstate + codex_model: gpt-5.3-codex + codex_api_version: 2025-04-01-preview + terraform_working_directory: infra/env/staging + smoke_retry_sleep: "10" + smoke_models_wait_sleep: "15" + include_aoai_host_check: false + secrets: + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} + AZURE_OPENAI_EMBEDDING_API_KEY: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} + AIGATEWAY_KEY: ${{ secrets.AIGATEWAY_KEY }} deploy-prod: name: Deploy prod needs: plan - runs-on: ubuntu-latest if: github.event_name == 'workflow_dispatch' || (github.event_name == 'push' && github.ref == 'refs/heads/main') environment: prod - defaults: - run: - working-directory: infra/env/prod - - env: - TF_VAR_env: "prod" - TF_VAR_projname: "aigateway" - TF_VAR_location: "southafricanorth" - TF_VAR_location_short: "san" - TF_VAR_azure_openai_endpoint: ${{ secrets.AZURE_OPENAI_ENDPOINT }} - TF_VAR_azure_openai_api_key: ${{ secrets.AZURE_OPENAI_API_KEY }} - TF_VAR_azure_openai_embedding_endpoint: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} - TF_VAR_azure_openai_embedding_api_key: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} - TF_VAR_gateway_key: ${{ secrets.AIGATEWAY_KEY }} - TF_VAR_codex_model: "gpt-4o" - TF_VAR_codex_api_version: "2025-01-01-preview" - TF_VAR_embedding_deployment: "text-embedding-3-large" - TF_VAR_embeddings_api_version: "2024-02-01" - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Quickcheck required secrets and config - shell: bash - run: | - set -euo pipefail - missing=0 - required=( - AZURE_CLIENT_ID - AZURE_TENANT_ID - AZURE_SUBSCRIPTION_ID - TF_BACKEND_RG - TF_BACKEND_SA - TF_BACKEND_CONTAINER - TF_VAR_azure_openai_endpoint - 
TF_VAR_azure_openai_api_key - TF_VAR_gateway_key - ) - for v in "${required[@]}"; do - if [ -z "${!v:-}" ]; then - echo "::error::Missing required value: ${v}" - missing=1 - else - echo "${v}=SET" - fi - done - echo "TF_VAR_env=${TF_VAR_env:-unset}" - echo "TF_VAR_embedding_deployment=${TF_VAR_embedding_deployment:-unset}" - echo "TF_VAR_codex_model=${TF_VAR_codex_model:-unset}" - if [ -n "${TF_VAR_azure_openai_endpoint:-}" ]; then - echo "Azure OpenAI endpoint=${TF_VAR_azure_openai_endpoint}" - endpoint_host=$(echo "${TF_VAR_azure_openai_endpoint}" | sed -E 's#^https?://([^/]+)/?.*$#\1#') - echo "Azure OpenAI endpoint host=${endpoint_host}" - if [ -n "${EXPECTED_AOAI_ENDPOINT_HOST:-}" ] && [ "${endpoint_host}" != "${EXPECTED_AOAI_ENDPOINT_HOST}" ]; then - echo "::error::Prod AOAI endpoint host mismatch. Expected '${EXPECTED_AOAI_ENDPOINT_HOST}', got '${endpoint_host}'. Check environment secret AZURE_OPENAI_ENDPOINT." - missing=1 - fi - fi - if [ "${missing}" -ne 0 ]; then - exit 1 - fi - - - name: Azure Login - uses: azure/login@v2 - with: - client-id: ${{ env.AZURE_CLIENT_ID }} - tenant-id: ${{ env.AZURE_TENANT_ID }} - subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }} - - - name: Setup Terraform - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: 1.14.6 - - - name: Terraform Init - run: | - terraform init \ - -backend-config="resource_group_name=${TF_BACKEND_RG}" \ - -backend-config="storage_account_name=${TF_BACKEND_SA}" \ - -backend-config="container_name=${TF_BACKEND_CONTAINER}" \ - -backend-config="key=prod.terraform.tfstate" - - - name: Import existing Container App into Terraform state - uses: ./.github/actions/import-container-app - with: - projname: ${{ env.TF_VAR_projname }} - env: ${{ env.TF_VAR_env }} - location_short: ${{ env.TF_VAR_location_short }} - subscription_id: ${{ env.AZURE_SUBSCRIPTION_ID }} - terraform_working_directory: infra/env/prod - - - name: Terraform Plan - run: | - terraform plan -out=tfplan - - - name: Terraform Apply 
- run: | - terraform apply -auto-approve tfplan - - - name: Get gateway URL - id: gw - run: echo "url=$(terraform output -raw gateway_url)" >> $GITHUB_OUTPUT - - - name: Get dashboard URL - id: db - run: echo "url=$(terraform output -raw dashboard_url 2>/dev/null || true)" >> $GITHUB_OUTPUT - - - name: Runtime diagnostics (Container App config) - shell: bash - run: | - set -euo pipefail - RG_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-rg-${TF_VAR_location_short}" - CA_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-ca-${TF_VAR_location_short}" - echo "Resource Group: ${RG_NAME}" - echo "Container App: ${CA_NAME}" - echo "Gateway URL (terraform output): ${{ steps.gw.outputs.url }}" - echo "Latest revision:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.latestRevisionName" -o tsv - echo "Active revisions (name, active, created):" - az containerapp revision list -g "${RG_NAME}" -n "${CA_NAME}" --query "[].{name:name,active:properties.active,created:properties.createdTime}" -o table - echo "Configured env vars for LiteLLM secret refs:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_AZURE_OPENAI_API_KEY' || name=='LITELLM_GATEWAY_KEY']" -o json - echo "Configured secret sources (names + key vault URLs):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.configuration.secrets[].{name:name,keyVaultUrl:keyVaultUrl}" -o table - echo "LITELLM_CONFIG_CONTENT excerpt (first 2000 chars):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_CONFIG_CONTENT'].value | [0]" -o tsv | head -c 2000 || true - echo - - - name: Integration test (Azure OpenAI backend) - shell: bash - env: - AZURE_OPENAI_ENDPOINT: ${{ env.TF_VAR_azure_openai_endpoint }} - AZURE_OPENAI_API_KEY: ${{ env.TF_VAR_azure_openai_api_key }} - AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ env.TF_VAR_azure_openai_embedding_endpoint }} - 
AZURE_OPENAI_EMBEDDING_API_KEY: ${{ env.TF_VAR_azure_openai_embedding_api_key }} - AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ env.TF_VAR_embedding_deployment }} - AZURE_OPENAI_API_VERSION: ${{ env.TF_VAR_embeddings_api_version }} - AZURE_OPENAI_CHAT_DEPLOYMENT: "gpt-4.1" - AZURE_OPENAI_CHAT_API_VERSION: ${{ env.TF_VAR_codex_api_version }} - AZURE_OPENAI_CODEX_MODEL: ${{ env.TF_VAR_codex_model }} - working-directory: ${{ github.workspace }} - run: python3 scripts/integration_test.py - - - name: Smoke test gateway (embeddings + responses) - uses: ./.github/actions/smoke-test-gateway - with: - gateway_url: ${{ steps.gw.outputs.url }} - gateway_key: ${{ secrets.AIGATEWAY_KEY }} - embedding_model: ${{ env.TF_VAR_embedding_deployment }} - codex_model: ${{ env.TF_VAR_codex_model }} - aoai_endpoint: ${{ env.TF_VAR_azure_openai_endpoint }} - aoai_api_key: ${{ env.TF_VAR_azure_openai_api_key }} - max_attempts: "3" - retry_sleep: "15" # prod: longer cold-start; allow more time between retries - models_wait_attempts: "3" # prod: wait longer for LiteLLM to register healthy deployments - models_wait_sleep: "30" - - - name: Smoke test shared state API (dashboard proxy) - if: env.TF_VAR_state_service_container_image != '' - shell: bash - run: | - set -euo pipefail - DASHBOARD_URL="${{ steps.db.outputs.url }}" - TEST_USER="ci-smoke-${TF_VAR_env}" - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/catalog" > /tmp/catalog.json - - curl -fsS --connect-timeout 5 --max-time 15 -X PUT "${DASHBOARD_URL}/api/state/selection" \ - -H "Content-Type: application/json" \ - -H "X-User-Id: ${TEST_USER}" \ - -d '{"enabled":true,"selected_model":"'"${TF_VAR_codex_model}"'"}' > /tmp/selection-put.json - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/selection" \ - -H "X-User-Id: ${TEST_USER}" > /tmp/selection-get.json - - jq -e '.enabled == true' /tmp/selection-get.json > /dev/null + uses: ./.github/workflows/deploy-environment.yaml + with: + env_name: 
prod + tf_state_key: prod.terraform.tfstate + codex_model: gpt-4o + codex_api_version: 2025-01-01-preview + terraform_working_directory: infra/env/prod + smoke_retry_sleep: "15" + smoke_models_wait_sleep: "30" + include_aoai_host_check: true + secrets: + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} + AZURE_OPENAI_EMBEDDING_API_KEY: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} + AIGATEWAY_KEY: ${{ secrets.AIGATEWAY_KEY }} + + # Legacy inline deployments removed - see deploy-environment.yaml. NOTE(review): the caller jobs above combine 'environment:' with 'uses:', which GitHub Actions rejects on jobs that call reusable workflows — move the environment into deploy-environment.yaml's deploy job (environment: inputs.env_name) and drop it from the callers. diff --git a/docs/planning/request_to_token_attribution.md b/docs/planning/request_to_token_attribution.md index 89a7a49..d5790c1 100644 --- a/docs/planning/request_to_token_attribution.md +++ b/docs/planning/request_to_token_attribution.md @@ -122,7 +122,7 @@ Start with downstream aggregation in pvc-costops-analytics - the cheapest and fa ### 1. cognitive-mesh (Upstream Caller) -**Required:** Must pass correlation headers when calling gateway. There are two methods: +**Required:** Pass correlation metadata in the request body when calling gateway.
There are two methods: **Method A: Via Request Metadata (Recommended)** Pass correlation IDs in the request body `metadata` field: @@ -216,9 +216,9 @@ _Note: Method B requires additional LiteLLM configuration or middleware._ ## Dependencies -- cognitive-mesh: Must pass correlation metadata to gateway +- cognitive-mesh: Pass correlation metadata in the request body - pvc-costops-analytics: Must create KQL queries for new event shape -- infra: Application Insights being added for trace storage +- infra: Application Insights resource + APPLICATIONINSIGHTS_CONNECTION_STRING wiring added; trace export requires a custom LiteLLM image (with azure-monitor-opentelemetry) or explicit OTEL_EXPORTER_OTLP_ENDPOINT configuration (currently empty by default) ## Action Items @@ -226,7 +226,7 @@ _Note: Method B requires additional LiteLLM configuration or middleware._ 1. ✅ ai-gateway: Add OTEL callback for token telemetry (Phase 1) 2. ✅ ai-gateway: Document correlation ID requirements (Phase 2) -3. ✅ ai-gateway: Add Application Insights for trace storage (Phase 1b) +3. ✅ ai-gateway: Add Application Insights connection string wiring (Phase 1b - trace export requires custom image or OTLP collector) ### Pending