diff --git a/.github/actions/import-container-app/action.yml b/.github/actions/import-container-app/action.yml index cdd6872..5950bca 100644 --- a/.github/actions/import-container-app/action.yml +++ b/.github/actions/import-container-app/action.yml @@ -10,7 +10,7 @@ inputs: description: Project name component of the Container App name (TF_VAR_projname) env: required: true - description: Environment name (dev|uat|prod) + description: Environment name (dev|staging|prod) location_short: required: true description: Short location code (TF_VAR_location_short) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 47b33b7..2081df5 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -15,10 +15,10 @@ - [ ] No environment/config changes required - [ ] Environment/config changes required (describe below) -## UAT Toggle (PRs to `main`) +## Staging Toggle (PRs to `main`) -- Add label `run-uat` to this PR to enable UAT deployment (`deploy-uat`). -- Remove label `run-uat` to skip UAT deployment. +- Add label `run-staging` to this PR to enable staging deployment (`deploy-staging`). +- Remove label `run-staging` to skip staging deployment. 
## Risk / Rollback diff --git a/.github/workflows/deploy-environment.yaml b/.github/workflows/deploy-environment.yaml new file mode 100644 index 0000000..1485e2e --- /dev/null +++ b/.github/workflows/deploy-environment.yaml @@ -0,0 +1,232 @@ +name: Deploy Environment + +on: + workflow_call: + inputs: + env_name: + required: true + type: string + description: Environment name (dev/staging/prod) + tf_state_key: + required: true + type: string + description: Terraform state key (e.g., dev.terraform.tfstate) + codex_model: + required: true + type: string + description: Codex model deployment name + codex_api_version: + required: true + type: string + description: Codex API version + terraform_working_directory: + required: true + type: string + description: Terraform working directory (e.g., infra/env/dev) + smoke_retry_sleep: + required: false + type: string + default: "10" + description: Retry sleep for smoke tests + smoke_models_wait_sleep: + required: false + type: string + default: "15" + description: Wait sleep for model registration + include_aoai_host_check: + required: false + type: boolean + default: false + description: Include AOAI endpoint host validation + secrets: + AZURE_OPENAI_ENDPOINT: + required: true + AZURE_OPENAI_API_KEY: + required: true + AZURE_OPENAI_EMBEDDING_ENDPOINT: + required: true + AZURE_OPENAI_EMBEDDING_API_KEY: + required: true + AIGATEWAY_KEY: + required: true + +env: + TF_VAR_env: ${{ inputs.env_name }} + TF_VAR_projname: "aigateway" + TF_VAR_location: "southafricanorth" + TF_VAR_location_short: "san" + TF_VAR_azure_openai_endpoint: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + TF_VAR_azure_openai_api_key: ${{ secrets.AZURE_OPENAI_API_KEY }} + TF_VAR_azure_openai_embedding_endpoint: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} + TF_VAR_azure_openai_embedding_api_key: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} + TF_VAR_gateway_key: ${{ secrets.AIGATEWAY_KEY }} + TF_VAR_codex_model: ${{ inputs.codex_model }} + TF_VAR_codex_api_version: 
${{ inputs.codex_api_version }} + TF_VAR_embedding_deployment: "text-embedding-3-large" + TF_VAR_embeddings_api_version: "2024-02-01" + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + TF_BACKEND_RG: ${{ secrets.TF_BACKEND_RG }} + TF_BACKEND_SA: ${{ secrets.TF_BACKEND_SA }} + TF_BACKEND_CONTAINER: ${{ secrets.TF_BACKEND_CONTAINER }} + +jobs: + deploy: + runs-on: ubuntu-latest + environment: ${{ inputs.env_name }} + defaults: + run: + working-directory: ${{ inputs.terraform_working_directory }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Quickcheck required secrets and config + shell: bash + run: | + set -euo pipefail + missing=0 + required=( + AZURE_CLIENT_ID + AZURE_TENANT_ID + AZURE_SUBSCRIPTION_ID + TF_BACKEND_RG + TF_BACKEND_SA + TF_BACKEND_CONTAINER + TF_VAR_azure_openai_endpoint + TF_VAR_azure_openai_api_key + TF_VAR_gateway_key + ) + for v in "${required[@]}"; do + if [ -z "${!v:-}" ]; then + echo "::error::Missing required value: ${v}" + missing=1 + else + echo "${v}=SET" + fi + done + echo "TF_VAR_env=${TF_VAR_env:-unset}" + echo "TF_VAR_embedding_deployment=${TF_VAR_embedding_deployment:-unset}" + echo "TF_VAR_codex_model=${TF_VAR_codex_model:-unset}" + if [ -n "${TF_VAR_azure_openai_endpoint:-}" ]; then + echo "Azure OpenAI endpoint=${TF_VAR_azure_openai_endpoint}" + endpoint_host=$(echo "${TF_VAR_azure_openai_endpoint}" | sed -E 's#^https?://([^/]+)/?.*$#\1#') + echo "Azure OpenAI endpoint host=${endpoint_host}" + if [ "${{ inputs.include_aoai_host_check }}" = "true" ] && [ -n "${EXPECTED_AOAI_ENDPOINT_HOST:-}" ] && [ "${endpoint_host}" != "${EXPECTED_AOAI_ENDPOINT_HOST}" ]; then + echo "::error::Prod AOAI endpoint host mismatch. Expected '${EXPECTED_AOAI_ENDPOINT_HOST}', got '${endpoint_host}'. Check environment secret AZURE_OPENAI_ENDPOINT." 
+ missing=1 + fi + fi + if [ "${missing}" -ne 0 ]; then + exit 1 + fi + + - name: Azure Login + uses: azure/login@v2 + with: + client-id: ${{ env.AZURE_CLIENT_ID }} + tenant-id: ${{ env.AZURE_TENANT_ID }} + subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }} + + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.14.6 + + - name: Terraform Init + run: | + terraform init \ + -backend-config="resource_group_name=${TF_BACKEND_RG}" \ + -backend-config="storage_account_name=${TF_BACKEND_SA}" \ + -backend-config="container_name=${TF_BACKEND_CONTAINER}" \ + -backend-config="key=${{ inputs.tf_state_key }}" + + - name: Import existing Container App into Terraform state + uses: ./.github/actions/import-container-app + with: + projname: ${{ env.TF_VAR_projname }} + env: ${{ env.TF_VAR_env }} + location_short: ${{ env.TF_VAR_location_short }} + subscription_id: ${{ env.AZURE_SUBSCRIPTION_ID }} + terraform_working_directory: ${{ inputs.terraform_working_directory }} + + - name: Terraform Plan + run: | + terraform plan -out=tfplan + + - name: Terraform Apply + run: | + terraform apply -auto-approve tfplan + + - name: Get gateway URL + id: gw + run: echo "url=$(terraform output -raw gateway_url)" >> $GITHUB_OUTPUT + + - name: Get dashboard URL + id: db + run: echo "url=$(terraform output -raw dashboard_url 2>/dev/null || true)" >> $GITHUB_OUTPUT + + - name: Runtime diagnostics (Container App config) + shell: bash + run: | + set -euo pipefail + RG_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-rg-${TF_VAR_location_short}" + CA_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-ca-${TF_VAR_location_short}" + echo "Resource Group: ${RG_NAME}" + echo "Container App: ${CA_NAME}" + echo "Gateway URL (terraform output): ${{ steps.gw.outputs.url }}" + echo "Latest revision:" + az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.latestRevisionName" -o tsv + echo "Active revisions (name, active, created):" + az containerapp revision list -g 
"${RG_NAME}" -n "${CA_NAME}" --query "[].{name:name,active:properties.active,created:properties.createdTime}" -o table + echo "Configured env vars for LiteLLM secret refs:" + az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_AZURE_OPENAI_API_KEY' || name=='LITELLM_GATEWAY_KEY']" -o json + echo "Configured secret sources (names + key vault URLs):" + az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.configuration.secrets[].{name:name,keyVaultUrl:keyVaultUrl}" -o table + echo "LITELLM_CONFIG_CONTENT excerpt (first 2000 chars):" + az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_CONFIG_CONTENT'].value | [0]" -o tsv | head -c 2000 || true + echo + + - name: Integration test (Azure OpenAI backend) + shell: bash + env: + AZURE_OPENAI_ENDPOINT: ${{ env.TF_VAR_azure_openai_endpoint }} + AZURE_OPENAI_API_KEY: ${{ env.TF_VAR_azure_openai_api_key }} + AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ env.TF_VAR_azure_openai_embedding_endpoint }} + AZURE_OPENAI_EMBEDDING_API_KEY: ${{ env.TF_VAR_azure_openai_embedding_api_key }} + AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ env.TF_VAR_embedding_deployment }} + AZURE_OPENAI_API_VERSION: ${{ env.TF_VAR_embeddings_api_version }} + AZURE_OPENAI_CHAT_DEPLOYMENT: "gpt-4.1" + AZURE_OPENAI_CHAT_API_VERSION: ${{ env.TF_VAR_codex_api_version }} + AZURE_OPENAI_CODEX_MODEL: ${{ env.TF_VAR_codex_model }} + working-directory: ${{ github.workspace }} + run: python3 scripts/integration_test.py + + - name: Smoke test gateway (embeddings + responses) + uses: ./.github/actions/smoke-test-gateway + with: + gateway_url: ${{ steps.gw.outputs.url }} + gateway_key: ${{ secrets.AIGATEWAY_KEY }} + embedding_model: ${{ env.TF_VAR_embedding_deployment }} + codex_model: ${{ env.TF_VAR_codex_model }} + aoai_endpoint: ${{ env.TF_VAR_azure_openai_endpoint }} + aoai_api_key: ${{ env.TF_VAR_azure_openai_api_key }} + 
max_attempts: "3" + retry_sleep: ${{ inputs.smoke_retry_sleep }} + models_wait_attempts: ${{ inputs.env_name == 'prod' && '3' || '1' }} + models_wait_sleep: ${{ inputs.smoke_models_wait_sleep }} + + - name: Smoke test shared state API (dashboard proxy) + if: env.TF_VAR_state_service_container_image != '' + shell: bash + run: | + set -euo pipefail + DASHBOARD_URL="${{ steps.db.outputs.url }}" + TEST_USER="ci-smoke-${TF_VAR_env}" + + curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/catalog" > /tmp/catalog.json + + curl -fsS --connect-timeout 5 --max-time 15 -X PUT "${DASHBOARD_URL}/api/state/selection" \ + -H "Content-Type: application/json" \ + -H "X-User-Id: ${TEST_USER}" \ + -d '{"enabled":true,"selected_model":"'"${TF_VAR_codex_model}"'"}' > /tmp/selection-put.json + + curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/selection" \ + -H "X-User-Id: ${TEST_USER}" > /tmp/selection-get.json + + jq -e '.enabled == true' /tmp/selection-get.json > /dev/null diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 34a6c9f..59ece73 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -33,23 +33,23 @@ env: jobs: plan: - # PR into dev → dev | PR into main + label 'run-uat' → uat | Push to main/workflow_dispatch → prod + # PR into dev → dev | PR into main + label 'run-staging' → staging | Push to main/workflow_dispatch → prod # Skip plan for PRs from forks (no repo secrets; avoids AADSTS700213) - # Runtime UAT toggle: add PR label 'run-uat' to enable UAT on PRs into main. + # Runtime staging toggle: add PR label 'run-staging' to enable staging on PRs into main. 
if: | (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false) && ( (github.event_name == 'push' && github.ref == 'refs/heads/main') || (github.event_name == 'workflow_dispatch') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'dev') || - (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-uat')) + (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-staging')) ) name: Plan ${{ matrix.environment }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: - environment: ${{ (github.event_name == 'workflow_dispatch' && fromJSON('["prod"]')) || (github.event_name == 'push' && github.ref == 'refs/heads/main' && fromJSON('["prod"]')) || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'dev' && fromJSON('["dev"]')) || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-uat') && fromJSON('["uat"]')) || fromJSON('["prod"]') }} + environment: ${{ (github.event_name == 'workflow_dispatch' && fromJSON('["prod"]')) || (github.event_name == 'push' && github.ref == 'refs/heads/main' && fromJSON('["prod"]')) || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'dev' && fromJSON('["dev"]')) || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-staging') && fromJSON('["staging"]')) || fromJSON('["prod"]') }} environment: ${{ matrix.environment }} defaults: run: @@ -149,539 +149,67 @@ jobs: deploy-dev: name: Deploy dev needs: plan - runs-on: ubuntu-latest if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'dev' - environment: dev - 
defaults: - run: - working-directory: infra/env/dev - - env: - TF_VAR_env: "dev" - TF_VAR_projname: "aigateway" - TF_VAR_location: "southafricanorth" - TF_VAR_location_short: "san" - TF_VAR_azure_openai_endpoint: ${{ secrets.AZURE_OPENAI_ENDPOINT }} - TF_VAR_azure_openai_api_key: ${{ secrets.AZURE_OPENAI_API_KEY }} - TF_VAR_azure_openai_embedding_endpoint: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} - TF_VAR_azure_openai_embedding_api_key: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} - TF_VAR_gateway_key: ${{ secrets.AIGATEWAY_KEY }} - TF_VAR_codex_model: "gpt-5.3-codex" - TF_VAR_codex_api_version: "2025-04-01-preview" - TF_VAR_embedding_deployment: "text-embedding-3-large" - TF_VAR_embeddings_api_version: "2024-02-01" - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Quickcheck required secrets and config - shell: bash - run: | - set -euo pipefail - missing=0 - required=( - AZURE_CLIENT_ID - AZURE_TENANT_ID - AZURE_SUBSCRIPTION_ID - TF_BACKEND_RG - TF_BACKEND_SA - TF_BACKEND_CONTAINER - TF_VAR_azure_openai_endpoint - TF_VAR_azure_openai_api_key - TF_VAR_gateway_key - ) - for v in "${required[@]}"; do - if [ -z "${!v:-}" ]; then - echo "::error::Missing required value: ${v}" - missing=1 - else - echo "${v}=SET" - fi - done - echo "TF_VAR_env=${TF_VAR_env:-unset}" - echo "TF_VAR_embedding_deployment=${TF_VAR_embedding_deployment:-unset}" - echo "TF_VAR_codex_model=${TF_VAR_codex_model:-unset}" - if [ -n "${TF_VAR_azure_openai_endpoint:-}" ]; then - echo "Azure OpenAI endpoint=${TF_VAR_azure_openai_endpoint}" - endpoint_host=$(echo "${TF_VAR_azure_openai_endpoint}" | sed -E 's#^https?://([^/]+)/?.*$#\1#') - echo "Azure OpenAI endpoint host=${endpoint_host}" - fi - if [ "${missing}" -ne 0 ]; then - exit 1 - fi - - - name: Azure Login - uses: azure/login@v2 - with: - client-id: ${{ env.AZURE_CLIENT_ID }} - tenant-id: ${{ env.AZURE_TENANT_ID }} - subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }} - - - name: Setup Terraform - uses: 
hashicorp/setup-terraform@v3 - with: - terraform_version: 1.14.6 - - - name: Terraform Init - run: | - terraform init \ - -backend-config="resource_group_name=${TF_BACKEND_RG}" \ - -backend-config="storage_account_name=${TF_BACKEND_SA}" \ - -backend-config="container_name=${TF_BACKEND_CONTAINER}" \ - -backend-config="key=dev.terraform.tfstate" - - - name: Import existing Container App into Terraform state - uses: ./.github/actions/import-container-app - with: - projname: ${{ env.TF_VAR_projname }} - env: ${{ env.TF_VAR_env }} - location_short: ${{ env.TF_VAR_location_short }} - subscription_id: ${{ env.AZURE_SUBSCRIPTION_ID }} - terraform_working_directory: infra/env/dev - - - name: Terraform Plan - run: | - terraform plan -out=tfplan - - - name: Terraform Apply - run: | - terraform apply -auto-approve tfplan - - - name: Get gateway URL - id: gw - run: echo "url=$(terraform output -raw gateway_url)" >> $GITHUB_OUTPUT - - - name: Get dashboard URL - id: db - run: echo "url=$(terraform output -raw dashboard_url 2>/dev/null || true)" >> $GITHUB_OUTPUT - - - name: Runtime diagnostics (Container App config) - shell: bash - run: | - set -euo pipefail - RG_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-rg-${TF_VAR_location_short}" - CA_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-ca-${TF_VAR_location_short}" - echo "Resource Group: ${RG_NAME}" - echo "Container App: ${CA_NAME}" - echo "Gateway URL (terraform output): ${{ steps.gw.outputs.url }}" - echo "Latest revision:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.latestRevisionName" -o tsv - echo "Active revisions (name, active, created):" - az containerapp revision list -g "${RG_NAME}" -n "${CA_NAME}" --query "[].{name:name,active:properties.active,created:properties.createdTime}" -o table - echo "Configured env vars for LiteLLM secret refs:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_AZURE_OPENAI_API_KEY' || 
name=='LITELLM_GATEWAY_KEY']" -o json - echo "Configured secret sources (names + key vault URLs):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.configuration.secrets[].{name:name,keyVaultUrl:keyVaultUrl}" -o table - echo "LITELLM_CONFIG_CONTENT excerpt (first 2000 chars):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_CONFIG_CONTENT'].value | [0]" -o tsv | head -c 2000 || true - echo - - - name: Integration test (Azure OpenAI backend) - shell: bash - env: - AZURE_OPENAI_ENDPOINT: ${{ env.TF_VAR_azure_openai_endpoint }} - AZURE_OPENAI_API_KEY: ${{ env.TF_VAR_azure_openai_api_key }} - AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ env.TF_VAR_azure_openai_embedding_endpoint }} - AZURE_OPENAI_EMBEDDING_API_KEY: ${{ env.TF_VAR_azure_openai_embedding_api_key }} - AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ env.TF_VAR_embedding_deployment }} - AZURE_OPENAI_API_VERSION: ${{ env.TF_VAR_embeddings_api_version }} - AZURE_OPENAI_CHAT_DEPLOYMENT: "gpt-4.1" - AZURE_OPENAI_CHAT_API_VERSION: ${{ env.TF_VAR_codex_api_version }} - AZURE_OPENAI_CODEX_MODEL: ${{ env.TF_VAR_codex_model }} - working-directory: ${{ github.workspace }} - run: python3 scripts/integration_test.py - - - name: Smoke test gateway (embeddings + responses) - uses: ./.github/actions/smoke-test-gateway - with: - gateway_url: ${{ steps.gw.outputs.url }} - gateway_key: ${{ secrets.AIGATEWAY_KEY }} - embedding_model: ${{ env.TF_VAR_embedding_deployment }} - codex_model: ${{ env.TF_VAR_codex_model }} - aoai_endpoint: ${{ env.TF_VAR_azure_openai_endpoint }} - aoai_api_key: ${{ env.TF_VAR_azure_openai_api_key }} - max_attempts: "3" - retry_sleep: "10" - - - name: Smoke test shared state API (dashboard proxy) - if: env.TF_VAR_state_service_container_image != '' - shell: bash - run: | - set -euo pipefail - DASHBOARD_URL="${{ steps.db.outputs.url }}" - TEST_USER="ci-smoke-${TF_VAR_env}" - - curl -fsS --connect-timeout 5 --max-time 15 
"${DASHBOARD_URL}/api/state/catalog" > /tmp/catalog.json - - curl -fsS --connect-timeout 5 --max-time 15 -X PUT "${DASHBOARD_URL}/api/state/selection" \ - -H "Content-Type: application/json" \ - -H "X-User-Id: ${TEST_USER}" \ - -d '{"enabled":true,"selected_model":"'"${TF_VAR_codex_model}"'"}' > /tmp/selection-put.json - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/selection" \ - -H "X-User-Id: ${TEST_USER}" > /tmp/selection-get.json - - jq -e '.enabled == true' /tmp/selection-get.json > /dev/null - - deploy-uat: - name: Deploy uat + uses: ./.github/workflows/deploy-environment.yaml + with: + env_name: dev + tf_state_key: dev.terraform.tfstate + codex_model: gpt-5.3-codex + codex_api_version: 2025-04-01-preview + terraform_working_directory: infra/env/dev + smoke_retry_sleep: "10" + smoke_models_wait_sleep: "15" + include_aoai_host_check: false + secrets: inherit + + deploy-staging: + name: Deploy staging needs: plan - runs-on: ubuntu-latest - if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-uat') - environment: uat - defaults: - run: - working-directory: infra/env/uat - - env: - TF_VAR_env: "uat" - TF_VAR_projname: "aigateway" - TF_VAR_location: "southafricanorth" - TF_VAR_location_short: "san" - TF_VAR_azure_openai_endpoint: ${{ secrets.AZURE_OPENAI_ENDPOINT }} - TF_VAR_azure_openai_api_key: ${{ secrets.AZURE_OPENAI_API_KEY }} - TF_VAR_azure_openai_embedding_endpoint: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} - TF_VAR_azure_openai_embedding_api_key: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} - 
TF_VAR_gateway_key: ${{ secrets.AIGATEWAY_KEY }} - TF_VAR_codex_model: "gpt-5.3-codex" - TF_VAR_codex_api_version: "2025-04-01-preview" - TF_VAR_embedding_deployment: "text-embedding-3-large" - TF_VAR_embeddings_api_version: "2024-02-01" - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Quickcheck required secrets and config - shell: bash - run: | - set -euo pipefail - missing=0 - required=( - AZURE_CLIENT_ID - AZURE_TENANT_ID - AZURE_SUBSCRIPTION_ID - TF_BACKEND_RG - TF_BACKEND_SA - TF_BACKEND_CONTAINER - TF_VAR_azure_openai_endpoint - TF_VAR_azure_openai_api_key - TF_VAR_gateway_key - ) - for v in "${required[@]}"; do - if [ -z "${!v:-}" ]; then - echo "::error::Missing required value: ${v}" - missing=1 - else - echo "${v}=SET" - fi - done - echo "TF_VAR_env=${TF_VAR_env:-unset}" - echo "TF_VAR_embedding_deployment=${TF_VAR_embedding_deployment:-unset}" - echo "TF_VAR_codex_model=${TF_VAR_codex_model:-unset}" - if [ -n "${TF_VAR_azure_openai_endpoint:-}" ]; then - echo "Azure OpenAI endpoint=${TF_VAR_azure_openai_endpoint}" - endpoint_host=$(echo "${TF_VAR_azure_openai_endpoint}" | sed -E 's#^https?://([^/]+)/?.*$#\1#') - echo "Azure OpenAI endpoint host=${endpoint_host}" - fi - if [ "${missing}" -ne 0 ]; then - exit 1 - fi - - - name: Azure Login - uses: azure/login@v2 - with: - client-id: ${{ env.AZURE_CLIENT_ID }} - tenant-id: ${{ env.AZURE_TENANT_ID }} - subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }} - - - name: Setup Terraform - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: 1.14.6 - - - name: Terraform Init - run: | - terraform init \ - -backend-config="resource_group_name=${TF_BACKEND_RG}" \ - -backend-config="storage_account_name=${TF_BACKEND_SA}" \ - -backend-config="container_name=${TF_BACKEND_CONTAINER}" \ - -backend-config="key=uat.terraform.tfstate" - - - name: Import existing Container App into Terraform state - uses: ./.github/actions/import-container-app - with: - projname: ${{ env.TF_VAR_projname }} - 
env: ${{ env.TF_VAR_env }} - location_short: ${{ env.TF_VAR_location_short }} - subscription_id: ${{ env.AZURE_SUBSCRIPTION_ID }} - terraform_working_directory: infra/env/uat - - - name: Terraform Plan - run: | - terraform plan -out=tfplan - - - name: Terraform Apply - run: | - terraform apply -auto-approve tfplan - - - name: Get gateway URL - id: gw - run: echo "url=$(terraform output -raw gateway_url)" >> $GITHUB_OUTPUT - - - name: Get dashboard URL - id: db - run: echo "url=$(terraform output -raw dashboard_url 2>/dev/null || true)" >> $GITHUB_OUTPUT - - - name: Runtime diagnostics (Container App config) - shell: bash - run: | - set -euo pipefail - RG_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-rg-${TF_VAR_location_short}" - CA_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-ca-${TF_VAR_location_short}" - echo "Resource Group: ${RG_NAME}" - echo "Container App: ${CA_NAME}" - echo "Gateway URL (terraform output): ${{ steps.gw.outputs.url }}" - echo "Latest revision:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.latestRevisionName" -o tsv - echo "Active revisions (name, active, created):" - az containerapp revision list -g "${RG_NAME}" -n "${CA_NAME}" --query "[].{name:name,active:properties.active,created:properties.createdTime}" -o table - echo "Configured env vars for LiteLLM secret refs:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_AZURE_OPENAI_API_KEY' || name=='LITELLM_GATEWAY_KEY']" -o json - echo "Configured secret sources (names + key vault URLs):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.configuration.secrets[].{name:name,keyVaultUrl:keyVaultUrl}" -o table - echo "LITELLM_CONFIG_CONTENT excerpt (first 2000 chars):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_CONFIG_CONTENT'].value | [0]" -o tsv | head -c 2000 || true - echo - - - name: Integration test 
(Azure OpenAI backend) - shell: bash - env: - AZURE_OPENAI_ENDPOINT: ${{ env.TF_VAR_azure_openai_endpoint }} - AZURE_OPENAI_API_KEY: ${{ env.TF_VAR_azure_openai_api_key }} - AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ env.TF_VAR_azure_openai_embedding_endpoint }} - AZURE_OPENAI_EMBEDDING_API_KEY: ${{ env.TF_VAR_azure_openai_embedding_api_key }} - AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ env.TF_VAR_embedding_deployment }} - AZURE_OPENAI_API_VERSION: ${{ env.TF_VAR_embeddings_api_version }} - AZURE_OPENAI_CHAT_DEPLOYMENT: "gpt-4.1" - AZURE_OPENAI_CHAT_API_VERSION: ${{ env.TF_VAR_codex_api_version }} - AZURE_OPENAI_CODEX_MODEL: ${{ env.TF_VAR_codex_model }} - working-directory: ${{ github.workspace }} - run: python3 scripts/integration_test.py - - - name: Smoke test gateway (embeddings + responses) - uses: ./.github/actions/smoke-test-gateway - with: - gateway_url: ${{ steps.gw.outputs.url }} - gateway_key: ${{ secrets.AIGATEWAY_KEY }} - embedding_model: ${{ env.TF_VAR_embedding_deployment }} - codex_model: ${{ env.TF_VAR_codex_model }} - aoai_endpoint: ${{ env.TF_VAR_azure_openai_endpoint }} - aoai_api_key: ${{ env.TF_VAR_azure_openai_api_key }} - max_attempts: "3" - retry_sleep: "10" - - - name: Smoke test shared state API (dashboard proxy) - if: env.TF_VAR_state_service_container_image != '' - shell: bash - run: | - set -euo pipefail - DASHBOARD_URL="${{ steps.db.outputs.url }}" - TEST_USER="ci-smoke-${TF_VAR_env}" - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/catalog" > /tmp/catalog.json - - curl -fsS --connect-timeout 5 --max-time 15 -X PUT "${DASHBOARD_URL}/api/state/selection" \ - -H "Content-Type: application/json" \ - -H "X-User-Id: ${TEST_USER}" \ - -d '{"enabled":true,"selected_model":"'"${TF_VAR_codex_model}"'"}' > /tmp/selection-put.json - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/selection" \ - -H "X-User-Id: ${TEST_USER}" > /tmp/selection-get.json - - jq -e '.enabled == true' /tmp/selection-get.json > 
/dev/null + if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' && contains(join(github.event.pull_request.labels.*.name, ','), 'run-staging') + uses: ./.github/workflows/deploy-environment.yaml + with: + env_name: staging + tf_state_key: staging.terraform.tfstate + codex_model: gpt-5.3-codex + codex_api_version: 2025-04-01-preview + terraform_working_directory: infra/env/staging + smoke_retry_sleep: "10" + smoke_models_wait_sleep: "15" + include_aoai_host_check: false + secrets: inherit deploy-prod: name: Deploy prod needs: plan - runs-on: ubuntu-latest if: github.event_name == 'workflow_dispatch' || (github.event_name == 'push' && github.ref == 'refs/heads/main') - environment: prod - defaults: - run: - working-directory: infra/env/prod - - env: - TF_VAR_env: "prod" - TF_VAR_projname: "aigateway" - TF_VAR_location: "southafricanorth" - TF_VAR_location_short: "san" - TF_VAR_azure_openai_endpoint: ${{ secrets.AZURE_OPENAI_ENDPOINT }} - TF_VAR_azure_openai_api_key: ${{ secrets.AZURE_OPENAI_API_KEY }} - TF_VAR_azure_openai_embedding_endpoint: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} - TF_VAR_azure_openai_embedding_api_key: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} - TF_VAR_gateway_key: ${{ secrets.AIGATEWAY_KEY }} - TF_VAR_codex_model: "gpt-4o" - TF_VAR_codex_api_version: "2025-01-01-preview" - TF_VAR_embedding_deployment: "text-embedding-3-large" - TF_VAR_embeddings_api_version: "2024-02-01" - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Quickcheck required secrets and config - shell: bash - run: | - set -euo pipefail - missing=0 - required=( - 
AZURE_CLIENT_ID - AZURE_TENANT_ID - AZURE_SUBSCRIPTION_ID - TF_BACKEND_RG - TF_BACKEND_SA - TF_BACKEND_CONTAINER - TF_VAR_azure_openai_endpoint - TF_VAR_azure_openai_api_key - TF_VAR_gateway_key - ) - for v in "${required[@]}"; do - if [ -z "${!v:-}" ]; then - echo "::error::Missing required value: ${v}" - missing=1 - else - echo "${v}=SET" - fi - done - echo "TF_VAR_env=${TF_VAR_env:-unset}" - echo "TF_VAR_embedding_deployment=${TF_VAR_embedding_deployment:-unset}" - echo "TF_VAR_codex_model=${TF_VAR_codex_model:-unset}" - if [ -n "${TF_VAR_azure_openai_endpoint:-}" ]; then - echo "Azure OpenAI endpoint=${TF_VAR_azure_openai_endpoint}" - endpoint_host=$(echo "${TF_VAR_azure_openai_endpoint}" | sed -E 's#^https?://([^/]+)/?.*$#\1#') - echo "Azure OpenAI endpoint host=${endpoint_host}" - if [ -n "${EXPECTED_AOAI_ENDPOINT_HOST:-}" ] && [ "${endpoint_host}" != "${EXPECTED_AOAI_ENDPOINT_HOST}" ]; then - echo "::error::Prod AOAI endpoint host mismatch. Expected '${EXPECTED_AOAI_ENDPOINT_HOST}', got '${endpoint_host}'. Check environment secret AZURE_OPENAI_ENDPOINT." 
- missing=1 - fi - fi - if [ "${missing}" -ne 0 ]; then - exit 1 - fi - - - name: Azure Login - uses: azure/login@v2 - with: - client-id: ${{ env.AZURE_CLIENT_ID }} - tenant-id: ${{ env.AZURE_TENANT_ID }} - subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }} - - - name: Setup Terraform - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: 1.14.6 - - - name: Terraform Init - run: | - terraform init \ - -backend-config="resource_group_name=${TF_BACKEND_RG}" \ - -backend-config="storage_account_name=${TF_BACKEND_SA}" \ - -backend-config="container_name=${TF_BACKEND_CONTAINER}" \ - -backend-config="key=prod.terraform.tfstate" - - - name: Import existing Container App into Terraform state - uses: ./.github/actions/import-container-app - with: - projname: ${{ env.TF_VAR_projname }} - env: ${{ env.TF_VAR_env }} - location_short: ${{ env.TF_VAR_location_short }} - subscription_id: ${{ env.AZURE_SUBSCRIPTION_ID }} - terraform_working_directory: infra/env/prod - - - name: Terraform Plan - run: | - terraform plan -out=tfplan - - - name: Terraform Apply - run: | - terraform apply -auto-approve tfplan - - - name: Get gateway URL - id: gw - run: echo "url=$(terraform output -raw gateway_url)" >> $GITHUB_OUTPUT - - - name: Get dashboard URL - id: db - run: echo "url=$(terraform output -raw dashboard_url 2>/dev/null || true)" >> $GITHUB_OUTPUT - - - name: Runtime diagnostics (Container App config) - shell: bash - run: | - set -euo pipefail - RG_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-rg-${TF_VAR_location_short}" - CA_NAME="pvc-${TF_VAR_env}-${TF_VAR_projname}-ca-${TF_VAR_location_short}" - echo "Resource Group: ${RG_NAME}" - echo "Container App: ${CA_NAME}" - echo "Gateway URL (terraform output): ${{ steps.gw.outputs.url }}" - echo "Latest revision:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.latestRevisionName" -o tsv - echo "Active revisions (name, active, created):" - az containerapp revision list -g "${RG_NAME}" -n "${CA_NAME}" 
--query "[].{name:name,active:properties.active,created:properties.createdTime}" -o table - echo "Configured env vars for LiteLLM secret refs:" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_AZURE_OPENAI_API_KEY' || name=='LITELLM_GATEWAY_KEY']" -o json - echo "Configured secret sources (names + key vault URLs):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.configuration.secrets[].{name:name,keyVaultUrl:keyVaultUrl}" -o table - echo "LITELLM_CONFIG_CONTENT excerpt (first 2000 chars):" - az containerapp show -g "${RG_NAME}" -n "${CA_NAME}" --query "properties.template.containers[0].env[?name=='LITELLM_CONFIG_CONTENT'].value | [0]" -o tsv | head -c 2000 || true - echo - - - name: Integration test (Azure OpenAI backend) - shell: bash - env: - AZURE_OPENAI_ENDPOINT: ${{ env.TF_VAR_azure_openai_endpoint }} - AZURE_OPENAI_API_KEY: ${{ env.TF_VAR_azure_openai_api_key }} - AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ env.TF_VAR_azure_openai_embedding_endpoint }} - AZURE_OPENAI_EMBEDDING_API_KEY: ${{ env.TF_VAR_azure_openai_embedding_api_key }} - AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ env.TF_VAR_embedding_deployment }} - AZURE_OPENAI_API_VERSION: ${{ env.TF_VAR_embeddings_api_version }} - AZURE_OPENAI_CHAT_DEPLOYMENT: "gpt-4.1" - AZURE_OPENAI_CHAT_API_VERSION: ${{ env.TF_VAR_codex_api_version }} - AZURE_OPENAI_CODEX_MODEL: ${{ env.TF_VAR_codex_model }} - working-directory: ${{ github.workspace }} - run: python3 scripts/integration_test.py - - - name: Smoke test gateway (embeddings + responses) - uses: ./.github/actions/smoke-test-gateway - with: - gateway_url: ${{ steps.gw.outputs.url }} - gateway_key: ${{ secrets.AIGATEWAY_KEY }} - embedding_model: ${{ env.TF_VAR_embedding_deployment }} - codex_model: ${{ env.TF_VAR_codex_model }} - aoai_endpoint: ${{ env.TF_VAR_azure_openai_endpoint }} - aoai_api_key: ${{ env.TF_VAR_azure_openai_api_key }} - max_attempts: "3" - retry_sleep: "15" # 
prod: longer cold-start; allow more time between retries - models_wait_attempts: "3" # prod: wait longer for LiteLLM to register healthy deployments - models_wait_sleep: "30" - - - name: Smoke test shared state API (dashboard proxy) - if: env.TF_VAR_state_service_container_image != '' - shell: bash - run: | - set -euo pipefail - DASHBOARD_URL="${{ steps.db.outputs.url }}" - TEST_USER="ci-smoke-${TF_VAR_env}" - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/catalog" > /tmp/catalog.json - - curl -fsS --connect-timeout 5 --max-time 15 -X PUT "${DASHBOARD_URL}/api/state/selection" \ - -H "Content-Type: application/json" \ - -H "X-User-Id: ${TEST_USER}" \ - -d '{"enabled":true,"selected_model":"'"${TF_VAR_codex_model}"'"}' > /tmp/selection-put.json - - curl -fsS --connect-timeout 5 --max-time 15 "${DASHBOARD_URL}/api/state/selection" \ - -H "X-User-Id: ${TEST_USER}" > /tmp/selection-get.json - - jq -e '.enabled == true' /tmp/selection-get.json > /dev/null + uses: ./.github/workflows/deploy-environment.yaml + with: + env_name: prod + tf_state_key: prod.terraform.tfstate + codex_model: gpt-4o + codex_api_version: 2025-01-01-preview + terraform_working_directory: infra/env/prod + smoke_retry_sleep: "15" + smoke_models_wait_sleep: "30" + include_aoai_host_check: true + secrets: + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_EMBEDDING_ENDPOINT: ${{ secrets.AZURE_OPENAI_EMBEDDING_ENDPOINT }} + AZURE_OPENAI_EMBEDDING_API_KEY: ${{ secrets.AZURE_OPENAI_EMBEDDING_API_KEY }} + AIGATEWAY_KEY: ${{ secrets.AIGATEWAY_KEY }} + + # Legacy inline deployments removed - see deploy-environment.yaml diff --git a/README.md b/README.md index d4dc2ba..dabd2de 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Creates the shared resource group, storage account, and container for Terraform ### 2. 
Add GitHub secrets -Add these secrets to each GitHub **Environment** (dev, uat, prod): **Settings → Environments → <env> → Environment secrets**. +Add these secrets to each GitHub **Environment** (dev, staging, prod): **Settings → Environments → <env> → Environment secrets**. | Secret | Description | Example | | ----------------------- | --------------------------------- | --------------------------------------------- | @@ -53,16 +53,16 @@ Bootstrap prints these values. For local runs, copy `infra/.env.local.example` t **Bash:** ```bash -./infra/scripts/terraform-init.sh dev # or uat, prod +./infra/scripts/terraform-init.sh dev # or staging, prod ``` **PowerShell:** ```powershell -.\infra\scripts\terraform-init.ps1 -Env dev # or uat, prod +.\infra\scripts\terraform-init.ps1 -Env dev # or staging, prod ``` -Valid environments: `dev`, `uat`, `prod`. +Valid environments: `dev`, `staging`, `prod`. ### 4. Plan and apply @@ -74,11 +74,11 @@ terraform apply ## Environments -| Env | Purpose | -| ---- | --------------- | -| dev | Development | -| uat | User acceptance | -| prod | Production | +| Env | Purpose | +| ------- | ----------- | +| dev | Development | +| staging | Staging | +| prod | Production | ## CI/CD @@ -104,6 +104,6 @@ pnpm format - [PRD](docs/PRD.md) – Product requirements - [Terraform Blueprint](docs/Terraform_Blueprint.md) – Infrastructure design -- [CI/CD Runbook](docs/CI_CD.md) – workflow behavior, UAT toggle, smoke tests +- [CI/CD Runbook](docs/CI_CD.md) – workflow behavior, staging toggle, smoke tests - [Azure OIDC Setup](docs/AZURE_OIDC_SETUP.md) – GitHub Actions OIDC configuration - [Secrets Checklist](docs/SECRETS.md) – Copy/paste setup for GitHub environment secrets diff --git a/dashboard/app.js b/dashboard/app.js index 65011e4..63675e5 100644 --- a/dashboard/app.js +++ b/dashboard/app.js @@ -53,7 +53,7 @@ function escHtml(s) { function deriveEnv(url) { if (!url) return null; - const m = url.match(/pvc-(dev|uat|prod)-/); + const m = 
url.match(/pvc-(dev|staging|prod)-/); return m ? m[1] : null; } diff --git a/docs/AZURE_OIDC_SETUP.md b/docs/AZURE_OIDC_SETUP.md index 509156e..0c9518b 100644 --- a/docs/AZURE_OIDC_SETUP.md +++ b/docs/AZURE_OIDC_SETUP.md @@ -12,7 +12,7 @@ If you see: Error: AADSTS700213: No matching federated identity record found for presented assertion subject 'repo:phoenixvc/ai-gateway:environment:dev' ``` -**Cause:** The workflow uses `environment: dev` (and uat/prod), so the OIDC subject is `repo:org/repo:environment:dev`. Azure must have a federated credential with that exact subject. +**Cause:** The workflow uses `environment: dev` (and staging/prod), so the OIDC subject is `repo:org/repo:environment:dev`. Azure must have a federated credential with that exact subject. ### Fix: Add environment federated credentials @@ -32,21 +32,21 @@ az ad app list --display-name pvc-shared-github-actions-oidc --query "[0].appId" 1. Go to **Azure Portal** → **Microsoft Entra ID** → **App registrations** → your app (e.g. `pvc-shared-github-actions-oidc`) 2. **Certificates & secrets** → **Federated credentials** → **Add credential** -3. For each environment (dev, uat, prod), add: +3. 
For each environment (dev, staging, prod), add: - **Federated credential scenario:** GitHub Actions deploying Azure resources - **Organization:** phoenixvc - **Repository:** ai-gateway - **Entity type:** Environment - - **Environment name:** dev (or uat, prod) - - **Name:** github-actions-dev (or uat, prod) + - **Environment name:** dev (or staging, prod) + - **Name:** github-actions-dev (or staging, prod) ### Subject formats -| Workflow config | OIDC subject | -| -------------------- | ----------------------------------------------- | -| `environment: dev` | `repo:phoenixvc/ai-gateway:environment:dev` | -| `environment: uat` | `repo:phoenixvc/ai-gateway:environment:uat` | -| `environment: prod` | `repo:phoenixvc/ai-gateway:environment:prod` | -| Branch only (no env) | `repo:phoenixvc/ai-gateway:ref:refs/heads/main` | +| Workflow config | OIDC subject | +| ---------------------- | ----------------------------------------------- | +| `environment: dev` | `repo:phoenixvc/ai-gateway:environment:dev` | +| `environment: staging` | `repo:phoenixvc/ai-gateway:environment:staging` | +| `environment: prod` | `repo:phoenixvc/ai-gateway:environment:prod` | +| Branch only (no env) | `repo:phoenixvc/ai-gateway:ref:refs/heads/main` | The federated credential **Subject** in Azure must match exactly. diff --git a/docs/CI_CD.md b/docs/CI_CD.md index ce180f0..a5a623a 100644 --- a/docs/CI_CD.md +++ b/docs/CI_CD.md @@ -6,15 +6,15 @@ This document describes the current GitHub Actions deployment behavior for `ai-g - PRs from forks are skipped for deployment-related jobs (no repo secrets). - PRs targeting `dev` run `plan` + `deploy-dev`. -- PRs targeting `main` run UAT only when the PR has label `run-uat`. +- PRs targeting `main` run staging only when the PR has label `run-staging`. - Push to `main` and `workflow_dispatch` run `plan` + `deploy-prod`. 
-## Runtime UAT toggle +## Runtime staging toggle -UAT deployment for PRs to `main` is controlled by PR label: +Staging deployment for PRs to `main` is controlled by PR label: -- Add label `run-uat` to enable `deploy-uat` for that PR. -- Remove label `run-uat` to disable UAT for that PR. +- Add label `run-staging` to enable `deploy-staging` for that PR. +- Remove label `run-staging` to disable staging for that PR. ## Smoke test behavior diff --git a/docs/PRD.md b/docs/PRD.md index 6cc0c70..710823f 100644 --- a/docs/PRD.md +++ b/docs/PRD.md @@ -15,7 +15,7 @@ Roo/Qoder currently struggles with Azure model/operation mismatches. A gateway n 2. Support: - `POST /v1/responses` routed to Azure **Responses** endpoint for configurable model (default: `gpt-5.3-codex`). - `POST /v1/embeddings` routed to Azure embeddings deployment. -3. Enable **multiple environments** (dev/uat/prod) and **multiple downstream projects**. +3. Enable **multiple environments** (dev/staging/prod) and **multiple downstream projects**. 4. Infrastructure managed with **Terraform**. 5. CI/CD via **GitHub Actions** using **Azure OIDC** (no long-lived secrets). 6. “Get it working” first; hardening follows. @@ -29,7 +29,7 @@ Roo/Qoder currently struggles with Azure model/operation mismatches. A gateway n ## 3) Environments - `dev` -- `uat` +- `staging` - `prod` Each env is independently deployable. @@ -150,7 +150,7 @@ Gateway must expose: - `docs/` - Documentation. - `infra/` - `modules/aigateway_aca` - Core Terraform module. - - `env/dev|uat|prod` - Environment-specific configurations. + - `env/dev|staging|prod` - Environment-specific configurations. - `.github/workflows/` - CI/CD pipelines. - `scripts/` - Helper scripts (bootstrap). @@ -161,13 +161,13 @@ Gateway must expose: - **Phase 1: Terraform & CI/CD** - Terraform defines infra. - GitHub Actions deploys using Azure OIDC. - - Dev auto-apply on merge; UAT/Prod gated with environment approvals.
+ - Dev auto-apply on merge; Staging/Prod gated with environment approvals. ## 10) Acceptance criteria 1. Roo/Qoder can use gateway for coding with configured model (default `gpt-5.3-codex`) without `chatCompletion operation does not work`. 2. Codebase indexing completes using embeddings through the gateway. -3. Dev/UAT/Prod are reproducible via Terraform + Actions. +3. Dev/Staging/Prod are reproducible via Terraform + Actions. 4. No secrets committed. ## 11) Risks & mitigations @@ -180,5 +180,5 @@ Gateway must expose: - M0: Repo setup, Bootstrap scripts (OIDC, State Backend). - M1: Dev env deployed; smoke tests pass; Roo works. -- M2: UAT + Prod; environment approvals. +- M2: Staging + Prod; environment approvals. - M3: Hardening (Front Door/WAF, Entra auth). diff --git a/docs/SECRETS.md b/docs/SECRETS.md index 4463421..460097e 100644 --- a/docs/SECRETS.md +++ b/docs/SECRETS.md @@ -2,17 +2,17 @@ Copy this checklist when setting up environments for this repo. -For workflow behavior (dev/uat/prod triggers, PR label `run-uat`, and smoke-test flow), see [CI_CD.md](CI_CD.md). +For workflow behavior (dev/staging/prod triggers, PR label `run-staging`, and smoke-test flow), see [CI_CD.md](CI_CD.md). ## Where to add secrets Add these as **Environment secrets** in GitHub: - **Settings → Environments → dev → Environment secrets** -- **Settings → Environments → uat → Environment secrets** +- **Settings → Environments → staging → Environment secrets** - **Settings → Environments → prod → Environment secrets** -> This workflow is environment-based (`environment: dev|uat|prod`), so each environment should have the full secret set. +> This workflow is environment-based (`environment: dev|staging|prod`), so each environment should have the full secret set.
## Required secrets (all environments) @@ -53,7 +53,7 @@ When `STATE_SERVICE_CONTAINER_IMAGE` is set (state-service enabled), set this se ## Copy/paste template -Use this block as a setup checklist when creating/updating `dev`, `uat`, and `prod`: +Use this block as a setup checklist when creating/updating `dev`, `staging`, and `prod`: ```text AZURE_CLIENT_ID= @@ -82,13 +82,13 @@ STATE_SERVICE_REGISTRY_PASSWORD= # required for priv - [ ] `AIGATEWAY_KEY` matches the key expected by the deployed gateway. - [ ] OIDC federated credentials exist for each environment subject: - `repo:phoenixvc/ai-gateway:environment:dev` - - `repo:phoenixvc/ai-gateway:environment:uat` + - `repo:phoenixvc/ai-gateway:environment:staging` - `repo:phoenixvc/ai-gateway:environment:prod` -## Runtime UAT toggle +## Runtime staging toggle -- UAT deploy on PRs into `main` is controlled by PR label `run-uat`. -- Add label `run-uat` to enable `deploy-uat` for that PR. -- Remove label `run-uat` to skip UAT for that PR. +- Staging deploy on PRs into `main` is controlled by PR label `run-staging`. +- Add label `run-staging` to enable `deploy-staging` for that PR. +- Remove label `run-staging` to skip staging for that PR. For OIDC troubleshooting, see [AZURE_OIDC_SETUP.md](AZURE_OIDC_SETUP.md). 
diff --git a/docs/Terraform_Blueprint.md b/docs/Terraform_Blueprint.md index 54ea563..f1027b0 100644 --- a/docs/Terraform_Blueprint.md +++ b/docs/Terraform_Blueprint.md @@ -3,7 +3,7 @@ This canvas includes a working Terraform scaffold: - `infra/modules/aigateway_aca` -- `infra/env/dev|uat|prod` +- `infra/env/dev|staging|prod` - Shared state configured via `terraform init -backend-config=...` in GitHub Actions > Notes: @@ -27,7 +27,7 @@ infra/ main.tf variables.tf terraform.tfvars - uat/ + staging/ main.tf variables.tf terraform.tfvars @@ -44,7 +44,7 @@ infra/ ```hcl variable "env" { type = string - description = "Environment name (dev|uat|prod)" + description = "Environment name (dev|staging|prod)" } variable "projname" { @@ -343,7 +343,7 @@ output "key_vault_name" { ## 5) Env stacks -### 5.1 `infra/env/dev/variables.tf` (repeat for uat/prod) +### 5.1 `infra/env/dev/variables.tf` (repeat for staging/prod) ```hcl variable "env" { type = string } @@ -441,7 +441,7 @@ tags = { } ``` -Repeat the env folders for `uat` and `prod`, changing only `env` and tags. +Repeat the env folders for `staging` and `prod`, changing only `env` and tags. --- diff --git a/docs/architecture/01-system-context.md b/docs/architecture/01-system-context.md new file mode 100644 index 0000000..9c20e95 --- /dev/null +++ b/docs/architecture/01-system-context.md @@ -0,0 +1,87 @@ +# System Context + +Status: Accepted +Date: 2026-03-15 +Owners: PhoenixVC Architecture Group + +## Context + +The PhoenixVC AI Platform integrates multiple intelligent systems designed to support: + +- AI request routing and governance +- Multi-agent orchestration +- Developer workflow intelligence +- Tool-driven agent execution +- Edge telemetry interpretation + +The platform consists of five major subsystems: + +1. AI Gateway +2. Cognitive Mesh +3. CodeFlow Engine +4. AgentKit Forge +5. 
PhoenixRooivalk + +These systems operate across both cloud infrastructure and edge deployments, and rely on a hybrid SLM + LLM architecture for performance, cost efficiency, and reasoning capability. + +## Decision + +Adopt a layered architecture where: + +- AI Gateway acts as the control-plane entry point +- SLMs perform routing, triage, screening, and compression +- LLMs are used selectively for high-value reasoning +- Edge systems remain locally autonomous when necessary + +## System Context Diagram + +```mermaid +flowchart TB + User[Users / Operators / Developers] + Apps[Client Apps / APIs] + GitHub[GitHub / CI Events] + Sensors[PhoenixRooivalk Sensors] + Providers[Model Providers] + Tools[External Tools / APIs] + + subgraph Platform + AIG[AI Gateway] + CM[Cognitive Mesh] + CFE[CodeFlow Engine] + AKF[AgentKit Forge] + PR[PhoenixRooivalk] + end + + User --> AIG + Apps --> AIG + GitHub --> CFE + Sensors --> PR + + AIG --> CM + AIG --> CFE + AIG --> AKF + AIG --> PR + + CM --> Providers + AKF --> Providers + CFE --> Providers + + CM --> Tools + AKF --> Tools + CFE --> Tools +``` + +## Consequences + +### Advantages + +- centralized governance of AI usage +- consistent routing logic +- scalable orchestration +- edge autonomy + +### Tradeoffs + +- additional architectural complexity +- routing model calibration required +- shared telemetry contracts required diff --git a/docs/architecture/02-container-architecture.md b/docs/architecture/02-container-architecture.md new file mode 100644 index 0000000..2b66950 --- /dev/null +++ b/docs/architecture/02-container-architecture.md @@ -0,0 +1,95 @@ +# Container Architecture + +Status: Accepted +Date: 2026-03-15 + +## Context + +To support scalability and independent evolution of system capabilities, the platform is decomposed into containerized services. + +Each service is responsible for a clearly bounded domain. 
+ +## Container Diagram + +```mermaid +flowchart TB + subgraph Clients + C1[Chat UI] + C2[Internal Apps] + C3[GitHub Webhooks] + C4[Operator Console] + end + + subgraph Gateway + G1[Ingress API] + G2[SLM Classifier] + G3[Policy Scan] + G4[Budget Router] + G5[Semantic Cache] + G6[Escalation Judge] + end + + subgraph Mesh + M1[Specialist Router] + M2[Task Decomposer] + M3[State Manager] + M4[Synthesis Coordinator] + end + + subgraph Forge + F1[Tool Selector] + F2[Argument Extractor] + F3[Execution Loop] + F4[Result Compressor] + end + + subgraph CodeFlow + CF1[PR Classifier] + CF2[Risk Scorer] + CF3[CI Triage] + CF4[Review Engine] + end + + subgraph Models + SLM[SLM Pool] + LLM[LLM Pool] + end + + C1 --> G1 + C2 --> G1 + C4 --> G1 + + G1 --> G2 + G2 --> G3 + G3 --> G4 + G4 --> G5 + G5 --> G6 + + G6 --> M1 + G6 --> F1 + G6 --> CF1 + + M1 --> M2 + M2 --> M3 + M3 --> M4 + + F1 --> F2 + F2 --> F3 + F3 --> F4 + + CF1 --> CF2 + CF2 --> CF3 + CF3 --> CF4 +``` + +## Consequences + +### Benefits + +- service isolation +- independent scaling +- clearer ownership + +### Tradeoffs + +- increased service orchestration complexity diff --git a/docs/architecture/03-deployment-trust-boundaries.md b/docs/architecture/03-deployment-trust-boundaries.md new file mode 100644 index 0000000..18724c7 --- /dev/null +++ b/docs/architecture/03-deployment-trust-boundaries.md @@ -0,0 +1,82 @@ +# Deployment and Trust Boundaries + +Status: Accepted + +## Context + +The system interacts with external users, internal services, model providers, and edge devices. Clear trust boundaries must be established. 
+ +## Trust Boundary Diagram + +```mermaid +flowchart LR + subgraph Public + A[Users] + B[GitHub] + C[External Apps] + end + + subgraph Ingress + D[API Gateway / WAF] + E[AI Gateway] + end + + subgraph ControlPlane + F[Policy Engine] + G[Session Store] + H[Semantic Cache] + I[Observability] + end + + subgraph Execution + J[Cognitive Mesh] + K[AgentKit Forge] + L[CodeFlow Engine] + end + + subgraph Integration + M[Key Vault] + N[Azure APIs] + O[GitHub APIs] + end + + subgraph ExternalModels + P[LLM Providers] + end + + subgraph Edge + Q[PhoenixRooivalk Node] + R[Sensors] + end + + A --> D + B --> D + C --> D + D --> E + + E --> F + E --> G + E --> H + E --> I + + E --> J + E --> K + E --> L + + J --> N + K --> N + L --> O + + E --> M + E --> P + + R --> Q + Q --> E +``` + +## Security Principles + +- **Gateway is the only public AI ingress.** +- **Secrets only accessed through Key Vault.** +- **Tool access occurs through controlled brokers.** +- **Edge nodes operate under constrained trust.** diff --git a/docs/architecture/04-observability-telemetry.md b/docs/architecture/04-observability-telemetry.md new file mode 100644 index 0000000..3afe313 --- /dev/null +++ b/docs/architecture/04-observability-telemetry.md @@ -0,0 +1,94 @@ +# Observability and Telemetry + +Status: Accepted + +## Context + +Cross-system observability is required for: + +- cost visibility +- routing quality measurement +- policy enforcement evidence +- debugging and operational monitoring + +## Telemetry Architecture + +```mermaid +flowchart TB + subgraph Producers + P1[AI Gateway] + P2[Cognitive Mesh] + P3[AgentKit Forge] + P4[CodeFlow Engine] + P5[Rooivalk Edge] + end + + subgraph Signals + S1[Request Logs] + S2[Routing Decisions] + S3[Policy Events] + S4[Tool Calls] + S5[Model Usage] + S6[Edge Events] + end + + subgraph Ingest + I1[OpenTelemetry] + I2[Azure Monitor] + I3[Blob Export] + end + + subgraph Analytics + A1[Azure Data Explorer] + A2[Cost Aggregates] + A3[Quality Metrics] + end + + 
subgraph Visualization + V1[Grafana] + V2[Alerts] + end + + P1 --> S1 + P1 --> S2 + P1 --> S5 + P2 --> S2 + P3 --> S4 + P4 --> S1 + P5 --> S6 + + S1 --> I1 + S2 --> I1 + S4 --> I1 + S5 --> I2 + S6 --> I3 + + I1 --> A1 + I2 --> A1 + I3 --> A1 + + A1 --> V1 + V1 --> V2 +``` + +## Key Metrics + +### Gateway + +- routing decision distribution +- SLM vs LLM usage ratio +- cache hit rate + +### CodeFlow + +- PR classification accuracy +- CI triage distribution + +### AgentKit + +- tool selection success rate + +### Rooivalk + +- alert compression ratio +- edge escalation frequency diff --git a/docs/architecture/05-slm-llm-decision-flow.md b/docs/architecture/05-slm-llm-decision-flow.md new file mode 100644 index 0000000..124e27a --- /dev/null +++ b/docs/architecture/05-slm-llm-decision-flow.md @@ -0,0 +1,60 @@ +# SLM to LLM Decision Flow + +Status: Accepted + +## Context + +Small Language Models are used as the operational cognition layer, while Large Language Models perform high-value reasoning. 
+ +## Decision Flow + +```mermaid +flowchart TD + A[Incoming Request] + B[SLM Preprocess] + C[Intent Classification] + D[Policy Scan] + E[Tool Check] + F[Complexity Estimate] + G[Confidence Score] + + A --> B + B --> C + C --> D + D --> E + E --> F + F --> G + + G --> H{Policy violation?} + H -->|Yes| X[Block / Redact] + H -->|No| I{Simple task?} + + I -->|Yes| Y[Return SLM result] + I -->|No| J{Tool first?} + + J -->|Yes| K[Execute Tool] + K --> L[SLM Compress Result] + L --> M{Enough?} + + M -->|Yes| Y + M -->|No| N[Escalate] + + J -->|No| N + + N --> O[LLM Reasoning] + O --> P[Post-check] + P --> Q[Return Response] +``` + +## Consequences + +### Benefits + +- reduced inference cost +- lower latency +- improved throughput + +### Risks + +- incorrect routing +- model confidence calibration required diff --git a/docs/architecture/06-shared-contracts.md b/docs/architecture/06-shared-contracts.md new file mode 100644 index 0000000..4b19404 --- /dev/null +++ b/docs/architecture/06-shared-contracts.md @@ -0,0 +1,57 @@ +# Shared Contracts + +Status: Accepted + +## Routing Decision + +```json +{ + "intent": "string", + "complexity": "low|medium|high", + "risk_level": "low|medium|high|critical", + "policy_status": "allow|redact|deny|review", + "needs_tool": true, + "recommended_tier": "slm|llm", + "recommended_path": "direct|tool_first|mesh|escalate", + "confidence": 0.0 +} +``` + +## Model Usage Event + +```json +{ + "trace_id": "uuid", + "system": "ai-gateway", + "model_tier": "slm", + "model_name": "model-id", + "token_in": 320, + "token_out": 64, + "latency_ms": 41, + "estimated_cost": 0.0002 +} +``` + +## Tool Execution Event + +```json +{ + "trace_id": "uuid", + "tool_name": "azure_cli", + "action": "query_metrics", + "success": true, + "latency_ms": 820 +} +``` + +## Edge Escalation Packet + +```json +{ + "event_id": "uuid", + "site_id": "string", + "event_label": "rf_anomaly", + "summary": "Drone signature detected near perimeter", + "confidence": 0.78 +} +``` 
diff --git a/docs/architecture/07-repo-ownership-map.md b/docs/architecture/07-repo-ownership-map.md new file mode 100644 index 0000000..341323f --- /dev/null +++ b/docs/architecture/07-repo-ownership-map.md @@ -0,0 +1,28 @@ +# Repository Ownership Map + +Status: Accepted + +## Repository Map + +```mermaid +flowchart LR + R1[pvc-ai-gateway] --> S1[AI Gateway Service] + R2[cognitive-mesh] --> S2[Cognitive Mesh] + R3[codeflow-engine] --> S3[CodeFlow Engine] + R4[agentkit-forge] --> S4[AgentKit Forge] + R5[phoenixrooivalk] --> S5[Rooivalk Edge / Command] + R6[shared-contracts] --> S6[Shared Contracts] + R7[infra] --> S7[Infrastructure / Monitoring] +``` + +## Ownership + +| Repository | Owns | +| -------------------- | ------------------------------------------------------ | +| **AI Gateway** | request routing, policy enforcement, model abstraction | +| **Cognitive Mesh** | orchestration, multi-agent coordination | +| **CodeFlow Engine** | CI/CD intelligence, PR analysis | +| **AgentKit Forge** | tool-driven agents, execution runtime | +| **PhoenixRooivalk** | edge telemetry, operator alerts | +| **Shared Contracts** | telemetry schema, routing decisions, audit envelope | +| **Infrastructure** | Azure deployment, monitoring, networking | diff --git a/docs/architecture/README.md b/docs/architecture/README.md new file mode 100644 index 0000000..fae13c6 --- /dev/null +++ b/docs/architecture/README.md @@ -0,0 +1,214 @@ +# Architecture + +This directory contains system architecture documentation for the AI Gateway and related systems. 
+ +## Overview + +The architecture follows a layered approach combining: + +- **SLMs (Small Language Models)** for cost-effective routing, classification, and tool selection +- **LLMs** for complex reasoning and final synthesis + +### Canonical Principle + +> **Use SLMs to decide, filter, classify, compress, and prepare.** +> **Use LLMs to reason, reconcile, synthesize, and communicate.** + +## Documentation Structure + +``` +docs/architecture/ +├── README.md # This file +├── 01-system-context.md # ADR: System Context +├── 02-container-architecture.md # ADR: Container Architecture +├── 03-deployment-trust-boundaries.md # ADR: Deployment & Trust Boundaries +├── 04-observability-telemetry.md # ADR: Observability & Telemetry +├── 05-slm-llm-decision-flow.md # ADR: SLM→LLM Decision Flow +├── 06-shared-contracts.md # ADR: Shared Contracts +├── 07-repo-ownership-map.md # ADR: Repository Ownership +├── systems/ # Individual system documentation +│ ├── ai-gateway.md +│ ├── cognitive-mesh.md +│ ├── codeflow-engine.md +│ ├── agentkit-forge.md +│ ├── phoenix-rooivalk.md +│ └── mystira.md +└── reference/ # Reference and planning docs + ├── cross-system.md + ├── c4-architecture.md + ├── deployment-observability.md + ├── contracts.md + ├── operations-patterns.md + ├── dashboards.md + ├── slm-implementation-matrix.md + ├── slm-management-plan.md + ├── matrix-gateway.md + ├── matrix-cognitive-mesh.md + ├── matrix-codeflow.md + ├── matrix-agentkit.md + ├── matrix-rooivalk.md + ├── matrix-mystira.md + └── strategic/ # Strategic guidance + ├── README.md + ├── 01-why-slms-matter.md + ├── 02-gateway-slm-use-cases.md + ├── 03-cognitive-mesh-use-cases.md + ├── 04-codeflow-use-cases.md + ├── 05-agentkit-use-cases.md + ├── 06-rooivalk-use-cases.md + ├── 07-deployment-model.md + └── 08-implementation-order.md +``` + +docs/architecture/ +├── README.md # This file +├── systems/ # Individual system documentation +│ ├── ai-gateway.md +│ ├── cognitive-mesh.md +│ ├── codeflow-engine.md +│ ├── 
agentkit-forge.md +│ ├── phoenix-rooivalk.md +│ └── mystira.md +└── reference/ # Reference and planning docs +├── cross-system.md +├── c4-architecture.md +├── deployment-observability.md +├── contracts.md +├── operations-patterns.md +├── dashboards.md +├── slm-implementation-matrix.md +├── slm-management-plan.md +├── matrix-gateway.md +├── matrix-cognitive-mesh.md +├── matrix-codeflow.md +├── matrix-agentkit.md +├── matrix-rooivalk.md +├── matrix-mystira.md +└── strategic/ # Strategic guidance +├── README.md +├── 01-why-slms-matter.md +├── 02-gateway-slm-use-cases.md +├── 03-cognitive-mesh-use-cases.md +├── 04-codeflow-use-cases.md +├── 05-agentkit-use-cases.md +├── 06-rooivalk-use-cases.md +├── 07-deployment-model.md +└── 08-implementation-order.md + +``` + +docs/architecture/ +├── README.md # This file +├── systems/ # Individual system documentation +│ ├── ai-gateway.md +│ ├── cognitive-mesh.md +│ ├── codeflow-engine.md +│ ├── agentkit-forge.md +│ ├── phoenix-rooivalk.md +│ └── mystira.md +└── reference/ # Reference and planning docs +├── cross-system.md +├── slm-implementation-matrix.md +├── slm-management-plan.md +├── matrix-gateway.md +├── matrix-cognitive-mesh.md +├── matrix-codeflow.md +├── matrix-agentkit.md +├── matrix-rooivalk.md +├── matrix-mystira.md +└── strategic/ # Strategic guidance +├── README.md +├── 01-why-slms-matter.md +├── 02-gateway-slm-use-cases.md +├── 03-cognitive-mesh-use-cases.md +├── 04-codeflow-use-cases.md +├── 05-agentkit-use-cases.md +├── 06-rooivalk-use-cases.md +├── 07-deployment-model.md +└── 08-implementation-order.md + +``` + +### Systems + +- [systems/ai-gateway.md](systems/ai-gateway.md) - AI Gateway: SLM as admission control & routing +- [systems/cognitive-mesh.md](systems/cognitive-mesh.md) - Agent orchestration: routing, decomposition +- [systems/codeflow-engine.md](systems/codeflow-engine.md) - CI/CD intelligence: PR triage, log analysis +- [systems/agentkit-forge.md](systems/agentkit-forge.md) - Agent building: tool
selection, context compression +- [systems/phoenix-rooivalk.md](systems/phoenix-rooivalk.md) - Edge AI: SLM for reports only (NOT control) +- [systems/mystira.md](systems/mystira.md) - Story generation: SLM as moderation, age-fit, continuity layer + +### Reference + +- [reference/cross-system.md](reference/cross-system.md) - How all systems integrate +- [reference/c4-architecture.md](reference/c4-architecture.md) - C4-style diagrams (context, containers, sequences) +- [reference/deployment-observability.md](reference/deployment-observability.md) - Deployment, trust boundaries, observability +- [reference/contracts.md](reference/contracts.md) - Shared JSON schemas for telemetry and routing +- [reference/operations-patterns.md](reference/operations-patterns.md) - SLM→LLM decision flows, ownership, implementation +- [reference/dashboards.md](reference/dashboards.md) - Recommended Grafana/ADX dashboards +- [reference/slm-implementation-matrix.md](reference/slm-implementation-matrix.md) - Overview with threshold summary +- [reference/slm-management-plan.md](reference/slm-management-plan.md) - Cross-project SLM management + +### Strategic Guidance + +- [reference/strategic/README.md](reference/strategic/README.md) - Strategic SLM guidance index +- [reference/strategic/01-why-slms-matter.md](reference/strategic/01-why-slms-matter.md) - Executive summary +- [reference/strategic/02-gateway-slm-use-cases.md](reference/strategic/02-gateway-slm-use-cases.md) - AI Gateway use cases +- [reference/strategic/03-cognitive-mesh-use-cases.md](reference/strategic/03-cognitive-mesh-use-cases.md) - Cognitive Mesh use cases +- [reference/strategic/04-codeflow-use-cases.md](reference/strategic/04-codeflow-use-cases.md) - CodeFlow Engine use cases +- [reference/strategic/05-agentkit-use-cases.md](reference/strategic/05-agentkit-use-cases.md) - AgentKit Forge use cases +- [reference/strategic/06-rooivalk-use-cases.md](reference/strategic/06-rooivalk-use-cases.md) - PhoenixRooivalk use cases 
+- [reference/strategic/07-deployment-model.md](reference/strategic/07-deployment-model.md) - Deployment model +- [reference/strategic/08-implementation-order.md](reference/strategic/08-implementation-order.md) - Implementation order + +## Quick Reference + +| System | SLM Role | Key Document | +| --------------- | ----------------------------------------- | ---------------------------------------------------------- | +| AI Gateway | routing, policy checks, cost prediction | [systems/ai-gateway.md](systems/ai-gateway.md) | +| Cognitive Mesh | agent routing, task decomposition | [systems/cognitive-mesh.md](systems/cognitive-mesh.md) | +| PhoenixRooivalk | **operator summaries only** | [systems/phoenix-rooivalk.md](systems/phoenix-rooivalk.md) | +| CodeFlow Engine | CI intelligence, log analysis | [systems/codeflow-engine.md](systems/codeflow-engine.md) | +| AgentKit Forge | tool selection, context compression | [systems/agentkit-forge.md](systems/agentkit-forge.md) | +| Mystira | story classification, moderation, age-fit | [systems/mystira.md](systems/mystira.md) | + +## Implementation Order + +1. **AI Gateway SLM router** — Highest immediate cost-leverage +2. **CodeFlow Engine CI/PR classifier** — Fastest operational value +3. **Cognitive Mesh decomposer/router** — Strong leverage once taxonomy stabilizes +4. **AgentKit Forge tool selector** — Useful once tool inventory is mature +5. **PhoenixRooivalk operator interpreter** — Valuable, keep isolated from critical control +6. 
**Mystira story control layer** — For child-safe story generation with SLM-based moderation + +## Tiered Model Strategy + +| Tier | Use For | Examples | +| ------ | --------------------- | --------------------------------------------- | +| Tier 0 | deterministic/non-LLM | regex, schemas, policies | +| Tier 1 | SLM | classification, decomposition, tool selection | +| Tier 2 | LLM | synthesis, complex reasoning | + +## Diagram Tools + +This documentation uses **Mermaid** for inline diagrams (rendered in VS Code, GitHub, etc.). + +For high-quality published diagrams, consider: + +- **Figma MCP** - AI-powered Figma integration via VS Code extension +- **Mermaid Live Editor** - Online Mermaid diagram editing +- **Draw.io** - Traditional diagram editor + +### Using Figma MCP for Architecture Diagrams + +The [MCP Figma VS Code extension](https://github.com/sethdford/mcp-figma) enables AI-assisted diagram creation: + +1. Install the extension in VS Code +2. Configure MCP server for your AI assistant +3. Use AI to generate and edit architecture diagrams in Figma + +This is useful for creating polished, branded diagrams for presentations and documentation. + +``` + +``` diff --git a/docs/architecture/reference/c4-architecture.md b/docs/architecture/reference/c4-architecture.md new file mode 100644 index 0000000..0ce3c98 --- /dev/null +++ b/docs/architecture/reference/c4-architecture.md @@ -0,0 +1,332 @@ +# C4-Style Architecture + +This section provides C4-style diagrams showing system context, containers, and key sequences. + +## 1. System Context + +This shows the major external actors and the five core systems. 
+ +```mermaid +flowchart TB + User[Users / Operators / Developers] + Apps[Client Apps / Internal Portals / APIs] + GitHub[GitHub / CI Events / PRs / Issues] + Sensors[PhoenixRooivalk Sensors / RF / EO / Radar / Telemetry] + Providers[Model Providers / Hosted Models] + Tools[Azure / Terraform / Kusto / GitHub APIs / Internal Tools] + + subgraph Platform["PhoenixVC AI Platform"] + AIG[AI Gateway] + CM[Cognitive Mesh] + CFE[CodeFlow Engine] + AKF[AgentKit Forge] + PR[PhoenixRooivalk Edge + Command Layer] + end + + User --> AIG + Apps --> AIG + GitHub --> CFE + Sensors --> PR + + AIG --> CM + AIG --> CFE + AIG --> AKF + AIG --> PR + + CM --> Providers + CFE --> Providers + AKF --> Providers + AIG --> Providers + + CM --> Tools + CFE --> Tools + AKF --> Tools + PR --> AIG +``` + +### External Actors + +| Actor | Role | +| ------------------------------ | ------------------------------------------ | +| Users / Operators / Developers | Initiate requests, reviews, investigations | +| Apps / APIs | Consume AI control plane programmatically | +| GitHub | Triggers software delivery workflows | +| Sensors | Produce edge telemetry | +| Model Providers | Serve LLM/SLM inference | +| Tools | Execution surfaces, enterprise integration | + +### System Roles + +| System | Role | +| --------------- | -------------------------------------------- | +| AI Gateway | Front door, routing, policy, budget, caching | +| Cognitive Mesh | Multi-agent coordination and synthesis | +| CodeFlow Engine | SDLC/CI intelligence | +| AgentKit Forge | Tool-driven agent execution | +| PhoenixRooivalk | Edge detection interpretation | + +--- + +## 2. 
Container Diagram + +```mermaid +flowchart TB + subgraph Clients["Clients / Event Sources"] + C1[Web UI / Chat UI] + C2[Internal Apps / APIs] + C3[GitHub Webhooks] + C4[Operator Console] + end + + subgraph Gateway["AI Gateway"] + G1[Ingress API] + G2[SLM Classifier] + G3[Policy Scan] + G4[Budget Router] + G5[Semantic Cache] + G6[Escalation Judge] + end + + subgraph Mesh["Cognitive Mesh"] + M1[Specialist Router] + M2[Task Decomposer] + M3[State Manager] + M4[Synthesis Coordinator] + end + + subgraph Forge["AgentKit Forge"] + F1[Tool Selector] + F2[Argument Extractor] + F3[Execution Loop] + F4[Result Compressor] + end + + subgraph CodeFlow["CodeFlow Engine"] + CF1[PR / Diff Classifier] + CF2[Risk Scorer] + CF3[CI Failure Triage] + CF4[Review / Action Engine] + end + + subgraph Shared["Shared Platform Services"] + S1[Policy Engine] + S2[Observability] + S3[State Store] + S4[Vector Store] + S5[Tool Broker] + end + + subgraph Models["Model Tier"] + ML1[SLM Pool] + ML2[LLM Pool] + end + + subgraph Edge["PhoenixRooivalk Edge"] + E1[Detection Pipeline] + E2[Edge SLM Event Labeler] + E3[Edge SLM Summarizer] + E4[Edge Escalation Filter] + end + + C1 --> G1 + C2 --> G1 + C3 --> CF1 + C4 --> G1 + + G1 --> G2 + G2 --> G3 + G3 --> G4 + G4 --> G5 + G5 --> G6 + + G6 --> M1 + G6 --> F1 + G6 --> CF1 + G6 --> ML2 + + M1 --> M2 + M2 --> M3 + M3 --> M4 + + F1 --> F2 + F2 --> F3 + F3 --> F4 + + CF1 --> CF2 + CF2 --> CF3 + CF3 --> CF4 + + G3 --> S1 + G6 --> S2 + M3 --> S3 + G5 --> S3 + G5 --> S4 + F3 --> S5 + CF4 --> S5 + + E1 --> E2 + E2 --> E3 + E3 --> E4 + E4 --> G1 +``` + +### Container Responsibilities + +#### AI Gateway + +| Container | Responsibility | +| ---------------- | -------------------------------- | +| Ingress API | Entry point | +| SLM Classifier | Intent/complexity classification | +| Policy Scan | Safety/compliance gate | +| Budget Router | Tier selection | +| Semantic Cache | Avoid redundant inference | +| Escalation Judge | Small-vs-large decision | + +#### Cognitive 
Mesh + +| Container | Responsibility | +| --------------------- | ---------------- | +| Specialist Router | Picks agent(s) | +| Task Decomposer | Splits work | +| State Manager | Compressed state | +| Synthesis Coordinator | Merge + escalate | + +#### AgentKit Forge + +| Container | Responsibility | +| ------------------ | ------------------ | +| Tool Selector | Chooses tool | +| Argument Extractor | Structured inputs | +| Execution Loop | Run/retry/fallback | +| Result Compressor | Distills output | + +#### CodeFlow Engine + +| Container | Responsibility | +| -------------------- | ------------------- | +| PR/Diff Classifier | File classification | +| Risk Scorer | Risk assessment | +| CI Failure Triage | Failure bucketing | +| Review/Action Engine | Routing/actions | + +#### PhoenixRooivalk Edge + +| Container | Responsibility | +| ---------------------- | ------------------ | +| Detection Pipeline | Signal processing | +| Edge Event Labeler | Labels events | +| Edge Summarizer | Operator summaries | +| Edge Escalation Filter | Cloud escalation | + +--- + +## 3. CodeFlow Sequence + +```mermaid +sequenceDiagram + participant GH as GitHub + participant CF as CodeFlow + participant SLM as SLM Tier + participant TO as CI / Tool Broker + participant GW as AI Gateway + participant LLM as LLM Tier + + GH->>CF: PR opened / updated + CF->>SLM: classify files + intent + SLM-->>CF: infra-change, high risk + + CF->>TO: trigger CI / contract checks + TO-->>CF: logs, results + + CF->>SLM: triage failures + SLM-->>CF: breaking change detected + + CF->>GW: request remediation + GW->>LLM: analyze + explain + LLM-->>GW: remediation steps + GW-->>CF: response + + CF-->>GH: PR comment with findings +``` + +### SLM Handles + +- File classification +- Risk scoring +- Log bucketing +- Cause identification + +### LLM Handles + +- Remediation proposals +- Tradeoff explanation +- Evidence synthesis + +--- + +## 4. 
PhoenixRooivalk Sequence + +```mermaid +sequenceDiagram + participant Sensors + participant DP as Detection Pipeline + participant ESLM as Edge SLM + participant OC as Operator Console + participant GW as AI Gateway + participant CM as Cognitive Mesh + participant LLM as Cloud LLM + + Sensors->>DP: raw detections + DP->>ESLM: normalized event + ESLM-->>DP: label + summary + confidence + + DP->>OC: local alert + + alt Below threshold + DP->>OC: local record + else Above threshold + DP->>GW: compressed bundle + GW->>CM: route to workflow + CM->>LLM: deep analysis + LLM-->>CM: interpretation + CM-->>GW: response + GW-->>OC: escalated advisory + end +``` + +### Design Intent + +- Label events +- Summarize meaning +- Suppress noise +- Conserve bandwidth +- Escalate only when justified + +--- + +## 5. C4 Narrative + +### System Context + +The platform provides a unified AI control plane for developer workflows, agent orchestration, and edge intelligence. + +### Container View + +| Layer | Description | +| --------------- | ---------------------------------------- | +| Control-plane | Classification, policy, routing, caching | +| Execution | Orchestration, tools, CI, edge | +| Shared services | Policy, retrieval, memory, telemetry | +| Model | SLM and LLM workloads | +| Edge | Local interpretation + escalation | + +### Dynamic Patterns + +| Pattern | System | Description | +| -------------- | --------------- | -------------------- | +| Gateway triage | AI Gateway | Selective escalation | +| Repo triage | CodeFlow | Remediation | +| Multi-agent | Cognitive Mesh | State compression | +| Tool loops | AgentKit Forge | Result distillation | +| Edge-first | PhoenixRooivalk | Threshold escalation | diff --git a/docs/architecture/reference/contracts.md b/docs/architecture/reference/contracts.md new file mode 100644 index 0000000..7379030 --- /dev/null +++ b/docs/architecture/reference/contracts.md @@ -0,0 +1,117 @@ +# Shared Contracts + +Standardized JSON schemas used across all 
systems for consistent telemetry, routing, and event handling. + +--- + +## RoutingDecision + +Emitted for every routing decision in the gateway. + +```json +{ + "intent": "string", + "complexity": "low|medium|high", + "risk_level": "low|medium|high|critical", + "policy_status": "allow|redact|deny|review", + "needs_tool": true, + "recommended_tier": "slm|llm", + "recommended_path": "direct|tool_first|mesh|escalate", + "confidence": 0.0 +} +``` + +| Field | Type | Description | +| ---------------- | ------- | --------------------------------------------- | +| intent | string | Classified intent (e.g., "ci_failure_triage") | +| complexity | enum | Estimated task complexity | +| risk_level | enum | Risk assessment | +| policy_status | enum | Policy engine result | +| needs_tool | boolean | Whether tool invocation is required | +| recommended_tier | enum | SLM or LLM recommendation | +| recommended_path | enum | Execution path recommendation | +| confidence | float | 0.0-1.0 confidence score | + +--- + +## ModelUsageEvent + +Emitted for every model invocation for cost tracking and quality analysis. + +```json +{ + "trace_id": "uuid", + "system": "ai-gateway", + "model_tier": "slm", + "model_name": "phi-4-mini", + "token_in": 320, + "token_out": 64, + "latency_ms": 41, + "estimated_cost": 0.0002 +} +``` + +| Field | Type | Description | +| -------------- | ------ | ---------------------------- | +| trace_id | uuid | Distributed trace identifier | +| system | string | Originating system | +| model_tier | enum | slm or llm | +| model_name | string | Specific model used | +| token_in | int | Input tokens | +| token_out | int | Output tokens | +| latency_ms | int | Response time | +| estimated_cost | float | Estimated cost in USD | + +--- + +## ToolExecutionEvent + +Emitted for every tool invocation through the Tool Broker. 
+ +```json +{ + "trace_id": "uuid", + "tool_name": "azure_cli", + "action": "monitor_query", + "success": true, + "latency_ms": 820, + "retry_count": 1 +} +``` + +| Field | Type | Description | +| ----------- | ------- | ---------------------------- | +| trace_id | uuid | Distributed trace identifier | +| tool_name | string | Tool identifier | +| action | string | Action performed | +| success | boolean | Execution outcome | +| latency_ms | int | Execution time | +| retry_count | int | Number of retries | + +--- + +## EdgeEscalationPacket + +Compressed escalation from PhoenixRooivalk edge nodes. + +```json +{ + "event_id": "uuid", + "site_id": "string", + "event_label": "rf_anomaly", + "summary": "Consumer quadcopter signature near perimeter", + "confidence": 0.77, + "telemetry_refs": ["blob://..."], + "requires_cloud_analysis": true +} +``` + +| Field | Type | Description | +| ----------------------- | ------- | --------------------------------- | +| event_id | uuid | Unique event identifier | +| site_id | string | Edge site identifier | +| event_label | string | Classified event type | +| summary | string | Compressed human-readable summary | +| confidence | float | 0.0-1.0 confidence score | +| telemetry_refs | array | Blob references for raw telemetry | +| requires_cloud_analysis | boolean | Needs LLM-level analysis | diff --git a/docs/architecture/reference/cross-system.md b/docs/architecture/reference/cross-system.md new file mode 100644 index 0000000..a02ba19 --- /dev/null +++ b/docs/architecture/reference/cross-system.md @@ -0,0 +1,503 @@ +# Cross-System Architecture + +This document describes the unified production architecture that separates: + +- Control plane vs execution plane +- SLM tier vs LLM tier +- Cloud vs edge +- Policy, observability, cache, and cost controls + +## Unified Production Architecture + +```mermaid +flowchart TB + subgraph Clients["Ingress Sources"] + U1[Users] + U2[Developers / PR Events] + U3[Apps / APIs] + U4[Operators / Mission 
Console] + U5[Sensors / Telemetry] + end + + subgraph Cloud["Cloud Control Plane"] + GW[AI Gateway] + + subgraph SLMCP["SLM Control Tier"] + S1[Intent + Complexity Classifier] + S2[Policy / PII / Secret / Injection Scan] + S3[Cost + Latency Router] + S4[Semantic Cache Admission / Reuse] + S5[Context Compressor] + S6[Escalation Judge] + end + + subgraph Orchestration["Orchestration Services"] + CM[Cognitive Mesh] + AF[AgentKit Forge] + CF[CodeFlow Engine] + end + + subgraph SharedServices["Shared Platform Services"] + POL[Policy Engine] + OBS[Observability / Telemetry / Audit] + BUD[Budget + Rate Controls] + MEM[State Store / Memory / Session Context] + VC[Vector Store / Retrieval] + TOOLS[Tools / APIs / CLI / GitHub / Azure / Kusto / Terraform] + end + + subgraph LLMZone["Deep Reasoning Tier"] + L1[Reasoning LLM] + L2[Code / Analysis LLM] + L3[Research / Synthesis LLM] + end + + subgraph Providers["Provider Layer"] + P1[OpenAI / Azure OpenAI] + P2[Other Model Providers] + P3[Local Hosted Models] + end + end + + subgraph Edge["PhoenixRooivalk Edge Plane"] + RP[Signal / Detection Pipeline] + ER1[Edge SLM: Event Labeler] + ER2[Edge SLM: Threat Summarizer] + ER3[Edge SLM: Alert Composer] + ER4[Edge SLM: Escalation Filter] + OC[Operator Console] + end + + U1 --> GW + U2 --> GW + U3 --> GW + U4 --> GW + U5 --> RP + + GW --> S1 + S1 --> S2 + S2 --> S3 + S3 --> S4 + S4 --> S5 + S5 --> S6 + + S2 --> POL + S3 --> BUD + S4 --> MEM + S5 --> VC + S6 --> OBS + + S6 --> CM + S6 --> AF + S6 --> CF + S6 --> L1 + S6 --> L2 + S6 --> L3 + + CM --> MEM + CM --> TOOLS + CM --> L1 + + AF --> MEM + AF --> TOOLS + AF --> L2 + + CF --> MEM + CF --> TOOLS + CF --> L2 + + L1 --> P1 + L2 --> P1 + L3 --> P2 + L2 --> P3 + + RP --> ER1 + ER1 --> ER2 + ER2 --> ER3 + ER3 --> OC + ER2 --> ER4 + ER4 --> GW +``` + +## System Responsibilities + +### AI Gateway + +The front door that owns: + +- Request intake +- Classification +- Safety checks +- Budget-aware routing +- Cache decisions +- Escalation 
decisions + +### Cognitive Mesh + +The orchestration brain for multi-agent work: + +- Specialist routing +- Decomposition +- Shared state coordination + +### AgentKit Forge + +The tool execution runtime: + +- Tool selection +- Parameter extraction +- Execution loops + +### CodeFlow Engine + +The CI/CD intelligence plane: + +- PR/diff triage +- CI failure bucketing +- Contract breakage interpretation + +### PhoenixRooivalk + +The edge interpretation plane: + +- Event labeling +- Operator alert generation +- Low-bandwidth summaries + +--- + +## Control Plane vs Execution Plane + +```mermaid +flowchart LR + subgraph CP["Control Plane"] + A[AI Gateway] + B[SLM Routing] + C[Policy Engine] + D[Budget Controls] + E[Observability] + F[State / Memory] + end + + subgraph EP["Execution Plane"] + G[Cognitive Mesh] + H[AgentKit Forge] + I[CodeFlow Engine] + J[LLM Providers] + K[Tools / APIs] + L[PhoenixRooivalk Edge] + end + + A --> B + B --> G + B --> H + B --> I + B --> J + G --> K + H --> K + I --> K + L --> A + C --> A + D --> A + E --> A + F --> G + F --> H + F --> I +``` + +--- + +## SLM Tier vs LLM Tier + +```mermaid +flowchart TD + IN[Request / Event / Telemetry] --> SLM[SLM Tier] + + subgraph SLMOps["SLM Responsibilities"] + S1[Classify] + S2[Screen] + S3[Route] + S4[Compress] + S5[Validate] + S6[Triage] + end + + SLM --> S1 + SLM --> S2 + SLM --> S3 + SLM --> S4 + SLM --> S5 + SLM --> S6 + + S3 --> D{Escalate?} + D -->|No| OUT1[Fast / Cheap Response] + D -->|Yes| LLM[LLM Tier] + + subgraph LLMOps["LLM Responsibilities"] + L1[Deep reasoning] + L2[Complex synthesis] + L3[Ambiguous tradeoffs] + L4[Novel plan generation] + end + + LLM --> L1 + LLM --> L2 + LLM --> L3 + LLM --> L4 + LLM --> OUT2[High-value response] +``` + +--- + +## Practical Request Path (AI Gateway) + +```mermaid +sequenceDiagram + participant C as Client + participant G as AI Gateway + participant S as SLM Layer + participant T as Tools + participant M as Mesh + participant L as LLM + participant O as 
Observability + + C->>G: Request + G->>S: classify + scan + estimate complexity + S-->>G: route decision + confidence + G->>O: log request metadata + + alt Simple + G-->>C: direct low-cost response + else Tool-first + G->>M: dispatch task + M->>T: execute tools + T-->>M: tool results + M->>S: compress results + S-->>M: compact state + M-->>C: response + else Complex + G->>L: escalate with compact context + L-->>G: deep reasoning output + G-->>C: final response + end +``` + +--- + +## CodeFlow Engine CI Path + +```mermaid +flowchart TD + PR[PR / Push / Issue Event] --> C1[SLM Diff Classifier] + C1 --> C2[SLM Risk Scorer] + C2 --> C3[SLM Test Impact Predictor] + + C3 --> D{Path} + D -->|low risk| F1[Fast checks] + D -->|high risk| F2[Full CI / security / contract tests] + D -->|uncertain| F3[LLM or human review gate] + + F1 --> L[CI Logs] + F2 --> L + F3 --> L + + L --> T1[SLM Failure Triage] + T1 --> T2[SLM Comment Draft / Routing] + T2 --> T3[Action: retry / assign / block / suggest fix] +``` + +--- + +## AgentKit Forge Tool Loop + +```mermaid +flowchart LR + A[Task] --> B[SLM Tool Selector] + B --> C[Select Tool + Args] + + C --> D1[GitHub] + C --> D2[Azure] + C --> D3[Terraform] + C --> D4[Kusto] + C --> D5[Docs / Files] + + D1 --> E[SLM Result Compressor] + D2 --> E + D3 --> E + D4 --> E + D5 --> E + + E --> F{Enough?} + F -->|yes| G[Return answer] + F -->|no| H[Escalate to LLM / Mesh] +``` + +--- + +## PhoenixRooivalk Edge Path + +```mermaid +sequenceDiagram + participant S as Sensors + participant P as Detection Pipeline + participant E as Edge SLM + participant O as Operator Console + participant C as Cloud Gateway + + S->>P: RF / EO / radar / telemetry + P->>E: normalized event packet + E-->>P: label + summary + confidence + P->>O: operator alert + + alt threshold exceeded + P->>C: send compressed evidence bundle + else local-only event + P->>O: keep local record + end +``` + +--- + +## Layer Responsibilities + +| Layer | Primary | SLM Role | LLM Role | +| 
------------- | ------------------------------ | ----------------------- | -------------------- | +| Edge | PhoenixRooivalk | Reports only | None | +| Gateway | AI Gateway | Routing, security, cost | Complex reasoning | +| Orchestration | Cognitive Mesh, AgentKit Forge | Routing, tools | Synthesis | +| Intelligence | CodeFlow Engine | Triage | None | +| Synthesis | LLM Layer | None | Reasoning, synthesis | + +--- + +## Ownership Boundaries + +### AI Gateway owns + +- Ingress control +- Policy enforcement +- Routing +- Cost governance +- Model/provider abstraction +- Shared telemetry + +### Cognitive Mesh owns + +- Multi-agent coordination +- Task decomposition +- State fusion +- Escalation into deep synthesis + +### AgentKit Forge owns + +- Tool loops +- Action execution +- Extraction +- Retry/fallback behavior + +### CodeFlow Engine owns + +- Software delivery intelligence +- Repo event interpretation +- CI analysis +- Developer feedback automation + +### PhoenixRooivalk owns + +- Edge summarization +- Local alerting +- Compressed event escalation + +--- + +## Implementation Phases + +### Phase 1 — Gateway-first + +Build SLM control plane: intent classifier, policy scanner, budget router, cache gate, escalation judge + +### Phase 2 — CodeFlow Engine + +Add SLMs: diff classifier, PR risk scorer, CI failure bucketer + +### Phase 3 — AgentKit Forge + +Optimize tool loops: tool selector, arg extractor, result compressor + +### Phase 4 — Cognitive Mesh + +Add: specialist router, decomposer, state manager + +### Phase 5 — PhoenixRooivalk + +Deploy edge SLMs: event label, alert text, escalation filter + +--- + +## Shared Telemetry Schema + +```json +{ + "trace_id": "uuid", + "system": "ai-gateway|cognitive-mesh|codeflow-engine|agentkit-forge|phoenixrooivalk", + "stage": "classify|route|tool_call|llm_escalation|edge_alert", + "model_tier": "slm|llm", + "model_name": "example-model", + "decision": "allow|block|tool_first|escalate|local_only", + "confidence": 0.92, + 
"latency_ms": 83, + "token_in": 540, + "token_out": 96, + "estimated_cost": 0.0014, + "policy_flags": ["pii:none", "secret:none"], + "outcome": "success" +} +``` + +--- + +## Production Rules + +### Escalate to LLM when: + +- Confidence below threshold +- Ambiguity above threshold +- Multiple specialists disagree +- Tool results conflict +- Output is user-facing and high-stakes +- Architecture/tradeoff reasoning required + +### Stay in SLM path when: + +- Task is classification +- Task is screening +- Task is extraction +- Task is summarization +- Task is repetitive CI triage +- Task is edge-local operator support + +--- + +## C4-Style Architecture + +For detailed C4-style diagrams including: + +- System Context diagram +- Container diagram +- CodeFlow sequence +- PhoenixRooivalk edge-to-cloud sequence + +See [c4-architecture.md](c4-architecture.md) + +--- + +## Bottom Line + +The most practical target architecture: + +- **AI Gateway** as the centralized SLM control plane +- **Cognitive Mesh / AgentKit Forge / CodeFlow Engine** as execution systems +- **PhoenixRooivalk** as edge plane with local SLM autonomy +- **LLMs** reserved for synthesis, ambiguity, and hard reasoning + +> Gateway governs. SLMs triage and steer. Specialist systems execute. LLMs arbitrate the hard cases. Edge stays local unless escalation is justified. diff --git a/docs/architecture/reference/dashboards.md b/docs/architecture/reference/dashboards.md new file mode 100644 index 0000000..9e70f36 --- /dev/null +++ b/docs/architecture/reference/dashboards.md @@ -0,0 +1,17 @@ +# Recommended Dashboards + +Grafana/ADX dashboard recommendations for operational visibility. 
+ +--- + +## Dashboard Pack + +Split Grafana/ADX dashboards into these boards: + +| Dashboard | Metrics | +| -------------------- | ------------------------------------------------------------------------------------- | +| **Executive / Cost** | Total requests, SLM vs LLM ratio, cost by route, cost per outcome, escalation rate | +| **Reliability** | Error rate, tool failure rate, retry hotspots, provider latency, queue backlog | +| **Governance** | Policy blocks, redaction counts, provider data-boundary usage, audit completeness | +| **CodeFlow** | PR risk distribution, CI triage buckets, contract-break suspects, feedback usefulness | +| **Rooivalk** | Detections vs alerts, local vs escalated, site alert volume, edge latency | diff --git a/docs/architecture/reference/deployment-observability.md b/docs/architecture/reference/deployment-observability.md new file mode 100644 index 0000000..f237e21 --- /dev/null +++ b/docs/architecture/reference/deployment-observability.md @@ -0,0 +1,383 @@ +# Deployment, Trust Boundaries & Observability + +This set extends the C4 view into operational architecture including deployment, security boundaries, and telemetry. + +--- + +## 1. Deployment Diagram + +This is the practical cloud/edge deployment shape for your stack. 
+ +```mermaid +flowchart TB + subgraph Internet["Public / External"] + U1[Users / Browsers / Chat Clients] + U2[GitHub Webhooks] + U3[External APIs / Apps] + MP[Model Providers] + end + + subgraph Azure["Azure Subscription"] + DNS[Azure DNS / Front Door / App Gateway] + KV[Key Vault] + LAW[Log Analytics] + ADX[Azure Data Explorer / Kusto] + BLOB[Blob Storage] + REDIS[Redis / Cache] + DB[Postgres / Cosmos / State DB] + AISEARCH[Vector Store / AI Search] + GRAF[Grafana] + BUS[Service Bus / Queue] + MON[Azure Monitor / App Insights] + + subgraph Runtime["Runtime Plane"] + GW[AI Gateway] + CM[Cognitive Mesh] + AKF[AgentKit Forge] + CFE[CodeFlow Engine] + TB[Tool Broker] + OPA[Policy Engine] + end + + subgraph Workers["Background / Event Workers"] + W1[PR / CI Worker] + W2[Agent Task Worker] + W3[Telemetry Ingest Worker] + W4[Cost / Audit Aggregator] + end + + subgraph Models["Hosted Model Zone"] + SLM[SLM Serving Pool] + LLM[LLM Adapter / Provider Proxy] + end + end + + subgraph Edge["PhoenixRooivalk Edge Sites"] + SENS[RF / EO / Radar / Telemetry Sensors] + EDGEPIPE[Detection Pipeline] + E1[Edge SLM Event Labeler] + E2[Edge SLM Summarizer] + E3[Edge Escalation Filter] + OPC[Operator Console] + SYNC[Secure Sync Agent] + end + + U1 --> DNS + U2 --> DNS + U3 --> DNS + DNS --> GW + + GW --> REDIS + GW --> DB + GW --> AISEARCH + GW --> KV + GW --> OPA + GW --> TB + GW --> SLM + GW --> LLM + GW --> MON + + CM --> DB + CM --> BUS + CM --> AISEARCH + CM --> TB + CM --> SLM + CM --> LLM + CM --> MON + + AKF --> DB + AKF --> BUS + AKF --> TB + AKF --> SLM + AKF --> LLM + AKF --> MON + + CFE --> DB + CFE --> BUS + CFE --> TB + CFE --> SLM + CFE --> LLM + CFE --> MON + + W1 --> CFE + W2 --> AKF + W3 --> GW + W4 --> ADX + + MON --> LAW + LAW --> ADX + BLOB --> ADX + ADX --> GRAF + + MP --> LLM + + SENS --> EDGEPIPE + EDGEPIPE --> E1 + E1 --> E2 + E2 --> E3 + E2 --> OPC + E3 --> SYNC + SYNC --> GW +``` + +### Practical Reading of Deployment + +| Zone | Components | Purpose | +| 
-------------------- | --------------------------------------------------------- | ----------------------- | +| **Front door** | Azure DNS / Front Door / App Gateway | Ingress and routing | +| **Shared backing** | Key Vault, Redis, Postgres/Cosmos, AI Search, Service Bus | State, caching, secrets | +| **Runtime services** | AI Gateway, Cognitive Mesh, AgentKit Forge, CodeFlow | Core execution | +| **Workers** | PR/CI, Agent Task, Telemetry, Cost Aggregators | Background processing | +| **Model zone** | SLM Pool, LLM Adapter | AI inference | +| **Edge** | Detection Pipeline, Edge SLMs, Operator Console | Local operation | + +--- + +## 2. Trust Boundary Diagram + +This is the security-relevant segmentation. + +```mermaid +flowchart LR + subgraph TB1["Boundary 1: Public / Untrusted"] + A[Users / Browsers] + B[GitHub Webhooks] + C[External Apps] + D[Internet Traffic] + end + + subgraph TB2["Boundary 2: Controlled Ingress"] + E[Front Door / API Gateway / WAF] + F[AI Gateway] + end + + subgraph TB3["Boundary 3: Internal Control Plane"] + G[Policy Engine] + H[Budget / Rate Controls] + I[Session / State Store] + J[Semantic Cache] + K[Observability / Audit] + end + + subgraph TB4["Boundary 4: Internal Execution Plane"] + L[Cognitive Mesh] + M[AgentKit Forge] + N[CodeFlow Engine] + O[Tool Broker] + end + + subgraph TB5["Boundary 5: Sensitive Integration Zone"] + P[Key Vault] + Q[Azure APIs] + R[GitHub APIs] + S[Kusto / Terraform / Internal Tools] + end + + subgraph TB6["Boundary 6: External Model Providers"] + T[LLM Providers] + U[Hosted / External SLM Providers] + end + + subgraph TB7["Boundary 7: Edge / Field Environment"] + V[PhoenixRooivalk Edge Node] + W[Sensors] + X[Operator Console] + end + + A --> E + B --> E + C --> E + D --> E + E --> F + + F --> G + F --> H + F --> I + F --> J + F --> K + + F --> L + F --> M + F --> N + L --> O + M --> O + N --> O + + O --> Q + O --> R + O --> S + F --> P + L --> P + M --> P + N --> P + + F --> T + F --> U + L --> T + M --> T + N 
--> T + + W --> V + V --> X + V --> F +``` + +### Security Interpretation + +| Boundary | Description | +| --------- | ----------------------------------------------------------------------------------------- | +| **1 → 2** | Treat all inbound as hostile until authenticated, rate-limited, schema-validated, logged | +| **2 → 3** | AI Gateway is the only entry into internal AI control plane | +| **3 → 4** | Control-plane services decide policy, routing, cost, escalation | +| **4 → 5** | Sensitive zone: credentials, infra mutation, production APIs, write actions | +| **6** | External providers are semi-trusted - apply output scanning and redaction | +| **7** | Edge nodes are partially disconnected - need signed software, local audit, encrypted sync | + +--- + +## 3. Observability Architecture + +This is the unified telemetry design across all systems. + +```mermaid +flowchart TB + subgraph Producers["Telemetry Producers"] + P1[AI Gateway] + P2[Cognitive Mesh] + P3[AgentKit Forge] + P4[CodeFlow Engine] + P5[PhoenixRooivalk Edge] + P6[Tool Broker] + P7[Policy Engine] + end + + subgraph Signals["Signal Types"] + S1[Request / Response Logs] + S2[Routing Decisions] + S3[Policy Events] + S4[Tool Calls] + S5[Model Usage] + S6[CI / PR Events] + S7[Edge Detection Events] + S8[Cost / Token Metrics] + S9[Audit Trail] + end + + subgraph Ingest["Ingestion"] + I1[OpenTelemetry Collectors] + I2[Azure Monitor / App Insights] + I3[Blob Export] + I4[Log Analytics] + end + + subgraph Analytics["Analytics / Query"] + A1[Azure Data Explorer / Kusto] + A2[Cost Aggregates] + A3[Decision Quality Metrics] + A4[Security / Audit Views] + end + + subgraph Viz["Visualization / Alerting"] + V1[Grafana Dashboards] + V2[Alerts / On-call] + V3[Ops Runbooks] + V4[Executive Cost Views] + end + + P1 --> S1 + P1 --> S2 + P1 --> S5 + P1 --> S8 + P1 --> S9 + + P2 --> S2 + P2 --> S4 + P2 --> S5 + P2 --> S9 + + P3 --> S4 + P3 --> S5 + P3 --> S9 + + P4 --> S6 + P4 --> S2 + P4 --> S5 + P4 --> S9 + + P5 --> S7 
+ P5 --> S2 + P5 --> S9 + + P6 --> S4 + P7 --> S3 + + S1 --> I1 + S2 --> I1 + S3 --> I1 + S4 --> I1 + S5 --> I2 + S6 --> I2 + S7 --> I3 + S8 --> I2 + S9 --> I4 + + I1 --> A1 + I2 --> A1 + I3 --> A1 + I4 --> A1 + + A1 --> A2 + A1 --> A3 + A1 --> A4 + + A2 --> V1 + A3 --> V1 + A4 --> V1 + V1 --> V2 + V1 --> V3 + V1 --> V4 +``` + +### What to Measure + +#### Gateway metrics + +- Requests by route +- SLM vs LLM escalation rate +- Confidence distribution +- Token in/out averages +- Semantic cache hit rate +- Refusal/block counts +- Provider latency/error rate + +#### Cognitive Mesh metrics + +- Route-to-specialist distribution +- Decomposition count per task +- Summary compression ratio +- Multi-agent disagreement rate +- Escalation rate to LLM synthesis + +#### AgentKit Forge metrics + +- Tool selection accuracy +- Retry counts +- Fallback frequency +- Avg tool-loop depth +- Tool output compression ratio + +#### CodeFlow Engine metrics + +- PR classification distribution +- False positive/negative on risk tier +- CI failure bucket frequency +- Contract-break detection precision +- Comment usefulness feedback + +#### PhoenixRooivalk metrics + +- Local-only vs escalated events +- Edge summary latency +- Alert volume per session +- Signal-to-alert compression ratio +- Dropped/deferred syncs diff --git a/docs/architecture/reference/matrix-agentkit.md b/docs/architecture/reference/matrix-agentkit.md new file mode 100644 index 0000000..ef3cd79 --- /dev/null +++ b/docs/architecture/reference/matrix-agentkit.md @@ -0,0 +1,110 @@ +# AgentKit Forge SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ----------------------- | ------ | ------------------------------------------------- | +| `/slm/select-tool` | POST | Maps request to GitHub/Azure/Terraform/Kusto/docs | +| `/slm/filter-context` | POST | Selects only relevant memory/state | +| `/slm/estimate-budget` | POST | Predicts steps, token tier, tool-first viability | +| `/slm/check-escalation` | POST 
| Decides whether LLM planning is needed | + +## Service Boundaries + +```mermaid +flowchart TD + A[Agent Runtime] --> B[Task Intake] + B --> C[SLM Tool Selector] + C --> D{Tool / Reason / Direct} + D --> E[Tool Adapter Layer] + D --> F[Direct Response] + D --> G[LLM Planner] + E --> H[State Store] + G --> H + H --> I[SLM Context Filter] + I --> J[Next Action] +``` + +## Example Responses + +**select-tool:** + +```json +{ + "action_mode": "tool", + "tool": "azure_cli", + "operation_family": "cost_management", + "arguments_hint": { "service": "foundry", "time_window": "last_30_days" }, + "confidence": 0.89 +} +``` + +**estimate-budget:** + +```json +{ + "predicted_steps": 4, + "token_cost_tier": "medium", + "tool_first_recommended": true, + "llm_needed": false, + "confidence": 0.81 +} +``` + +## Contract Shapes + +```typescript +interface SelectToolOutput { + action_mode: "tool" | "reason" | "direct"; + tool: "github" | "azure_cli" | "terraform" | "kusto" | "docs_search"; + operation_family: string; + arguments_hint: Record<string, unknown>; + confidence: number; +} + +interface EstimateBudgetOutput { + predicted_steps: number; + token_cost_tier: "low" | "medium" | "high"; + tool_first_recommended: boolean; + llm_needed: boolean; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| ------------------- | ------- | ------------------ | +| `agent_run_id` | uuid | Unique run ID | +| `selected_tool` | string | Tool selected | +| `action_mode` | string | tool/reason/direct | +| `budget_tier` | string | Cost tier | +| `predicted_steps` | number | Steps predicted | +| `escalated_to_llm` | boolean | LLM invoked | +| `compression_ratio` | number | Context reduced | + +## Fallback Rules + +| Condition | Action | +| ----------------------------- | --------------------------- | +| No tool confidence >= 0.80 | Don't execute automatically | +| Context filter low | Preserve more context | +| Budget low but ambiguity high | Escalate to planner | +| Tool failure |
Classify before retry | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + tool_selection: { direct_execute: 0.85, require_confirm: 0.7 }, + context_filtering: { aggressive: 0.85, conservative: 0.78 }, + escalation_check: { continue_tools: 0.8, llm_planning: 0.65 }, + budget_estimate: { reliable: 0.75, uncertain: 0.6 }, +}; +``` + +| Threshold | Action | +| --------- | -------------------- | +| >= 0.85 | Direct execution | +| 0.70-0.84 | Require confirmation | +| < 0.70 | Decline / clarify | diff --git a/docs/architecture/reference/matrix-codeflow.md b/docs/architecture/reference/matrix-codeflow.md new file mode 100644 index 0000000..90d2fe7 --- /dev/null +++ b/docs/architecture/reference/matrix-codeflow.md @@ -0,0 +1,110 @@ +# CodeFlow Engine SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ---------------------------- | ------ | --------------------------------------------------------------- | +| `/slm/classify-change` | POST | Determines: docs/code/config/infra/security, risk, blast radius | +| `/slm/suggest-pipeline` | POST | Fast path vs full path | +| `/slm/summarize-failure` | POST | Turns CI output into actionable summary | +| `/slm/release-note-fragment` | POST | Generates structured change summary | + +## Service Boundaries + +```mermaid +flowchart TD + A[GitHub Event] --> B[Diff / Metadata Collector] + B --> C[SLM Change Classifier] + C --> D[Pipeline Policy Engine] + D --> E[CI Path Selection] + E --> F[Workflow Execution] + F --> G[SLM Failure Summarizer] + G --> H[PR Comment / Status] +``` + +## Example Responses + +**classify-change:** + +```json +{ + "change_type": "infra", + "risk": "high", + "blast_radius": "shared_environment", + "requires_contract_validation": false, + "requires_security_scan": true, + "recommended_pipeline": "full", + "confidence": 0.91 +} +``` + +**summarize-failure:** + +```json +{ + "failure_type": "test_failure", + "subtype": "integration_environment", + "retryable": 
true, + "summary": "Integration tests failed due to unreachable dependent service.", + "recommended_next_action": "retry once and verify service container health", + "confidence": 0.83 +} +``` + +## Contract Shapes + +```typescript +interface ClassifyChangeOutput { + change_type: "docs" | "code" | "config" | "schema" | "infra" | "security"; + risk: "low" | "medium" | "high" | "critical"; + blast_radius: "local_only" | "shared_environment" | "production"; + requires_security_scan: boolean; + recommended_pipeline: "fast" | "full"; + confidence: number; +} + +interface SummarizeFailureOutput { + failure_type: string; + retryable: boolean; + summary: string; + recommended_next_action: string; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| ------------------------------- | ------ | ------------------- | +| `repo` | string | Repository | +| `pr_number` | number | PR number | +| `change_type` | string | Classified type | +| `risk` | string | Risk level | +| `pipeline_selected` | string | Path chosen | +| `slm_classification_latency_ms` | number | Classification time | +| `workflow_duration_ms` | number | Total duration | + +## Fallback Rules + +| Condition | Action | +| ----------------------------------- | ------------------------------------ | +| Never skip mandatory tests from SLM | Hard policy enforcement | +| High-risk + low confidence | Choose stricter pipeline | +| Classifier unavailable | Default conservative path | +| Failure uncertain | No destructive reruns without policy | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + change_classification: { direct_use: 0.88, manual_review: 0.75 }, + pipeline_suggestion: { direct_path: 0.85, force_full_path: 0.7 }, + failure_summary: { direct_use: 0.8, require_human: 0.65 }, +}; +``` + +| Threshold | Action | +| --------- | ----------------- | +| >= 0.88 | Direct use | +| 0.75-0.87 | Verify with rules | +| < 0.75 | Manual review | diff --git 
a/docs/architecture/reference/matrix-cognitive-mesh.md b/docs/architecture/reference/matrix-cognitive-mesh.md new file mode 100644 index 0000000..5d11f33 --- /dev/null +++ b/docs/architecture/reference/matrix-cognitive-mesh.md @@ -0,0 +1,112 @@ +# Cognitive Mesh SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ------------------------ | ------ | ------------------------------------------- | +| `/slm/decompose-task` | POST | Break complex request into agent tasks | +| `/slm/route-agent` | POST | Route task to appropriate specialist agent | +| `/slm/compress-context` | POST | Compress long context for agent consumption | +| `/slm/validate-response` | POST | Validate agent response coherence | + +## Service Boundaries + +```mermaid +flowchart TD + A[Mesh Entry] --> B[SLM Router] + B --> C{Single or Multi-Agent?} + C -->|Single| D[Specialist Agent] + C -->|Multi| E[SLM Decomposer] + E --> F[Task Graph] + F --> G[Specialist Agents] + D --> H[Evidence Store] + G --> H + H --> I[SLM Compressor] + I --> J[LLM Synthesizer] +``` + +## Example Responses + +**route-agent:** + +```json +{ + "mode": "multi_agent", + "agents": ["infra_agent", "cost_agent", "security_agent"], + "priority": "normal", + "reason_codes": ["azure", "cost", "security_terms"], + "confidence": 0.87 +} +``` + +**decompose-task:** + +```json +{ + "subtasks": [ + { "id": "t1", "agent": "infra_agent", "goal": "inventory deployed Azure resources" }, + { "id": "t2", "agent": "cost_agent", "goal": "identify cost spikes" }, + { "id": "t3", "agent": "security_agent", "goal": "check for unauthorized usage" } + ], + "confidence": 0.82 +} +``` + +## Contract Shapes + +```typescript +interface RouteAgentOutput { + target_agent: string; + mode: "single_agent" | "parallel_agents" | "sequential"; + escalation_required: boolean; + fallback_agent?: string; + confidence: number; +} + +interface DecomposeTaskOutput { + tasks: { + id: string; + description: string; + agent_type: string; + 
dependencies: string[]; + }[]; + estimated_complexity: "low" | "medium" | "high"; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| --------------------- | -------- | ------------------- | +| `mesh_run_id` | uuid | Unique execution ID | +| `route_mode` | string | single/multi agent | +| `selected_agents` | string[] | Agents selected | +| `decomposition_count` | number | Subtasks created | +| `compression_ratio` | number | Tokens reduced | +| `escalated_to_llm` | boolean | LLM used | + +## Fallback Rules + +| Condition | Action | +| -------------------- | ------------------------- | +| Route confidence low | Send to orchestration LLM | +| Decomposition low | Single-agent fallback | +| Compression low | Pass fuller context | +| No agent matches | Default to "research" | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + agent_routing: { direct_route: 0.85, verify_with_rules: 0.7 }, + task_decomposition: { direct_decompose: 0.8, single_agent_fallback: 0.65 }, + context_compression: { aggressive: 0.85, conservative: 0.78 }, +}; +``` + +| Threshold | Action | +| --------- | ----------------- | +| >= 0.85 | Direct routing | +| 0.70-0.84 | Verify with rules | +| < 0.70 | Escalate to LLM | diff --git a/docs/architecture/reference/matrix-gateway.md b/docs/architecture/reference/matrix-gateway.md new file mode 100644 index 0000000..4551887 --- /dev/null +++ b/docs/architecture/reference/matrix-gateway.md @@ -0,0 +1,111 @@ +# AI Gateway SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ------------------------ | ------ | ----------------------------------------------------- | +| `/slm/classify-request` | POST | Infer intent, estimate complexity, detect toolability | +| `/slm/policy-screen` | POST | PII/secrets/prompt injection scan, tenant policy fit | +| `/slm/post-tag-response` | POST | Normalize telemetry tags, classify business category | + +## Service Boundaries + +```mermaid 
+flowchart TD + A[Gateway API] --> B[Policy Engine] + B --> C[SLM Router Service] + C --> D[Model Selection Engine] + D --> E[Provider Adapter] + E --> F[LLM / SLM / Tool] + F --> G[Response Validator] + G --> H[Telemetry + Billing] +``` + +## Example Request/Response + +**Request:** + +```json +{ + "tenant_id": "phoenixvc-prod", + "user_input": "Review this PR and tell me if the API contract changed.", + "context": { + "channel": "web", + "has_files": true, + "history_len": 7 + } +} +``` + +**Response:** + +```json +{ + "intent": "code_review", + "complexity": "medium", + "tool_candidate": true, + "recommended_target": "codeflow-engine", + "recommended_model_tier": "small", + "escalation_required": false, + "confidence": 0.93 +} +``` + +## Contract Shapes + +```typescript +interface ClassifyRequestOutput { + request_id: string; + label: "code_review" | "chat" | "analysis" | "tool_invocation" | "embedding"; + confidence: number; + complexity: "low" | "medium" | "high"; + tool_candidate: boolean; + recommended_tier: "slm" | "small" | "large"; + cacheable: boolean; +} + +interface PolicyScreenOutput { + allowed: boolean; + risk_level: "low" | "medium" | "high" | "critical"; + risk_categories: string[]; + action: "allow" | "rewrite" | "block" | "escalate"; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| ------------------- | ------- | ------------------- | +| `tenant_id` | string | Tenant identifier | +| `slm_latency_ms` | number | SLM processing time | +| `intent` | string | Classified intent | +| `complexity` | string | Complexity level | +| `risk_level` | string | Risk assessment | +| `tool_candidate` | boolean | Tool recommendation | +| `escalated_to_llm` | boolean | Whether escalated | +| `cost_estimate_usd` | number | Estimated cost | + +## Fallback Rules + +| Condition | Action | +| -------------------------------- | ---------------------- | +| `policy-screen.allowed == false` | Block or redact | +| `confidence < 0.70` 
| Escalate to LLM | +| Tool suggested but no mapping | Send to general LLM | +| Tagging fails | Mark telemetry partial | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + intent_classification: { direct_route: 0.9, verify_with_rules: 0.75 }, + policy: { block_immediately: ["critical_secrets"], escalate_to_review: 0.6 }, +}; +``` + +| Threshold | Action | +| --------- | ----------------- | +| >= 0.90 | Direct routing | +| 0.75-0.89 | Verify with rules | +| < 0.75 | Escalate to LLM | diff --git a/docs/architecture/reference/matrix-mystira.md b/docs/architecture/reference/matrix-mystira.md new file mode 100644 index 0000000..ae81461 --- /dev/null +++ b/docs/architecture/reference/matrix-mystira.md @@ -0,0 +1,137 @@ +# Mystira SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ---------------------------- | ------ | ---------------------------------------------------------------- | +| `/slm/classify-session` | POST | Determines: bedtime/educational/adventure/branching/continuation | +| `/slm/check-safety-agefit` | POST | Ensures age appropriateness, tone, blocked content | +| `/slm/check-continuity` | POST | Maintains character consistency, world rules | +| `/slm/shape-image-prompt` | POST | Converts story scene to safe, style-consistent prompt | +| `/slm/compress-story-memory` | POST | Keeps only relevant story state | + +## Service Boundaries + +```mermaid +flowchart TD + A[User / Parent / Educator Input] --> B[Story Session Manager] + B --> C[SLM Session Classifier] + C --> D[Safety + Age Fit] + D --> E{Simple or Creative} + E -->|Simple| F[Template / Guided Story Engine] + E -->|Creative| G[LLM Narrative Engine] + G --> H[SLM Continuity + Reading Level Pass] + F --> H + H --> I[Story Output] + H --> J[Image Prompt Shaper] +``` + +## Example Responses + +**check-safety-agefit:** + +```json +{ + "allowed": true, + "age_band": "8-10", + "tone": "gentle_adventure", + "rewrite_needed": false, + "blocked_categories": 
[], + "confidence": 0.94 +} +``` + +**check-continuity:** + +```json +{ + "consistent": true, + "issues": [], + "retained_story_facts": [ + "main character is Luma", + "forest companion is a silver fox", + "setting is moonlit valley" + ], + "confidence": 0.86 +} +``` + +**shape-image-prompt:** + +```json +{ + "prompt": "A child-safe illustrated moonlit valley scene with Luma and a silver fox, soft wonder, readable composition, no frightening imagery.", + "safety_checked": true, + "style_profile": "mystira_storybook_v1", + "confidence": 0.9 +} +``` + +## Contract Shapes + +```typescript +interface ClassifySessionOutput { + story_type: "bedtime" | "educational" | "adventure" | "branching" | "continuation"; + age_band: string; + is_interactive: boolean; + needs_images: boolean; + curriculum_tags: string[]; + confidence: number; +} + +interface CheckSafetyAgefitOutput { + allowed: boolean; + age_band: string; + tone: string; + rewrite_needed: boolean; + blocked_categories: string[]; + confidence: number; +} + +interface ShapeImagePromptOutput { + prompt: string; + safety_checked: boolean; + style_profile: string; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| ----------------------- | ------- | ---------------- | +| `session_id` | uuid | Session ID | +| `story_mode` | string | Classification | +| `age_band` | string | Target age | +| `safety_action` | string | Action taken | +| `rewrite_applied` | boolean | Rewritten | +| `continuity_check_used` | boolean | Validated | +| `image_prompt_shaped` | boolean | Prompt generated | +| `slm_cost` | number | SLM cost | +| `llm_cost` | number | LLM cost | + +## Fallback Rules + +| Condition | Action | +| ------------------ | ------------------------ | +| Safety uncertainty | Safe rewrite or refuse | +| Continuity low | Pass more history to LLM | +| Image shaping low | Conservative template | +| Age-fit uncertain | Default younger-safe | + +## Configurable Thresholds + +```typescript 
+const DEFAULT_THRESHOLDS = { + session_classification: { direct_use: 0.88, require_review: 0.75 }, + safety_agefit: { direct_allow: 0.92, require_rewrite: 0.8, block: 0.8 }, + continuity: { direct_use: 0.82, pass_to_llm: 0.7 }, + image_prompt: { direct_use: 0.88, conservative: 0.75 }, +}; +``` + +| Threshold | Action | +| --------- | ------------- | +| >= 0.92 | Direct allow | +| 0.80-0.91 | Rewrite/adapt | +| < 0.80 | Block content | diff --git a/docs/architecture/reference/matrix-rooivalk.md b/docs/architecture/reference/matrix-rooivalk.md new file mode 100644 index 0000000..29f20da --- /dev/null +++ b/docs/architecture/reference/matrix-rooivalk.md @@ -0,0 +1,120 @@ +# PhoenixRooivalk SLM Implementation + +## SLM Endpoints + +| Endpoint | Method | Purpose | +| ------------------------------- | ------ | ---------------------------------------------------- | +| `/slm/interpret-event` | POST | Turns fused detection into operator-readable summary | +| `/slm/suggest-sop` | POST | Maps event type to likely SOP references | +| `/slm/condense-mission-log` | POST | Produces incident record | +| `/slm/classify-incident-report` | POST | Creates structured post-event label set | + +## Service Boundaries + +```mermaid +flowchart TD + A[RF / Radar / EO / IR / Rules] --> B[Fusion + Threat Scoring] + B --> C[Hard Decision Layer] + B --> D[SLM Interpretation Layer] + D --> E[Operator Console Summary] + D --> F[SOP Suggestions] + D --> G[Mission Narrative] + C --> H[Manual Review / Control Path] +``` + +## CRITICAL: SLM is for Reporting Only + +``` +┌─────────────────────────────────────────────────────────┐ +│ IMPORTANT - SAFETY BOUNDARY │ +├─────────────────────────────────────────────────────────┤ +│ Hard Decision Layer must NOT depend on free-form SLM │ +│ │ +│ SLM output is for OBSERVATION and REPORTING only: │ +│ • Operator summaries │ +│ • SOP suggestions (non-binding) │ +│ • Mission log condensation │ +│ │ +│ SLM must NEVER be used for: │ +│ • Autonomous threat response │ 
+│ • Access control decisions │ +│ • Resource isolation actions │ +│ • Any kinetic or hard control actions │ +└─────────────────────────────────────────────────────────┘ +``` + +## Example Responses + +**interpret-event:** + +```json +{ + "title": "Low-altitude inbound contact", + "facts": ["sector north-east", "altitude 35m", "consumer quadcopter RF profile"], + "inferences": ["possible perimeter reconnaissance"], + "operator_summary": "Inbound low-altitude contact detected from north-east sector.", + "confidence": 0.77 +} +``` + +**suggest-sop:** + +```json +{ + "recommended_sops": ["SOP-12 Verify EO feed", "SOP-21 Raise perimeter alert state"], + "confidence": 0.74 +} +``` + +## Contract Shapes + +```typescript +interface InterpretEventOutput { + title: string; + facts: string[]; + inferences: string[]; + operator_summary: string; + confidence: number; +} + +interface SuggestSopOutput { + recommended_sops: string[]; + confidence: number; +} +``` + +## Telemetry Fields + +| Field | Type | Description | +| ------------------------- | -------- | ------------------ | +| `incident_id` | uuid | Incident ID | +| `sensor_fusion_version` | string | Fusion version | +| `threat_score` | number | Calculated score | +| `slm_interpretation_used` | boolean | SLM invoked | +| `sop_suggestions` | string[] | SOPs suggested | +| `human_acknowledged` | boolean | Human acknowledged | +| `offline_mode` | boolean | Offline mode | + +## Fallback Rules + +| Condition | Action | +| ----------------------------- | ---------------------------- | +| Interpretation low confidence | Show facts only | +| SOP low confidence | "Manual SOP lookup required" | +| Edge model unavailable | Use non-LLM summaries | +| SOP generated | NEVER pass to control path | + +## Configurable Thresholds + +```typescript +const DEFAULT_THRESHOLDS = { + operator_summary: { direct_use: 0.8, facts_only: 0.65 }, + sop_suggestion: { direct_suggest: 0.78, manual_lookup: 0.65 }, +}; +``` + +| Threshold | Action | +| 
--------- | ---------------------------- | +| >= 0.80 | Full summary with inferences | +| 0.65-0.79 | Facts only, no inferences | +| < 0.65 | Human analysis | diff --git a/docs/architecture/reference/operations-patterns.md b/docs/architecture/reference/operations-patterns.md new file mode 100644 index 0000000..01e9b84 --- /dev/null +++ b/docs/architecture/reference/operations-patterns.md @@ -0,0 +1,152 @@ +# Operations Patterns + +Operational patterns including SLM→LLM decision flows, ownership maps, and implementation guidance. + +--- + +## SLM → LLM Decision Flow + +Production handoff logic for routing between SLM and LLM tiers. + +```mermaid +flowchart TD + A[Incoming task / event / request] --> B[SLM preprocess] + + B --> C[Intent classification] + C --> D[Policy / risk scan] + D --> E[Tool-needed check] + E --> F[Complexity estimate] + F --> G[Confidence score] + + G --> H{Blocked by policy?} + H -->|Yes| X[Refuse / redact / quarantine] + H -->|No| I{Simple and high confidence?} + + I -->|Yes| Y[Return SLM path result] + I -->|No| J{Tool first?} + + J -->|Yes| K[Run tool / workflow] + K --> L[SLM compress + validate tool output] + L --> M{Enough to answer?} + M -->|Yes| Y + M -->|No| N[Escalate] + + J -->|No| N[Escalate] + + N --> O[Prepare compact escalation context] + O --> P[LLM reasoning / synthesis] + P --> Q[Post-LLM policy / quality check] + Q --> R[Return final response] +``` + +### Threshold Guidelines + +Use configurable thresholds, not hardcoded logic. 
+ +| Stay in SLM Path | Escalate to LLM | +| ----------------------------------- | -------------------------- | +| High confidence | Confidence below threshold | +| Classification/extraction/screening | Policy ambiguity exists | +| Short, bounded output | Tool outputs conflict | +| Unambiguous tool result | Multi-agent disagreement | +| Low risk | User-facing, high impact | + +### Decision Schema + +```json +{ + "intent": "ci_failure_triage", + "risk_level": "medium", + "needs_tool": true, + "complexity": "medium", + "confidence": 0.81, + "policy_status": "allow", + "recommended_path": "tool_first", + "escalate": false +} +``` + +--- + +## Repo-to-Service Ownership Map + +Maps conceptual stack into likely repo/service boundaries. + +```mermaid +flowchart LR + R1[pvc-ai-gateway repo] --> S1[AI Gateway Service] + R2[cognitive-mesh repo] --> S2[Cognitive Mesh Service] + R3[codeflow-engine repo] --> S3[CodeFlow Engine Service] + R4[agentkit-forge repo] --> S4[AgentKit Forge Service] + R5[phoenixrooivalk-* repos] --> S5[PhoenixRooivalk Edge + Command Services] + R6[shared-platform / contracts / schemas repo] --> S6[Shared Contracts / Telemetry / Policy / SDKs] + R7[infra repo] --> S7[Azure Infra / Monitoring / Deployment Pipelines] +``` + +### Ownership Summary + +| Repo | Owns | +| ---------------------- | ------------------------------------------------------------------------------------------------------- | +| **pvc-ai-gateway** | Ingress API, routing contracts, escalation policy, provider abstraction, semantic cache, audit envelope | +| **cognitive-mesh** | Specialist routing, task decomposition, agent state model, synthesis orchestration, disagreement logic | +| **codeflow-engine** | PR event models, diff classification, CI log triage, contract break workflows, comment generation | +| **agentkit-forge** | Tool registry, tool selection schemas, arg extraction, execution-loop state, retry/fallback logic | +| **phoenixrooivalk-\*** | Edge event schema, local 
alerting, escalation packet format, command-layer integration | +| **shared-platform** | Telemetry envelope, routing decision schema, model usage schema, audit/trace IDs, reusable schemas | +| **infra** | Azure deployment, Grafana/ADX dashboards, Key Vault wiring, service identities, networking | + +--- + +## Implementation Order + +### First + +Define shared contracts: + +- Routing decision schema +- Model usage event +- Tool execution event +- Audit envelope +- Edge escalation packet + +### Second + +Implement telemetry in the gateway: + +- Trace ID propagation +- Decision logs +- Provider usage events +- Cost estimation fields + +### Third + +Bring CodeFlow and AgentKit onto same telemetry envelope. + +### Fourth + +Add Cognitive Mesh orchestration and disagreement telemetry. + +### Fifth + +Add Rooivalk edge packet telemetry and sync audit. + +--- + +## Architectural Recommendation + +For your environment, the strongest production stance is: + +1. **AI Gateway is the only public AI ingress** +2. **All routing decisions emit one shared RoutingDecision contract** +3. **All model calls emit one shared ModelUsageEvent** +4. **All tool invocations flow through a broker or shared event schema** +5. **All edge escalations use compact evidence packets** +6. 
**ADX/Kusto + Grafana becomes the operational truth layer** + +This gives you: + +- Cost visibility +- Quality visibility +- Compliance evidence +- Easier A/B testing of SLM routing +- Cleaner failure diagnosis diff --git a/docs/architecture/reference/slm-implementation-matrix.md b/docs/architecture/reference/slm-implementation-matrix.md new file mode 100644 index 0000000..eb1fc6e --- /dev/null +++ b/docs/architecture/reference/slm-implementation-matrix.md @@ -0,0 +1,260 @@ +# SLM Implementation Matrix + +This document provides a repo-by-repo implementation matrix showing SLM endpoints, contract shapes, telemetry fields, fallback rules, confidence thresholds, and practical service boundaries across all six platforms. + +## Quick Reference + +| Platform | SLM Role | Key Endpoints | +| --------------- | --------------------------------------- | ----------------------------------------- | +| AI Gateway | routing, policy, cost control | /classify-request, /policy-screen | +| Cognitive Mesh | agent routing, decomposition | /route-agent, /decompose-task | +| CodeFlow Engine | PR triage, failure analysis | /classify-change, /summarize-failure | +| AgentKit Forge | tool selection, context shaping | /select-tool, /estimate-budget | +| PhoenixRooivalk | event interpretation, SOP suggestions | /interpret-event, /suggest-sop | +| Mystira | story safety, continuity, image prompts | /check-safety-agefit, /shape-image-prompt | + +## Documentation Structure + +``` +reference/ +├── slm-implementation-matrix.md # This file +├── matrix-gateway.md # AI Gateway details +├── matrix-cognitive-mesh.md # Cognitive Mesh details +├── matrix-codeflow.md # CodeFlow Engine details +├── matrix-agentkit.md # AgentKit Forge details +├── matrix-rooivalk.md # PhoenixRooivalk details +└── matrix-mystira.md # Mystira details +``` + +--- + +## 1. 
Cross-Stack Operating Model + +Use the same control pattern everywhere: + +```mermaid +flowchart LR + A[Input / Event / Request] --> B[Deterministic Guards] + B --> C[SLM Control Layer] + C --> D{Confidence + Policy} + D -->|high confidence| E[Direct Action / Route / Summarize] + D -->|medium confidence| F[Tool Path / Restricted Flow] + D -->|low confidence| G[LLM Escalation] + G --> H[Post-Validation] + E --> I[Telemetry + Audit] + F --> I + H --> I +``` + +--- + +## 2. Canonical SLM Service Interfaces + +These are the reusable interface families standardized across the stack. + +### A. Classification Contract + +```json +{ + "request_id": "uuid", + "label": "code_review", + "confidence": 0.91, + "secondary_labels": ["security_review"], + "reason_codes": ["contains_diff", "contains_code_terms"], + "recommended_action": "route_security_agent" +} +``` + +### B. Routing Contract + +```json +{ + "request_id": "uuid", + "target": "infra_agent", + "mode": "single_agent", + "escalation_required": false, + "tool_candidate": true, + "cost_tier": "low", + "confidence": 0.88 +} +``` + +### C. Compression Contract + +```json +{ + "request_id": "uuid", + "summary": "User wants Azure cost anomaly investigation for Foundry usage.", + "retained_facts": [ + "resource deleted on 2026-03-05", + "billing visible from 2026-03-03", + "suspected partner local usage" + ], + "dropped_categories": ["small talk", "repeated screenshots"], + "confidence": 0.84 +} +``` + +### D. Safety / Moderation Contract + +```json +{ + "request_id": "uuid", + "allowed": true, + "risk_level": "low", + "risk_categories": [], + "action": "allow", + "confidence": 0.96 +} +``` + +### E. 
Summarization / Operator Brief Contract + +```json +{ + "request_id": "uuid", + "title": "Possible perimeter drone approach", + "summary": "Low-altitude approach detected from north-east sector.", + "facts": ["altitude 35m", "entry vector north-east", "rf profile matched consumer quadcopter"], + "inferences": ["possible surveillance behavior"], + "recommended_next_step": "verify EO feed and initiate SOP-12", + "confidence": 0.79 +} +``` + +--- + +## 3. Cross-Platform Confidence Policy + +A unified confidence policy across all platforms: + +| Confidence | Meaning | Action | +| ---------- | ----------------- | ----------------------------------------- | +| 0.90-1.00 | Strong confidence | Direct automated route/action | +| 0.80-0.89 | Acceptable | Automate with validation | +| 0.70-0.79 | Uncertain | Restricted automation or human/LLM assist | +| < 0.70 | Weak | Escalate or safe fallback | + +```mermaid +flowchart TD + A[SLM Result] --> B{Confidence >= 0.90?} + B -->|Yes| C[Direct Action] + B -->|No| D{Confidence >= 0.80?} + D -->|Yes| E[Validate & Proceed] + D -->|No| F{Confidence >= 0.70?} + F -->|Yes| G[Restricted Automation] + F -->|No| H[Escalate / Fallback] + E --> I[Execute] + G --> I + H --> I +``` + +--- + +## 4. 
Cross-Platform Telemetry Schema + +Use a common event envelope across all repos: + +```json +{ + "event_id": "uuid", + "timestamp_utc": "2026-03-15T10:00:00Z", + "platform": "codeflow-engine", + "component": "slm-change-classifier", + "model": "phi-3-mini", + "operation": "classify-change", + "latency_ms": 42, + "input_tokens": 612, + "output_tokens": 87, + "confidence": 0.91, + "action_taken": "full_pipeline", + "escalated": false, + "cost_estimate_usd": 0.0004, + "trace_id": "trace-123" +} +``` + +### Recommended Common Fields + +| Field | Type | Description | +| ------------------- | ------- | ------------------------ | +| `event_id` | uuid | Unique event identifier | +| `trace_id` | uuid | Distributed trace ID | +| `platform` | string | System name | +| `component` | string | Specific component | +| `operation` | string | Operation performed | +| `model` | string | Model used | +| `model_version` | string | Model version | +| `latency_ms` | number | Processing time | +| `input_tokens` | number | Input token count | +| `output_tokens` | number | Output token count | +| `confidence` | number | Model confidence | +| `action_taken` | string | Action taken | +| `escalated` | boolean | Whether escalated to LLM | +| `fallback_reason` | string | Fallback reason | +| `cost_estimate_usd` | number | Estimated cost | +| `tenant_or_project` | string | Tenant identifier | +| `environment` | string | Environment | + +--- + +## 5. Recommended Model-Role Mapping + +This is a practical role map, not a vendor mandate. 
+ +| Role | Recommended Model Profile | +| ----------------------- | ------------------------------------- | +| Classification | Very small, fast instruct model | +| Routing | Small instruct model with strict JSON | +| Safety Prefilter | Small model + deterministic rules | +| Compression | Small/medium model with schema output | +| Failure Summarization | Small instruct model | +| Creative Storytelling | Larger narrative-capable model | +| Deep Synthesis | Larger reasoning model | +| Edge Operator Summaries | Compact on-device model | + +--- + +## 6. Implementation Order + +### Phase 1: Foundation + +- AI Gateway request classifier +- CodeFlow change classifier +- AgentKit tool selector + +### Phase 2: Expansion + +- Cognitive Mesh router + decomposer +- Mystira safety/continuity layer + +### Phase 3: Maturation + +- PhoenixRooivalk operator interpreter +- Shared telemetry normalization +- Confidence calibration dashboards + +--- + +## 7. Cross-System Summary + +### Confidence Threshold Summary + +| System | High (direct) | Medium (verify) | Low (escalate) | +| --------------- | ------------- | --------------- | -------------- | +| AI Gateway | >= 0.90 | 0.75-0.89 | < 0.75 | +| Cognitive Mesh | >= 0.85 | 0.70-0.84 | < 0.70 | +| CodeFlow | >= 0.88 | 0.75-0.87 | < 0.75 | +| AgentKit Forge | >= 0.85 | 0.70-0.84 | < 0.70 | +| PhoenixRooivalk | >= 0.80 | 0.65-0.79 | < 0.65 | +| Mystira | >= 0.92 | 0.80-0.91 | < 0.80 | + +### Standard Fallback Pattern + +``` +1. SLM timeout → Deterministic rules +2. Low confidence → LLM escalation +3. Safety critical → Block immediately +4. Unknown classification → Safe default +5. 
All failures → Log + alert + human review +``` diff --git a/docs/architecture/reference/slm-management-plan.md b/docs/architecture/reference/slm-management-plan.md new file mode 100644 index 0000000..7c116c5 --- /dev/null +++ b/docs/architecture/reference/slm-management-plan.md @@ -0,0 +1,274 @@ +# SLM Management Plan + +This document outlines the key concerns and management strategy for SLM deployment across all projects. + +## Key Concerns Overview + +| Concern | Priority | Projects Affected | +| -------------------- | -------- | ----------------------- | +| Model Selection | High | All | +| Cost Management | High | All | +| Latency Requirements | High | Gateway, Rooivalk | +| Edge Deployment | High | Rooivalk | +| Security & Privacy | High | Gateway, Cognitive Mesh | +| Reliability | Medium | All | +| Observability | Medium | All | +| Versioning | Medium | All | + +## 1. Model Selection + +### Strategy + +Maintain a tiered model portfolio: + +| Tier | Models | Use Cases | Cost | +| ----------- | -------------------- | ------------------------------ | --------------- | +| Ultra-light | Phi-3 Mini, Gemma 2B | Classification, routing | $0.0001/request | +| Light | Phi-3, Llama 3 8B | Tool selection, log analysis | $0.001/request | +| Medium | Llama 3 70B | Complex routing, decomposition | $0.01/request | +| Heavy | GPT-4 class | Reasoning, synthesis | $0.05+/request | + +### Management + +- **Central model registry** with capability matrix +- **A/B testing framework** for model comparisons +- **Performance benchmarks** per use case category + +## 2. Cost Management + +### Strategy + +Implement cost controls at each layer: + +``` +Cost Control Layers +┌─────────────────────────────────────┐ +│ 1. Budget caps per project │ +├─────────────────────────────────────┤ +│ 2. SLM-first routing (80%+ target) │ +├─────────────────────────────────────┤ +│ 3. Confidence-based escalation │ +├─────────────────────────────────────┤ +│ 4. 
Request caching │ +├─────────────────────────────────────┤ +│ 5. Telemetry & alerting │ +└─────────────────────────────────────┘ +``` + +### Targets + +| Metric | Target | +| -------------------- | ------ | +| SLM routing % | >80% | +| Cost per 1K requests | <$5 | +| LLM escalation rate | <20% | +| Cache hit rate | >30% | + +### Alerts + +- Cost spike >20% day-over-day +- LLM escalation >25% +- Budget utilization >80% + +## 3. Latency Requirements + +### Targets by Project + +| Project | Target P99 | Critical Path | +| --------------- | ---------- | --------------------- | +| AI Gateway | <100ms | routing decision | +| PhoenixRooivalk | <50ms | threat classification | +| CodeFlow | <2s | PR classification | +| Cognitive Mesh | <500ms | agent selection | +| AgentKit Forge | <1s | tool selection | + +### Optimization + +- **Model quantization** for edge (int4) +- **Caching** of frequent decisions +- **Batch processing** for non-critical tasks +- **Connection pooling** to inference endpoints + +## 4. Edge Deployment (PhoenixRooivalk) + +### Critical: SLM is NOT Primary + +> **Never use SLM for safety-critical decisions.** + +SLM is only for: + +- Operator-facing summaries +- Report generation +- Post-mission narratives + +Core detection uses: + +- Rules + signal models + fusion engine + +### Strategy + +| Requirement | Solution | +| ------------------ | ------------------------------- | +| Hardware diversity | Support Jetson, CPU, mobile | +| Offline operation | Full local inference capability | +| Model updates | OTA with rollback | +| Security | No external connectivity | + +### Model Optimization + +```python +# Standard edge optimization pipeline +optimizations = [ + quantization(weights="int4"), + pruning(structured=0.3), + distillation(student=phi3_mini), + compilation(target="cuda|cpu") +] +``` + +## 5. 
Security & Privacy + +### Strategy + +| Layer | Controls | +| ---------- | ----------------------------------------- | +| Input | Prompt injection detection, PII filtering | +| Processing | No data leaves boundary | +| Output | Content filtering, audit logging | +| Access | Role-based model access | + +### SLM Security Checks + +```python +async def security_pipeline(request: Request) -> SecurityResult: + # 1. Prompt injection check + injection = await slm_check_injection(request.prompt) + if injection.detected: + return blocked(injection.reason) + + # 2. PII detection + pii = await slm_check_pii(request.prompt) + if pii.found: + return blocked("PII detected") + + # 3. Policy check + policy = await slm_check_policy(request.prompt) + if policy.violation: + return blocked(policy.violation) + + return allowed() +``` + +## 6. Reliability + +### Strategy + +| Concern | Mitigation | +| ------------------- | ------------------------ | +| Model downtime | Fallback models per tier | +| Latency spikes | Timeout + escalation | +| Quality degradation | Continuous evaluation | +| Hallucinations | Confidence thresholds | + +### Fallback Hierarchy + +``` +Request + │ + ▼ Primary SLM + │ + ├─ Success → Return + │ + ├─ Timeout → Fallback SLM + │ + ├─ Low confidence → LLM verification + │ + └─ Failure → Error with telemetry +``` + +## 7. Observability + +### Metrics Collection + +| Metric Type | Collection | +| -------------- | -------------------------- | +| Request volume | Per model, per project | +| Latency | P50, P95, P99 per endpoint | +| Error rate | By error type, model | +| Cost | Per project, per user | +| Quality | Accuracy, escalation rate | + +### Dashboards + +- **Cost Dashboard**: Spend by project, model, day +- **Performance Dashboard**: Latency by tier +- **Quality Dashboard**: Accuracy, false positives + +## 8. 
Versioning + +### Strategy + +| Component | Versioning | Update Frequency | +| -------------- | ---------------- | ------------------ | +| Models | Semantic (1.0.0) | Monthly evaluation | +| Prompts | Git-based | Per task | +| Infrastructure | Terraform | Per deployment | + +### Model Lifecycle + +``` +Discovery → Testing → Staging → Production → Deprecated → Retired + │ │ │ │ │ + ▼ ▼ ▼ ▼ ▼ + Evaluate A/B test Shadow mode Active Fallback +``` + +## Project-Specific Concerns + +### AI Gateway + +- High-volume routing +- Security-first evaluation +- Real-time cost tracking + +### Cognitive Mesh + +- Agent capability mapping +- Task decomposition accuracy +- Multi-agent coordination + +### PhoenixRooivalk + +- **CRITICAL**: SLM NOT for safety decisions +- Edge hardware diversity +- Offline reliability +- Minimal latency + +### CodeFlow Engine + +- PR classification accuracy +- CI log analysis quality +- Auto-merge reliability + +### AgentKit Forge + +- Tool selection accuracy +- Context compression ratio +- LLM call reduction + +## Canonical Principle + +> **Use SLMs to decide, filter, classify, compress, and prepare.** +> **Use LLMs to reason, reconcile, synthesize, and communicate.** + +## Action Items + +1. [ ] Establish model registry with tiered selection +2. [ ] Implement cost tracking per project +3. [ ] Set up latency monitoring dashboards +4. [ ] Create edge deployment pipeline +5. [ ] Build security check pipeline +6. [ ] Define fallback hierarchies +7. [ ] Implement observability stack +8. [ ] Document model lifecycle process +9. 
[ ] **Add explicit safety boundary for PhoenixRooivalk** diff --git a/docs/architecture/reference/strategic/01-why-slms-matter.md b/docs/architecture/reference/strategic/01-why-slms-matter.md new file mode 100644 index 0000000..653e273 --- /dev/null +++ b/docs/architecture/reference/strategic/01-why-slms-matter.md @@ -0,0 +1,86 @@ +# Why SLMs Matter in These Systems + +This document explains the strategic value of Small Language Models (SLMs) across the ecosystem. + +## Executive Summary + +Across all six platforms, SLMs provide: + +| Benefit | Description | +| -------------------------- | ------------------------------------------- | +| **Cost Control** | Large models are invoked only when required | +| **Latency Reduction** | Routing decisions happen in milliseconds | +| **Edge Deployment** | PhoenixRooivalk can run inference locally | +| **Deterministic Behavior** | SLMs are easier to constrain and audit | + +## Summary Table + +| System | SLM Role | +| --------------- | --------------------------------------- | +| AI Gateway | routing, policy checks, cost prediction | +| Cognitive Mesh | agent routing, task decomposition | +| PhoenixRooivalk | edge telemetry analysis | +| CodeFlow Engine | CI intelligence, log analysis | +| AgentKit Forge | tool selection, context compression | +| Mystira | story safety, continuity, age-fit | + +--- + +## Design Principle + +The best use of SLMs is not "replace the big model." 
It is: + +```mermaid +flowchart LR + S[Screen First] --> R[Route Cheap] + R --> E[Escalate Selectively] + E --> C[Compress Context Aggressively] + C --> L[Keep Edge Decisions Local] +``` + +| Principle | Description | +| ------------------------ | -------------------------------------------------------------- | +| **Screen First** | SLMs handle initial classification before expensive operations | +| **Route Cheap** | Direct simple requests to SLMs or small models | +| **Escalate Selectively** | Only invoke LLMs for complex, ambiguous tasks | +| **Compress Context** | SLMs reduce token volume before LLM processing | +| **Keep Edge Local** | PhoenixRooivalk operates without cloud dependency | + +--- + +## Reference Architecture + +```mermaid +flowchart TD + U[Users / Operators / CI Events / Sensor Feeds] + U --> G[AI Gateway] + G --> G1[SLM: intent classification] + G --> G2[SLM: safety / policy scan] + G --> G3[SLM: cost routing] + G --> G4[Cache / provider selection] + G4 --> CM[Cognitive Mesh] + G4 --> CF[CodeFlow Engine] + G4 --> AF[AgentKit Forge] + G4 --> PR[PhoenixRooivalk] + G4 --> MY[Mystira] + CM --> L1[LLM: deep reasoning] + CF --> L2[LLM: remediation] + AF --> L3[LLM: synthesis] + MY --> L4[LLM: narrative] +``` + +--- + +## Strategic Recommendation + +SLMs should be treated as: + +- **Control-plane intelligence**: Routing, classification, decision-making +- **Cheap operational cognition**: Fast, repetitive tasks +- **First-pass classifiers**: Initial triage before expensive operations +- **Context reducers**: Compressing data for efficient processing +- **Edge interpreters**: Local processing without cloud dependency + +**Not** as replacements for the reasoning tier. + +> **SLMs run the flow. 
LLMs solve the hard parts.** diff --git a/docs/architecture/reference/strategic/02-gateway-slm-use-cases.md b/docs/architecture/reference/strategic/02-gateway-slm-use-cases.md new file mode 100644 index 0000000..1d62542 --- /dev/null +++ b/docs/architecture/reference/strategic/02-gateway-slm-use-cases.md @@ -0,0 +1,90 @@ +# AI Gateway — Practical SLM Use Cases + +AI Gateway is the highest-leverage place to put SLMs because every request passes through it. + +## Best-Fit SLM Tasks + +### A. Intent and Complexity Classification + +The SLM predicts: + +- Request type +- Risk level +- Likely tool need +- Token size estimate +- Recommended model tier + +```json +{ + "intent": "repo_analysis", + "complexity": "medium", + "tool_required": true, + "security_risk": "low", + "recommended_tier": "mid" +} +``` + +### B. Safety and Data-Boundary Screening + +Before a request hits an expensive model: + +- Secret leakage scan +- PII detection +- Jailbreak/prompt-injection detection +- Tenant/policy checks +- Outbound data classification + +### C. 
Budget-Aware Routing + +Use the SLM to decide: + +- Direct answer with small model +- Call tool first +- Escalate to reasoning model +- Deny or redact +- Cache hit / semantic cache reuse + +## Practical Gateway Flow + +```mermaid +flowchart LR + A[Client Request] --> B[Gateway Ingress] + B --> C[SLM Classifier] + C --> D[SLM Policy Scan] + D --> E[Budget / Latency Rules] + E --> F{Decision} + F -->|simple| G[Small Model] + F -->|tool-first| H[Tool Execution] + F -->|complex| I[Large Model] + F -->|blocked| J[Policy Refusal] + H --> K[Post-tool SLM summarizer] + K --> I +``` + +## Why It Fits AI Gateway + +| Benefits | Tradeoffs | +| ----------------------------- | -------------------------------------- | +| Major cost reduction | Misrouting risk if classifier is weak | +| Faster median latency | Extra hop in pipeline | +| Consistent policy enforcement | Need calibration, thresholds, fallback | +| Cleaner observability | | + +## Where It Breaks Down + +- Vague prompts +- Multi-domain prompts +- Hidden tool requirements +- Requests where complexity is not obvious + +## Recommended Pattern + +Use the SLM as a **triage layer, not the final authority**. If confidence is low, escalate automatically. + +### Threshold Guide + +| Confidence | Action | +| ---------- | ----------------- | +| >= 0.90 | Direct routing | +| 0.75-0.89 | Verify with rules | +| < 0.75 | Escalate to LLM | diff --git a/docs/architecture/reference/strategic/03-cognitive-mesh-use-cases.md b/docs/architecture/reference/strategic/03-cognitive-mesh-use-cases.md new file mode 100644 index 0000000..4255e87 --- /dev/null +++ b/docs/architecture/reference/strategic/03-cognitive-mesh-use-cases.md @@ -0,0 +1,95 @@ +# Cognitive Mesh — Practical SLM Use Cases + +Cognitive Mesh is where SLMs become orchestration primitives. + +## Best-Fit SLM Tasks + +### A. 
Specialist Routing + +The SLM decides which node gets the task: + +- infra +- code +- security +- research +- finance +- documentation +- architecture + +### B. Task Decomposition + +Before invoking expensive reasoning, the SLM splits tasks into atomic units. + +Example: "Review this repo and propose a deploy plan" becomes: + +1. Detect stack +2. Detect infra +3. Detect secrets/compliance issues +4. Map CI/CD +5. Draft deploy sequence + +### C. State Summarization + +Multi-agent systems accumulate long histories. An SLM maintains: + +- Current objective +- Known constraints +- Prior decisions +- Unresolved blockers +- Tool outputs summary + +### D. Agent Health and Loop Detection + +The SLM can classify: + +- Repeated retries +- Tool thrashing +- No-progress loops +- Conflicting agent outputs + +## Practical Cognitive Mesh Flow + +```mermaid +flowchart TD + U[User] --> R[SLM Router] + R --> A1[Architect Agent] + R --> A2[Coder Agent] + R --> A3[Security Agent] + R --> A4[Infra Agent] + R --> A5[Research Agent] + A1 --> S[SLM State Manager] + A2 --> S + A3 --> S + A4 --> S + A5 --> S + S --> X{Need deep reasoning?} + X -->|No| O[Assemble Response] + X -->|Yes| L[LLM Synthesis] + L --> O +``` + +## Why It Fits Cognitive Mesh + +| Benefits | Tradeoffs | +| --------------------- | ----------------------------- | +| Cheaper orchestration | Decomposition quality matters | +| Faster routing | Errors compound downstream | +| Smaller context | Summaries can lose nuance | +| Better determinism | | + +## Best Operational Pattern + +| Use SLMs For | Use LLMs For | +| ------------------------ | ----------------------- | +| "Who should do this?" | Final synthesis | +| "What is the next step?" | Architecture evaluation | +| "What matters here?" | Novel reasoning | +| "Are we stuck?" 
| | + +## Threshold Guide + +| Confidence | Action | +| ---------- | ----------------- | +| >= 0.85 | Direct routing | +| 0.70-0.84 | Verify with rules | +| < 0.70 | Escalate to LLM | diff --git a/docs/architecture/reference/strategic/04-codeflow-use-cases.md b/docs/architecture/reference/strategic/04-codeflow-use-cases.md new file mode 100644 index 0000000..f1024fe --- /dev/null +++ b/docs/architecture/reference/strategic/04-codeflow-use-cases.md @@ -0,0 +1,87 @@ +# CodeFlow Engine — Practical SLM Use Cases + +CodeFlow Engine is one of the strongest SLM domains because CI/CD workloads are repetitive, structured, and high-volume. + +## Best-Fit SLM Tasks + +### A. PR Classification + +Classify a PR as: + +- docs-only +- low-risk refactor +- dependency update +- infra change +- security-sensitive +- contract-breaking +- test-only +- release-impacting + +### B. Diff Summarization + +Generate short structured summaries from git diff and changed files. + +### C. CI Failure Triage + +Classify failures into: + +- test regression +- flaky test +- dependency resolution +- auth/secret issue +- infra provisioning error +- timeout/resource exhaustion +- lint/type failure + +### D. Review Routing + +Decide which reviewers or agent flows should be triggered. + +### E. Release-Note Extraction + +Extract user-facing change notes without using a full LLM. 
+ +## Practical CodeFlow Pipeline + +```mermaid +flowchart LR + GP[Git Push / PR] --> IN[Ingest] + IN --> S1[SLM: diff classifier] + IN --> S2[SLM: risk scorer] + S1 --> D{Decision} + S2 --> D + D -->|low-risk| Q[Fast CI] + D -->|high-risk| F[Full CI] + D -->|unclear| L[LLM Review] + F --> T[CI Logs] + Q --> T + T --> C[SLM: triage] + C --> R[Action] +``` + +## Why It Fits CodeFlow Engine + +| Benefits | Tradeoffs | +| -------------------------- | --------------------------- | +| Huge cost savings at scale | False negatives possible | +| Strong consistency | Requires designed schemas | +| Better PR throughput | Model drift affects quality | +| Repetitive workload fit | | + +## Strongest SLM Opportunities + +Given emphasis on contract diffs, OpenAPI breakage, schema validation, CI gates: + +- Change intent detection +- Docs generation hints +- Issue bucketing +- Runbook suggestion +- Log compression before escalation + +## Threshold Guide + +| Confidence | Action | +| ---------- | ----------------- | +| >= 0.88 | Direct use | +| 0.75-0.87 | Verify with rules | +| < 0.75 | Manual review | diff --git a/docs/architecture/reference/strategic/05-agentkit-use-cases.md b/docs/architecture/reference/strategic/05-agentkit-use-cases.md new file mode 100644 index 0000000..ebd5d4d --- /dev/null +++ b/docs/architecture/reference/strategic/05-agentkit-use-cases.md @@ -0,0 +1,85 @@ +# AgentKit Forge — Practical SLM Use Cases + +AgentKit Forge is ideal for SLMs because tool-heavy agents don't need a large model for every micro-decision. + +## Best-Fit SLM Tasks + +### A. Tool Selection + +Choose among: + +- GitHub API +- Azure CLI +- Terraform +- Kusto +- File retrieval +- Documentation lookup +- Shell command +- Search + +### B. Parameter Extraction + +Pull structured arguments out of the request before calling the tool. + +### C. Context Compression + +Convert long tool traces into compact operational state. + +### D. 
Step Validation + +Check whether a step result is sufficient before moving to next step. + +### E. Retry / Fallback Logic + +Classify whether an error merits: + +- Retry +- Alternate tool +- Human intervention +- Escalation to larger model + +## Practical AgentKit Flow + +```mermaid +flowchart TD + T[Agent Task] --> P[SLM Planner] + P --> TS[SLM Tool Selector] + TS --> G1[GitHub] + TS --> G2[Azure] + TS --> G3[Terraform] + TS --> G4[Kusto] + G1 --> M[SLM Compressor] + G2 --> M + G3 --> M + G4 --> M + M --> V{Enough?} + V -->|Yes| A[Response] + V -->|No| L[Escalate LLM] + L --> A +``` + +## Why It Fits AgentKit Forge + +| Benefits | Tradeoffs | +| -------------------- | --------------------------- | +| Lower token burn | Brittle if schemas weak | +| Faster tool loops | Poor extraction = bad calls | +| Improved determinism | Compression can lose detail | +| Cleaner contracts | | + +## Design Rule + +| Let SLMs Own | Let LLMs Own | +| ------------ | -------------------- | +| Selection | Synthesis | +| Extraction | Ambiguity resolution | +| Compression | Multi-tool planning | +| Validation | | + +## Threshold Guide + +| Confidence | Action | +| ---------- | -------------------- | +| >= 0.85 | Direct execution | +| 0.70-0.84 | Require confirmation | +| < 0.70 | Decline / clarify | diff --git a/docs/architecture/reference/strategic/06-rooivalk-use-cases.md b/docs/architecture/reference/strategic/06-rooivalk-use-cases.md new file mode 100644 index 0000000..6bdb6dc --- /dev/null +++ b/docs/architecture/reference/strategic/06-rooivalk-use-cases.md @@ -0,0 +1,76 @@ +# PhoenixRooivalk — Practical SLM Use Cases + +PhoenixRooivalk is different because the core advantage is locality, latency, and resilience—not just cost. + +## Best-Fit SLM Tasks + +### A. Edge Event Labeling + +Convert telemetry into categories: + +- loitering +- fast ingress +- signal loss +- RF anomaly +- perimeter breach candidate +- operator attention required + +### B. 
Operator-Facing Summary + +Turn noisy sensor events into concise, human-readable alerts. + +### C. Log-to-Report Conversion + +Mission logs, detections, and post-event evidence can be summarized locally. + +### D. Escalation Gating + +Only send selected events to cloud when: + +- Confidence above threshold +- Event duration exceeds threshold +- Evidence bundle sufficient +- Bandwidth available + +## Practical Edge Flow + +```mermaid +flowchart LR + S[RF / EO / Radar / Telemetry] --> N[Detection Pipeline] + N --> E[Edge SLM] + E --> L1[Event Label] + E --> L2[Threat Summary] + E --> L3[Alert Text] + E --> L4[Escalation] + L4 -->|local| O[Console] + L4 -->|upstream| C[Cloud] +``` + +## Why It Fits PhoenixRooivalk + +| Benefits | Tradeoffs | +| ------------------------ | ------------------------- | +| Low latency | Limited reasoning depth | +| Offline capability | Edge hardware constraints | +| Bandwidth savings | Must handle noisy inputs | +| Privacy / sovereignty | Needs tight prompt design | +| Constrained hardware fit | | + +## CRITICAL: Important Boundary + +Do NOT let SLM become sole authority for kinetic or high-stakes decisions. + +| Use SLM For | NOT For | +| ---------------- | ---------------------------- | +| Interpretation | Critical threat adjudication | +| Summarization | Response triggering | +| Prioritization | Access control | +| Operator support | Resource isolation | + +## Threshold Guide + +| Confidence | Action | +| ---------- | -------------- | +| >= 0.80 | Full summary | +| 0.65-0.79 | Facts only | +| < 0.65 | Human analysis | diff --git a/docs/architecture/reference/strategic/07-deployment-model.md b/docs/architecture/reference/strategic/07-deployment-model.md new file mode 100644 index 0000000..e4e6f8d --- /dev/null +++ b/docs/architecture/reference/strategic/07-deployment-model.md @@ -0,0 +1,75 @@ +# Practical Deployment Model + +This is the recommended stack for the ecosystem. 
+ +## Full Stack Architecture + +```mermaid +flowchart TD + A[Ingress] --> B[AI Gateway SLM] + B --> C1[Fast-path] + B --> C2[Tool-first] + B --> C3[Escalation] + C2 --> D1[AgentKit] + C2 --> D2[CodeFlow] + C2 --> D3[Cognitive Mesh] + D1 --> E1[SLM tool loops] + D2 --> E2[SLM CI triage] + D3 --> E3[SLM orchestration] + E1 --> F[LLM Pool] + E2 --> F + E3 --> F + G[Rooivalk Edge] --> H[Local SLM] + H --> I[Local / Cloud] + F --> J[Observability] +``` + +## Decision Matrix + +| System | Best SLM Jobs | Less Suitable | +| --------------- | -------------------------- | ------------------------------ | +| AI Gateway | routing, screening, cost | Nuanced synthesis | +| Cognitive Mesh | routing, decomposition | Final judgment | +| CodeFlow | PR triage, log analysis | Root cause across dependencies | +| AgentKit | tool selection, extraction | Multi-step planning | +| PhoenixRooivalk | summaries, alerts | Sole threat authority | +| Mystira | safety, continuity | Rich narrative | + +## Practical Gateway Flow + +```mermaid +flowchart LR + A[Request] --> B[Classifier] + B --> C[Policy Scan] + C --> D[Budget Rules] + D --> E{Decision} + E -->|simple| F[Small Model] + E -->|tool| G[Tools] + E -->|complex| H[LLM] + E -->|blocked| I[Refusal] + G --> J[Post-tool Summarizer] + J --> H +``` + +## End-to-End Example + +A developer opens a PR that changes Terraform, GitHub Actions, and an OpenAPI spec: + +```mermaid +sequenceDiagram + Dev->>GH: Open PR + GH->>CF: Event + CF->>SLM: Classify + risk + SLM-->>CF: infra-change, high risk + CF->>GH: Full CI + contract checks + GH-->>CF: Results + CF->>SLM: Triage logs + SLM-->>CF: Breaking change detected + CF->>AG: Escalate + AG->>LLM: Reasoning + LLM-->>AG: Advice + AG-->>CF: Response + CF-->>GH: Comment +``` + +SLMs handle repetitive triage; LLMs solve the hard part. 
diff --git a/docs/architecture/reference/strategic/08-implementation-order.md b/docs/architecture/reference/strategic/08-implementation-order.md new file mode 100644 index 0000000..fde2524 --- /dev/null +++ b/docs/architecture/reference/strategic/08-implementation-order.md @@ -0,0 +1,56 @@ +# Recommended Implementation Order + +For the stack, the highest ROI sequence is: + +## Phase 1: Gateway Foundation + +- AI Gateway intent classifier +- AI Gateway policy scan +- Route-to-tier decision +- Semantic cache admission + +**Value**: Highest immediate cost-leverage + +## Phase 2: CI Intelligence + +- CodeFlow Engine PR risk classifier +- CodeFlow Engine CI failure bucketing +- CodeFlow Engine release-note summarizer + +**Value**: Fastest operational value + +## Phase 3: Agent Runtime + +- AgentKit Forge tool selector +- AgentKit Forge parameter extractor +- AgentKit Forge context compressor + +**Value**: Lower token burn, faster tool loops + +## Phase 4: Orchestration + +- Cognitive Mesh specialist router +- Cognitive Mesh decomposition engine +- Cognitive Mesh state manager + +**Value**: Strong leverage once taxonomy stabilizes + +## Phase 5: Edge + +- PhoenixRooivalk edge event summarizer +- PhoenixRooivalk operator alert composer +- PhoenixRooivalk escalation filter + +**Value**: Keep isolated from critical control + +## Summary + +| Phase | System | Priority | +| ----- | --------------- | -------- | +| 1 | AI Gateway | Highest | +| 2 | CodeFlow | High | +| 3 | AgentKit Forge | Medium | +| 4 | Cognitive Mesh | Medium | +| 5 | PhoenixRooivalk | Lower | + +That order gives fastest operational value with lowest implementation risk. 
diff --git a/docs/architecture/reference/strategic/README.md b/docs/architecture/reference/strategic/README.md new file mode 100644 index 0000000..0644088 --- /dev/null +++ b/docs/architecture/reference/strategic/README.md @@ -0,0 +1,28 @@ +# Strategic SLM Guidance + +This folder contains strategic guidance on why SLMs matter and how to deploy them across the ecosystem. + +## Documents + +- [01-why-slms-matter.md](01-why-slms-matter.md) - Executive summary and core principles +- [02-gateway-slm-use-cases.md](02-gateway-slm-use-cases.md) - AI Gateway practical use cases +- [03-cognitive-mesh-use-cases.md](03-cognitive-mesh-use-cases.md) - Cognitive Mesh practical use cases +- [04-codeflow-use-cases.md](04-codeflow-use-cases.md) - CodeFlow Engine practical use cases +- [05-agentkit-use-cases.md](05-agentkit-use-cases.md) - AgentKit Forge practical use cases +- [06-rooivalk-use-cases.md](06-rooivalk-use-cases.md) - PhoenixRooivalk practical use cases +- [07-deployment-model.md](07-deployment-model.md) - Practical deployment model and decision matrix +- [08-implementation-order.md](08-implementation-order.md) - Recommended implementation sequence + +## Quick Navigation + +| Phase | System | Document | +| --------------- | --------------- | ---------------------------------------------------------------- | +| Foundation | AI Gateway | [02-gateway-slm-use-cases.md](02-gateway-slm-use-cases.md) | +| CI Intelligence | CodeFlow Engine | [04-codeflow-use-cases.md](04-codeflow-use-cases.md) | +| Agent Runtime | AgentKit Forge | [05-agentkit-use-cases.md](05-agentkit-use-cases.md) | +| Orchestration | Cognitive Mesh | [03-cognitive-mesh-use-cases.md](03-cognitive-mesh-use-cases.md) | +| Edge | PhoenixRooivalk | [06-rooivalk-use-cases.md](06-rooivalk-use-cases.md) | + +## Core Principle + +> **SLMs run the flow. 
LLMs solve the hard parts.** diff --git a/docs/architecture/systems/agentkit-forge.md b/docs/architecture/systems/agentkit-forge.md new file mode 100644 index 0000000..c84a182 --- /dev/null +++ b/docs/architecture/systems/agentkit-forge.md @@ -0,0 +1,192 @@ +# AgentKit Forge + +AgentKit Forge builds AI agents and orchestration workflows. SLMs help when agents have **many tools** and **large working memory**. + +## Architecture + +``` +Agent Task + │ + ▼ +┌─────────────────────────────────────┐ +│ SLM Execution Governor │ +│ (tool selection, memory, budget) │ +└─────────────────────────────────────┘ + │ + ▼ +Tool Selection + │ + ├─→ GitHub API + ├─→ Azure CLI + ├─→ Terraform + ├─→ Documentation Search + └─→ LLM Synthesis +``` + +## Most Practical SLM Jobs + +### 1. Tool Selector + +Map user or system request to the right tool. + +```json +{ + "tool": "azure_cli", + "command": "az monitor metrics list", + "args": { + "resource": "/subscriptions/.../appinsights/...", + "metric": "requests" + }, + "confidence": 0.92 +} +``` + +### 2. Relevance Filter + +Only send necessary state to expensive models. + +```json +{ + "relevant_context": ["terraform_plan", "error_logs"], + "pruned_context": ["old_successful_deploys", "unrelated_metrics"], + "estimated_tokens": 3500 +} +``` + +### 3. Budget Governor + +Estimate likely token spend and whether tool-first is sufficient. + +```json +{ + "estimated_tokens": 8000, + "can_fit_in_window": true, + "should_use_tool_first": true, + "budget_tier": "medium" +} +``` + +### 4. Execution Classifier + +Distinguish how to handle the request. + +```json +{ + "action": "use_tool", + "tool_name": "github_api", + "escalate_to_llm": false, + "reason": "simple data retrieval" +} +``` + +## Implementation + +### Tool Selection + +```python +async def select_tool(task: str, available_tools: list[Tool]) -> ToolInvocation: + prompt = f"""Select the best tool for this task. 
+ 
+
+Task: {task}
+
+Available tools:
+{format_tools(available_tools)}
+
+Output: tool_name, args, confidence"""
+
+    result = await slm_completion(prompt)
+    return ToolInvocation(
+        tool=result.tool,
+        args=result.args,
+        confidence=result.confidence
+    )
+```
+
+### Context Planning
+
+```python
+async def plan_context(task: str, context_options: list[Context]) -> ContextPlan:
+    prompt = f"""Plan which context to use for this task.
+
+Task: {task}
+
+Available context:
+{format_context(context_options)}
+
+Output: required_context, optional_context, estimated_tokens"""
+
+    return await slm_completion(prompt)
+```
+
+### Budget Governor
+
+```python
+async def govern_budget(task: str) -> BudgetDecision:
+    prompt = f"""Estimate token budget for this task.
+
+Task: {task}
+
+Consider: context size, expected output, complexity"""
+
+    estimate = await slm_completion(prompt)
+
+    return BudgetDecision(
+        estimated_tokens=estimate.tokens,
+        can_fit=estimate.can_fit,
+        should_escalate=estimate.should_escalate
+    )
+```
+
+### Multi-Step Execution
+
+```python
+async def execute_agent_task(task: str) -> AgentResult:
+    # Step 1: Decompose
+    plan = await slm_decompose(task)
+
+    # Step 2: Execute each step with tool selection
+    step_results = []
+    for step in plan.steps:
+        tool = await select_tool(step.description, available_tools)
+        result = await execute_tool(tool)
+
+        # Step 3: Check if escalation needed
+        if result.complexity == "high":
+            result = await llm_complete(step, context)
+
+        step_results.append(result)
+
+    # Aggregate the per-step execution results (not the raw plan steps)
+    return aggregate_results(step_results)
+```
+
+## Tradeoffs
+
+| Pros | Cons |
+| ----------------------------------- | ---------------------------------------------- |
+| Keeps agent execution lean | Weak tool selection harms trust |
+| Lowers token burn dramatically | Compressed memory can omit critical edge cases |
+| Improves tool invocation discipline | Too much reliance can make agents look shallow |
+
+## Key Concerns
+
+| Concern | Strategy |
+| ------------- | 
-------------------------------------- | +| Tool accuracy | Validate tool exists before invocation | +| Context bloat | SLM filters context before LLM | +| Cost | Route 70%+ through SLM tool selection | +| Reliability | Fallback to LLM on low confidence | + +## Tool Categories + +| Category | SLM Handles | LLM Handles | +| ------------ | ------------------ | ------------------ | +| CLI commands | selection + args | complex pipelines | +| API calls | endpoint selection | response parsing | +| File ops | path determination | content generation | +| Queries | query construction | result synthesis | + +## Implementation Checklist + +- [ ] Implement tool selection with confidence scores +- [ ] Add relevance filtering for context +- [ ] Implement budget governor with token estimation +- [ ] Add execution classification (direct/tool/LLM) +- [ ] Set up fallback to LLM on low confidence diff --git a/docs/architecture/systems/ai-gateway.md b/docs/architecture/systems/ai-gateway.md new file mode 100644 index 0000000..5a288f0 --- /dev/null +++ b/docs/architecture/systems/ai-gateway.md @@ -0,0 +1,146 @@ +# AI Gateway + +AI Gateway sits between applications and multiple AI providers. The SLM acts as the **admission control and routing brain** — the fast, cheap, deterministic control layer before expensive model invocation. + +## Architecture + +``` +Client Request + │ + ▼ +┌─────────────────────────────────────┐ +│ SLM Control Layer │ +│ (intent, complexity, risk, tools) │ +└─────────────────────────────────────┘ + │ + ▼ +Routing Decision + │ + ├─→ Cache (if cacheable) + ├─→ Tool call + ├─→ SLM response + ├─→ Small model + └─→ Large model escalation +``` + +## SLM as Admission Control + +The SLM sits **before** expensive model invocation and sometimes **after** provider response for tagging/telemetry normalization. 
+ +### Best SLM Use Cases + +| Use Case | Description | Output Schema | +| ------------------------ | ------------------------------ | ---------------------------------- | +| Intent Classification | Determine user intent | `{ "intent": "code_review", ... }` | +| Complexity Scoring | Rate request complexity | `{ "complexity": "medium", ... }` | +| Tool Eligibility | Detect if tool call needed | `{ "tool_candidate": true, ... }` | +| Safety Prefiltering | Prompt injection, PII, secrets | `{ "risk": "low", ... }` | +| Cache Key Enrichment | Generate cache keys | `{ "cacheable": false, ... }` | +| Telemetry Categorization | Tag for observability | `{ "category": "analysis", ... }` | +| Tenant Policy Gating | Per-tenant routing rules | `{ "tier": "premium", ... }` | + +### Why This Works + +These tasks are: + +- **Short-context** — SLM handles easily +- **Repetitive** — High cache hit potential +- **Structured** — Schema-bound outputs +- **Latency-sensitive** — SLM is fast + +### Good SLM Output + +```json +{ + "intent": "code_review", + "complexity": "medium", + "tool_candidate": true, + "risk": "low", + "cacheable": false, + "recommended_tier": "large" +} +``` + +## Implementation + +### Routing Logic + +```python +async def gateway_admission(request: Request) -> AdmissionDecision: + # SLM does admission control + classification = await slm_classify(request.prompt) + + # Route based on classification + if classification.cacheable: + cached = await check_cache(classification.cache_key) + if cached: + return CachedResponse(cached) + + if classification.tool_candidate: + return await route_to_tools(classification) + + if classification.complexity == "low": + return await route_to_slm(classification) + + # Escalate to LLM + return await route_to_llm(classification) +``` + +### Policy Check Pipeline + +```python +async def security_scan(prompt: str) -> SecurityResult: + checks = await asyncio.gather( + slm_check_injection(prompt), + slm_check_pii(prompt), + 
slm_check_secrets(prompt) + ) + + if any(check.flagged for check in checks): + return SecurityResult(blocked=True, reason=checks) + + return SecurityResult(allowed=True) +``` + +## Tradeoffs + +| Pros | Cons | +| ------------------------------- | -------------------------------------------------- | +| Major cost reduction | Misrouting risk if classifier is weak | +| Consistent routing | Small models can under-detect subtle safety issues | +| Lower p95 latency | More moving parts in gateway logic | +| Easier telemetry and governance | | + +## Key Concerns + +| Concern | Strategy | +| -------- | -------------------------------------------- | +| Latency | SLM runs inline; must respond in <50ms | +| Accuracy | Cascade: low confidence → LLM verification | +| Cost | Route 80%+ to SLMs; LLM only for escalation | +| Security | SLM policy check before any model invocation | + +## SLM Model Selection + +Recommended models for gateway classification: + +- Phi-3 Mini (3.8B) - fast, accurate +- Llama 3 8B - good general classification +- Gemma 2B - minimal latency + +## Metrics + +Track per routing decision: + +- SLM vs LLM routing ratio +- Average latency by route type +- Escalation rate (SLM → LLM) +- Cost per 1K requests + +## Implementation Checklist + +- [ ] Add SLM policy envelope returning intent, complexity, risk, cacheability, tier +- [ ] Implement cascade pattern for low confidence → LLM +- [ ] Add security prefiltering (injection, PII, secrets) +- [ ] Set up cost tracking per tier +- [ ] Configure latency alerts diff --git a/docs/architecture/systems/codeflow-engine.md b/docs/architecture/systems/codeflow-engine.md new file mode 100644 index 0000000..30f5dfd --- /dev/null +++ b/docs/architecture/systems/codeflow-engine.md @@ -0,0 +1,165 @@ +# CodeFlow Engine + +CodeFlow Engine is a DevOps and CI/CD intelligence system. **This is one of the most natural SLM fits** — CI/CD emits lots of repetitive semi-structured text where SLMs excel. 
+ +## Architecture + +``` +Git Push / PR Event + │ + ▼ +┌─────────────────────────────────────┐ +│ SLM Triage Layer │ +│ (classification, risk, pipeline) │ +└─────────────────────────────────────┘ + │ + ▼ +CI/CD Decision + │ + ├─→ Auto approve + ├─→ Run tests (full/minimal/skip) + ├─→ Security review + └─→ Escalate to LLM +``` + +## Best SLM Use Cases + +| Use Case | Description | Example Output | +| ------------------------ | ------------------------- | ------------------------------------------------------------- | +| PR Classification | Categorize change type | `{ "type": "api_contract", "risk": "high" }` | +| Test Selection | Choose which tests to run | `{ "run_unit": true, "run_integration": false }` | +| Blast Radius | Estimate change impact | `{ "impacted": ["schemas", "api"], "risk": "medium" }` | +| Changelog Category | Generate release notes | `{ "category": "feature", "component": "gateway" }` | +| Build Log Classification | Diagnose failures | `{ "failure": "dependency_error", "fix": "npm install" }` | +| Flaky Test Grouping | Identify test patterns | `{ "flaky_group": "network_timed_out" }` | +| Issue Routing | Route to component owners | `{ "component": "infrastructure", "owner": "platform-team" }` | + +## Example SLM Outputs + +### PR Classification + +```json +{ + "change_type": "api_contract", + "risk": "high", + "requires_full_ci": true, + "security_review": false, + "impacted_domains": ["schemas", "api"], + "suggested_reviewers": ["platform-team"] +} +``` + +### Failure Diagnosis + +```json +{ + "failure_type": "dependency_resolution", + "retryable": false, + "likely_root_cause": "missing package lock update", + "suggested_action": "regenerate lock file and rerun" +} +``` + +## Why This Works + +CI/CD emits lots of repetitive semi-structured text: + +- Similar commit patterns +- Recurring error types +- Predictable change categories + +SLMs do very well at pattern recognition on this data. 
+
+## Implementation
+
+### PR Classification
+
+```python
+async def classify_pr(pr_diff: str, pr_description: str) -> PRClassification:
+    prompt = f"""Classify this PR:
+
+Diff (first 2000 chars): {pr_diff[:2000]}
+Description: {pr_description}
+
+Output JSON with: type, risk_level, tests_required, reviewers_needed, security_review"""
+
+    result = await slm_completion(prompt)
+    return PRClassification.parse_json(result)
+```
+
+### Test Selection
+
+```python
+async def select_tests(change_type: str, impacted_files: list[str]) -> TestPlan:
+    prompt = f"""Select tests for this change:
+
+Type: {change_type}
+Files: {', '.join(impacted_files)}
+
+Output: {{ "run_unit": bool, "run_integration": bool, "run_e2e": bool, "skip_reason": str|null }}"""
+
+    return await slm_completion(prompt)
+```
+
+### Failure Diagnosis
+
+```python
+async def diagnose_failure(build_log: str) -> Diagnosis:
+    prompt = f"""Diagnose this CI failure:
+
+Log (last 5000 chars):
+{build_log[-5000:]}
+
+Output: failure_type, retryable, likely_root_cause, suggested_action"""
+
+    return await slm_completion(prompt)
+```
+
+### Auto-Rules Mapping
+
+```python
+CLASSIFICATION_ACTIONS = {
+    ("docs", "low"): {"auto_merge": True, "ci_skip": True},
+    ("feat", "low"): {"auto_merge": False, "ci_full": True},
+    ("fix", "medium"): {"auto_merge": False, "ci_full": True, "security_review": True},
+    ("refactor", "low"): {"auto_merge": True, "ci_minimal": True},
+    ("api_contract", "high"): {"auto_merge": False, "ci_full": True, "security_review": True},
+}
+```
+
+## Tradeoffs
+
+| Pros | Cons |
+| ----------------------------------- | ------------------------------------------------- |
+| Cheaper automated repo intelligence | Incorrect risk can under-test changes |
+| Better developer feedback speed | Failure summarization may miss subtle root causes |
+| Fewer wasted full-pipeline runs | Rules should never override hard safety gates |
+
+## Key Concerns
+
+| Concern | Strategy |
+| -------- | 
----------------------------------------------- | +| Speed | SLM must complete in <2s | +| Accuracy | Validate against rules; escalate on uncertainty | +| Cost | Batch processing; SLM only for classification | +| Coverage | Handle all common CI scenarios | + +## Classification Types + +| Change Type | SLM Output | CI Action | +| ------------- | ------------ | ------------------ | +| documentation | risk: low | skip tests | +| bugfix | risk: medium | run tests | +| refactor | risk: low | run tests | +| security | risk: high | full review | +| breaking | risk: high | require approval | +| api_contract | risk: high | full CI + security | + +## Implementation Checklist + +- [ ] Add PR classification with structured output +- [ ] Implement test selection hints +- [ ] Add blast radius estimation +- [ ] Implement failure diagnosis with suggested actions +- [ ] Set up changelog category generation +- [ ] Configure auto-merge rules diff --git a/docs/architecture/systems/cognitive-mesh.md b/docs/architecture/systems/cognitive-mesh.md new file mode 100644 index 0000000..d4f2c96 --- /dev/null +++ b/docs/architecture/systems/cognitive-mesh.md @@ -0,0 +1,175 @@ +# Cognitive Mesh + +Cognitive Mesh architectures orchestrate multiple AI agents and tools. The SLM is the **control fabric** that decides which specialist acts, whether decomposition is needed, what context is necessary, and when to escalate. + +## Architecture + +``` +User Query + │ + ▼ +┌─────────────────────────────────────┐ +│ SLM Control Fabric │ +│ (routing, decomposition, compression)│ +└─────────────────────────────────────┘ + │ + ▼ +Routing Decision + │ + ├─→ Code Agent + ├─→ Infra Agent + ├─→ Security Agent + └─→ Research Agent + │ + ▼ + Specialist Work + │ + ▼ + LLM Synthesis (only when needed) +``` + +## Strong SLM Roles in Cognitive Mesh + +### 1. Router + +Pick which specialist or workflow handles the request. 
+ +```json +{ + "agent": "code_agent", + "confidence": 0.94, + "reasoning": "User is asking about refactoring" +} +``` + +### 2. Task Decomposer + +Break one request into bounded subtasks. + +**Example:** + +User: "Analyze this repo and generate a deployment plan." + +SLM decomposition: + +1. repository structure analysis +2. dependency inventory +3. infrastructure detection +4. deployment strategy generation + +Only the final step requires LLM. + +### 3. Context Compressor + +Reduce token load before LLM synthesis. + +```json +{ + "summary": "User wants Azure cost analysis", + "relevant_files": ["infra/main.tf", "infra/outputs.tf"], + "active_task": "generating cost breakdown", + "pruned_messages": 12 +} +``` + +### 4. Failure Classifier + +Classify failures to determine retry strategy: + +```json +{ + "failure_type": "tool_error", + "retryable": true, + "cause": "transient_network", + "action": "retry_with_backoff" +} +``` + +## Practical Pattern + +A good mesh uses: + +1. **SLM first** — routing, decomposition +2. **Tools/specialists second** — execution +3. 
**LLM only for synthesis** — or when ambiguous + +## Implementation + +### Agent Selection + +```python +async def select_agent(user_request: str) -> Agent: + # SLM determines best agent + classification = await slm_classify_intent(user_request) + + agent_map = { + "code": CodeAgent, + "infrastructure": InfraAgent, + "security": SecurityAgent, + "research": ResearchAgent, + } + + return agent_map[classification.agent] +``` + +### Task Decomposition + +```python +async def decompose_task(request: str) -> TaskPlan: + # SLM breaks down into subtasks + decomposition = await slm_decompose(request) + + return TaskPlan( + subtasks=decomposition.steps, + dependencies=decomposition.dependencies, + llm_required_at_step=decomposition.final_step_only + ) +``` + +### Context Compression + +```python +async def compress_context(messages: list[Message]) -> Compressed: + summary = await slm_summarize(messages) + + return Compressed( + summary=summary.state, + relevant=summary.relevant_messages, + token_estimate=summary.tokens + ) +``` + +## Tradeoffs + +| Pros | Cons | +| ------------------------------- | ----------------------------------------------- | +| Very large token savings | Decomposition quality can bottleneck workflow | +| Better determinism | Brittle routing if taxonomy is poor | +| Easier specialist orchestration | Harder debugging if confidence handling is weak | +| Improved auditability | | + +## Key Concerns + +| Concern | Strategy | +| ------------------ | ----------------------------------------- | +| Routing accuracy | Validate against known agent capabilities | +| Task complexity | SLM estimates; LLM confirms if wrong | +| Agent coordination | SLM manages task queue and dependencies | +| Failure detection | SLM monitors logs; LLM only for recovery | + +## Agent Capabilities Matrix + +| Agent | SLM Handles | LLM Required For | +| -------- | ------------------------------ | ------------------- | +| Code | file operations, git commands | complex refactoring | +| 
Infra | terraform plans, status checks | architecture design | +| Security | vulnerability scanning | threat analysis | +| Research | information retrieval | synthesis | + +## Implementation Checklist + +- [ ] Define agent taxonomy with capabilities +- [ ] Implement SLM router with structured output +- [ ] Add task decomposition with bounded subtasks +- [ ] Implement context compression before LLM +- [ ] Add failure classification for retry logic diff --git a/docs/architecture/systems/mystira.md b/docs/architecture/systems/mystira.md new file mode 100644 index 0000000..17079e7 --- /dev/null +++ b/docs/architecture/systems/mystira.md @@ -0,0 +1,584 @@ +# Mystira + +Mystira is an interactive story generation system for children. The SLM serves as a **content-shaping, moderation, personalization, and cost-control layer** around story generation and interactive experience flows. + +## Architecture Overview + +```mermaid +flowchart TB + subgraph User["User Layer"] + U[Child/Parent Input] + P[Parent Controls] + end + + subgraph SLML["SLM Experience Control Layer"] + C[Story Classifier] + A[Age-Tone Controller] + M[Moderation Filter] + W[World Consistency] + end + + subgraph State["State Management"] + S[Story State] + Pr[Child Profile] + Ch[Character Registry] + end + + subgraph Content["Content Pipeline"] + PC[Prompt Constructor] + LLM[Story LLM] + IMG[Illustration Generator] + end + + subgraph Output["Output Processing"] + SA[Safety Audit] + RL[Reading Level Check] + IP[Image Prompt Shaper] + end + + U --> C + P --> Pr + C --> A + A --> M + M --> W + W --> S + S --> Pr + Pr --> PC + PC --> LLM + LLM --> SA + SA --> RL + RL --> IMG + IMG --> IP + PC --> Ch + Ch --> W +``` + +## Detailed Data Flow + +```mermaid +sequenceDiagram + participant U as User Input + participant SLM as SLM Control Layer + participant SS as Story State + participant LLM as Story LLM + participant IMG as Image Service + participant MOD as Moderation + + U->>SLM: Story request + SLM->>SLM: Classify 
request type + SLM->>SLM: Check age appropriateness + SLM->>SLM: Validate parental controls + SLM->>SS: Update session context + SLM->>SS: Compress memory if needed + + alt Simple continuation + SLM->>SLM: Generate simple continuation + SLM->>MOD: Quick safety check + SLM->>U: Return response + else Full story generation + SLM->>LLM: Prepare enriched prompt + LLM->>SLM: Generated story + SLM->>SLM: Validate continuity + SLM->>MOD: Full moderation check + SLM->>SLM: Adapt reading level + SLM->>IMG: Shape image prompts + IMG->>SLM: Generated illustrations + SLM->>U: Return enriched story + end +``` + +## SLM as Experience Orchestrator + +The SLM sits between: + +1. **User input** — Classification, safety pre-check, parental control validation +2. **Story state / profile state** — Memory compression, continuity tracking +3. **Generation pipeline** — Prompt enrichment, context window management +4. **Illustration / asset prompts** — Style consistency, character adherence +5. **Moderation / age-appropriateness checks** — Multi-layer safety filtering + +```mermaid +flowchart LR + subgraph Input["Input Processing"] + I1[User Request] + I2[Parent Settings] + I3[Session History] + end + + subgraph SLM["SLM Decision Points"] + S1[Request Classification] + S2[Age-Tone Mapping] + S3[Safety Filter] + S4[Memory Compression] + end + + subgraph Decision["Routing Decision"] + D1{Complexity?} + D1 -->|Simple| R1[SLM Direct] + D1 -->|Complex| D2{Age Appropriate?} + D2 -->|Yes| R2[LLM Generation] + D2 -->|No| R3[Safe Rewrite] + end + + subgraph Output["Output Processing"] + O1[Continuity Check] + O2[Image Prompt] + O3[Reading Level] + O4[Final Safety] + end + + I1 --> S1 + I2 --> S2 + I3 --> S4 + S1 --> D1 + S2 --> D1 + S3 --> Decision + Decision --> Output +``` + +## Best SLM Use Cases + +### 1. 
Story Request Classification + +Determine request type: + +```json +{ + "story_type": "bedtime|learning|adventure|interactive|customization|continuation|image", + "age_range": "3-5|5-8|8-10|10-12", + "is_interactive": true, + "continuation": true, + "needs_images": true, + "curriculum_tags": ["kindness", "sharing", "animals"], + "estimated_complexity": "low|medium|high" +} +``` + +### 2. Age and Tone Control + +Enforce cheaply: + +```json +{ + "reading_level": "easy|moderate|advanced", + "sentence_length": "short|medium|long", + "emotional_tone": "calm|exciting|gentle|funny", + "safe_themes": true, + "lesson_alignment": ["kindness", "courage"], + "content_rating": "G|PG", + "prohibited_elements": [] +} +``` + +### 3. Moderation and Safe Rewriting + +Catch or rewrite: + +- Frightening content +- Inappropriate content +- Emotionally unsuitable scenes +- Unsafe user prompts +- Age-inappropriate vocabulary + +```json +{ + "flagged": false, + "rewritten": null, + "content_rating": "safe|caution|blocked", + "age_appropriate": true, + "concerns": [], + "rewrite_suggestions": [] +} +``` + +### 4. Session Memory Compression + +Keep only essential state: + +```json +{ + "session_id": "abc123", + "active_characters": ["Luna", "Bear"], + "current_quest": "find_moon", + "tone_constraints": "gentle_adventure", + "age_band": "3-5", + "plot_anchors": ["discovered_moon_stone", "met_starlight_friend"], + "character_states": { + "Luna": { "mood": "curious", "location": "forest_edge" }, + "Bear": { "mood": "helpful", "location": "forest_edge" } + }, + "reader_preferences": { "likes": ["animals", "stars"], "dislikes": ["scary"] } +} +``` + +### 5. Character and World Consistency + +Validate: + +- Names remain consistent +- World rules not violated +- Prior events respected +- Visual prompts align with canon + +```json +{ + "valid": true, + "inconsistencies": [], + "suggested_corrections": [], + "world_rules_violated": [], + "character_continuity_ok": true +} +``` + +### 6. 
Illustration Prompt Shaping + +Convert story scene to constrained image prompts: + +```json +{ + "prompt": "Luna the fox and Bear walking through moonlit forest, children's book style, soft colors, no scary elements", + "style": "children_book", + "style_params": { + "illustration_type": "watercolor", + "color_palette": "warm", + "lighting": "soft_moonlight" + }, + "character_refs": ["luna", "bear"], + "safety_check": "passed", + "age_appropriate": true, + "brand_compliant": true +} +``` + +## Implementation + +### Story Classification + +```python +async def classify_story_request( + user_input: str, + session: Session, + profile: ChildProfile +) -> StoryClassification: + prompt = f"""Classify this story request: + +User input: {user_input} +Session history: {session.summary} +Child age band: {profile.age_band} +Parent settings: {profile.parent_controls} + +Output as JSON with fields: +- story_type: bedtime|learning|adventure|interactive|customization|continuation|image +- age_range: target age range +- is_interactive: boolean +- needs_images: boolean +- curriculum_tags: array of educational tags +- complexity: low|medium|high""" + + return await slm_completion(prompt, schema=StoryClassification) +``` + +### Age and Tone Control + +```python +async def enforce_age_tone( + content: str, + profile: ChildProfile +) -> ControlledContent: + prompt = f"""Adapt content for age group: + +Content: {content[:1000]} +Age band: {profile.age_band} +Profile preferences: {profile.preferences} +Parent tone settings: {profile.parent_tone_settings} + +Output as JSON: +- adapted_content: rewritten content +- reading_level: easy|moderate|advanced +- safety_flag: boolean +- concerns: array of any issues""" + + return await slm_completion(prompt, schema=ControlledContent) +``` + +### Safe Rewriting + +```python +async def safe_rewrite(content: str, age_band: str) -> RewriteResult: + prompt = f"""Rewrite for safety: + +Content: {content[:2000]} +Age band: {age_band} + +If content is 
safe: return unchanged with "safe" status. +If content needs rewriting: return rewritten version with reason. +If content is unsafe: return blocked with specific reason. + +Output as JSON: +- status: safe|rewritten|blocked +- original: original content +- result: content after rewrite (if applicable) +- reason: explanation""" + + return await slm_completion(prompt, schema=RewriteResult) +``` + +### Memory Compression + +```python +async def compress_session(session: Session) -> CompressedSession: + prompt = f"""Compress session memory for story continuity: + +Current session messages: {session.messages[-20:]} +Active characters: {session.characters} +Current plot state: {session.plot_state} + +Output as JSON: +- summary: 2-3 sentence story summary +- active_characters: array of character names with key traits +- current_quest: current story goal or "none" +- plot_anchors: array of key events that must be remembered +- tone_constraints: current tone settings +- age_band: current age target""" + + return await slm_completion(prompt, schema=CompressedSession) +``` + +### Illustration Prompt Shaping + +```python +async def shape_image_prompt( + scene: StoryScene, + characters: list[Character], + brand_guidelines: BrandGuidelines +) -> ImagePrompt: + prompt = f"""Create child-safe, brand-aligned image prompt: + +Scene: {scene.description} +Characters: {format_characters(characters)} +Story tone: {scene.tone} +Brand style: {brand_guidelines.style} + +Output as JSON: +- prompt: complete image generation prompt +- style: illustration style +- style_params: detailed style parameters +- character_refs: references to character assets +- safety_check: passed|needs_review|failed +- age_appropriate: boolean +- brand_compliant: boolean""" + + return await slm_completion(prompt, schema=ImagePrompt) +``` + +## Implementation Matrix + +### SLM Endpoints + +| Function | Endpoint | Model | Latency Target | +| -------------------- | --------------- | ---------- | -------------- | +| 
Story Classification | `/classify` | Phi-3 Mini | <100ms | +| Age-Tone Control | `/age-tone` | Phi-3 Mini | <100ms | +| Safe Rewrite | `/safewrite` | Llama 3 8B | <200ms | +| Memory Compression | `/compress` | Phi-3 Mini | <100ms | +| Consistency Check | `/validate` | Phi-3 Mini | <100ms | +| Image Prompt | `/image-prompt` | Phi-3 Mini | <100ms | + +### Contract Shapes + +```typescript +interface StoryClassification { + story_type: StoryType; + age_range: AgeRange; + is_interactive: boolean; + continuation: boolean; + needs_images: boolean; + curriculum_tags: string[]; + complexity: Complexity; + confidence: number; +} + +interface ControlledContent { + adapted_content: string; + reading_level: ReadingLevel; + safety_flag: boolean; + concerns: string[]; + confidence: number; +} + +interface CompressedSession { + summary: string; + active_characters: CharacterSummary[]; + current_quest: string | null; + plot_anchors: string[]; + tone_constraints: ToneConstraints; + age_band: AgeBand; +} + +interface ImagePrompt { + prompt: string; + style: IllustrationStyle; + style_params: StyleParams; + character_refs: string[]; + safety_check: SafetyStatus; + age_appropriate: boolean; + brand_compliant: boolean; +} +``` + +### Telemetry Fields + +| Field | Type | Description | +| ------------------- | ------- | ------------------------------ | +| `request_id` | string | Unique request identifier | +| `session_id` | string | Story session identifier | +| `timestamp` | ISO8601 | Request timestamp | +| `slm_model` | string | SLM model used | +| `function` | string | Classification function called | +| `latency_ms` | number | SLM processing time | +| `confidence` | number | Model confidence score | +| `routed_to_llm` | boolean | Whether LLM was invoked | +| `age_band` | string | Target age range | +| `story_type` | string | Classified story type | +| `safety_flagged` | boolean | Content was flagged | +| `content_rewritten` | boolean | Content was rewritten | +| `tokens_used` | number 
| Total tokens consumed | +| `cost_usd` | number | Estimated cost | + +### Fallback Rules + +| Condition | Action | +| --------------------------- | ---------------------------------- | +| SLM confidence < 0.7 | Escalate to LLM for classification | +| SLM timeout | Use deterministic rules fallback | +| Moderation flag = "blocked" | Return safe error to user | +| Age band mismatch | Enforce age-appropriate rewrite | +| Consistency check fails | Notify, allow LLM override | +| Image prompt fails safety | Use default safe prompt | + +### Confidence Thresholds Flowchart + +```mermaid +flowchart TD + A[Classification Result] --> B{Confidence >= 0.9?} + B -->|Yes| C[Use SLM Result] + B -->|No| D{Confidence >= 0.7?} + D -->|Yes| E{LLM Verification} + E -->|Agree| C + E -->|Disagree| F[Use LLM Result] + D -->|No| F + F --> G[Log Discrepancy] +``` + +## Tradeoffs + +| Pros | Cons | +| ---------------------------------------- | ----------------------------------------------------- | +| Lowers cost for interactive sessions | SLMs are weaker for rich narrative creativity | +| Improves safety and consistency | Overuse can make stories feel templated | +| Helps maintain story canon | Compression may lose subtle emotional continuity | +| Enables scalable personalization | Moderation can become too restrictive if tuned poorly | +| Reduces unnecessary LLM for simple steps | Image prompts may lack artistic nuance | + +## Correct Role + +| Use SLM For | Use LLM For | +| --------------- | --------------------------- | +| Preparation | Rich storytelling | +| Guardrails | Emotionally nuanced scenes | +| Continuity | Narrative synthesis | +| Personalization | Creative expansions | +| Prompt shaping | Final polished storytelling | + +## Combined Cross-System Architecture + +```mermaid +flowchart TB + U[Users / Apps / Operators / Dev Events] + + U --> G[AI Gateway] + G --> G1[SLM: intent + safety + routing] + G1 --> G2{Path} + + G2 -->|agentic work| M[Cognitive Mesh] + G2 -->|repo / CI 
work| C[CodeFlow Engine] + G2 -->|tooling / automation| A[AgentKit Forge] + G2 -->|creative storytelling| Y[Mystira] + G2 -->|simple response| S[Small / Mid Model] + G2 -->|deep reasoning| L[Large Model] + + M --> M1[SLM: specialist router] + M1 --> M2[Architecture Agent] + M1 --> M3[Infra Agent] + M1 --> M4[Security Agent] + M1 --> M5[Research / Cost Agent] + M2 --> X[Shared State / Evidence] + M3 --> X + M4 --> X + M5 --> X + X --> M6[SLM: context compressor] + M6 --> L + + C --> C1[SLM: PR / CI classifier] + C1 --> C2[Pipeline policy] + C2 --> C3[Fast path / full path / contract checks] + + A --> A1[SLM: tool selector] + A1 --> A2[GitHub] + A1 --> A3[Azure] + A1 --> A4[Terraform] + A1 --> A5[Kusto / Docs] + + Y --> Y1[SLM: age-fit + moderation + continuity] + Y1 --> Y2[Story Model / Creative LLM] + Y2 --> Y3[SLM: consistency + reading level + image prompt shaping] + + R[PhoenixRooivalk Edge] --> R1[Fusion / Threat Scoring] + R1 --> R2[SLM: operator interpretation] + R2 --> R3[Console / Incident Reports] + + L --> Z[Final synthesis / high-complexity outputs] +``` + +## Platform Comparison + +| Platform | Best SLM Role | Should SLM be Primary? 
| Escalate to LLM When | +| --------------- | ----------------------------------------------- | ------------------------- | ------------------------------------------------ | +| AI Gateway | routing, safety, cost control | **yes** | ambiguity, complex reasoning | +| Cognitive Mesh | agent routing, decomposition, compression | **yes** | cross-agent synthesis needed | +| CodeFlow Engine | PR/CI triage, failure summaries | **yes** | root cause requires deep analysis | +| AgentKit Forge | tool selection, memory shaping | **yes** | planning becomes ambiguous or multi-step | +| PhoenixRooivalk | operator summaries, reports | **no** | strategic analysis or long-form reporting | +| **Mystira** | moderation, age-fit, continuity, prompt shaping | **yes** for control layer | rich storytelling, emotionally nuanced narrative | + +## Key Concerns + +| Concern | Strategy | +| ------------------- | ------------------------------------------------------------ | +| Safety | SLM pre-filter + LLM post-filter + deterministic rules | +| Age-appropriateness | Hard rules for age bands + SLM adaptation | +| Story continuity | SLM validates consistency with plot anchors | +| Cost | Route simple steps through SLM; LLM only for rich generation | +| Creativity | Reserve LLM for emotionally nuanced storytelling | +| Parental controls | Deterministic rules + SLM suggestion refinement | +| Brand consistency | SLM enforces brand guidelines in image prompts | + +## Canonical Principle for Mystira + +> **Use SLMs to make stories safe, consistent, and affordable.** +> **Use LLMs to make them magical.** + +## Implementation Checklist + +- [ ] Add story request classification endpoint +- [ ] Implement age and tone control pipeline +- [ ] Add moderation and safe rewriting +- [ ] Implement session memory compression +- [ ] Add character/world consistency validation +- [ ] Implement illustration prompt shaping +- [ ] Set up cost tracking per session type +- [ ] Configure confidence threshold cascades +- [ 
] Add parental controls integration +- [ ] Implement brand guidelines enforcement +- [ ] Add telemetry and observability +- [ ] Set up fallback deterministic rules diff --git a/docs/architecture/systems/phoenix-rooivalk.md b/docs/architecture/systems/phoenix-rooivalk.md new file mode 100644 index 0000000..dedaf96 --- /dev/null +++ b/docs/architecture/systems/phoenix-rooivalk.md @@ -0,0 +1,180 @@ +# PhoenixRooivalk + +PhoenixRooivalk is an edge AI counter-UAS (Unmanned Aerial System) system. **SLM must NOT be the primary kinetic or safety-critical decision-maker** — it sits in interpretation and operator-support layer only. + +## Architecture + +``` +Sensors + │ + ▼ +┌─────────────────────────────────────┐ +│ Rules + Signal Models + Fusion │ +│ (core detection - NOT SLM) │ +└─────────────────────────────────────┘ + │ + ▼ +Threat Detection + │ + ▼ +┌─────────────────────────────────────┐ +│ SLM Interpretation Layer │ +│ (summaries, reports, narratives) │ +└─────────────────────────────────────┘ + │ + ▼ +Operator Console +``` + +## Critical Principle + +> Use **rules + signal models + fusion engine** for core detection. +> Use **SLM only** for human-readable interpretation and workflow assistance. 
+ +**Never use SLM for:** + +- Primary safety-critical actuation +- Final kinetic authorization +- Real-time hard control loops +- Deterministic low-level signal classification (use classical/ML models) + +## Good SLM Use Cases + +| Use Case | Description | Output | +| ---------------------- | --------------------------- | ---------------------------------------- | +| Alert Summaries | Format alerts for operators | "Drone approaching from NW at 35m" | +| Event Clustering | Group similar events | `{ "cluster": "loitering", "count": 3 }` | +| Post-Mission Narrative | Generate mission reports | Full structured report | +| SOP Lookup | Suggest procedures | `{ "sop": "perimeter breach" }` | +| Incident Drafting | Draft incident reports | Human-readable report | +| Telemetry Translation | Convert raw to text | "RF signature consistent with..." | + +## Example SLM Outputs + +### Alert Summary + +```json +{ + "summary": "Drone detected approaching perimeter at 35m altitude", + "classification": "suspicious", + "confidence": 0.74, + "relevant_sensors": ["radar", "rf"], + "operator_action": "monitor" +} +``` + +### Post-Mission Narrative + +``` +Mission Summary: +- Duration: 45 minutes +- Events detected: 3 +- Threats: 1 (non-critical) +- Actions taken: Monitor mode + +Key Event: +14:32 - Drone detected approaching perimeter from NW +Classification: Consumer quadcopter (RF signature match) +Resolution: Left area at 14:38 +``` + +## Implementation + +### Edge Processing Pipeline + +```python +class EdgeProcessor: + def __init__(self): + self.slm = load_local_slm() # Gemma or Phi-3 + + async def process_telemetry(self, raw_stream: bytes) -> ProcessedEvent: + # Core detection is NOT SLM - rules + signal models + detection = self.fusion_engine.process(raw_stream) + + if detection.threat_level > THRESHOLD: + # SLM only for human interpretation + summary = await self.slm.summarize(detection) + + return ProcessedEvent( + detection=detection, + summary=summary, # SLM output + 
timestamp=datetime.utcnow() + ) +``` + +### Alert Formatting + +```python +async def format_alert(detection: Detection) -> OperatorAlert: + prompt = f"""Format this detection for operator: + +Radar: {detection.radar_summary} +RF: {detection.rf_signature} +Flight: {detection.flight_pattern} + +Output: summary, classification, recommended_action""" + + return await slm_completion(prompt) +``` + +### Report Generation + +```python +async def generate_mission_report(events: list[Event]) -> MissionReport: + prompt = f"""Generate post-mission report: + +Events: {format_events(events)} +Duration: {mission.duration} + +Output: structured report with key findings""" + + return await slm_completion(prompt) +``` + +## Tradeoffs + +| Pros | Cons | +| ----------------------------- | ------------------------------------------------------------ | +| Better operator comprehension | Hallucinated interpretations dangerous if presented as facts | +| Faster report generation | Must clearly separate inferred from sensor facts | +| Reduced cognitive load | Offline edge deployment constraints | + +## Key Concerns + +| Concern | Strategy | +| ------------------------- | -------------------------------------------- | +| Safety-critical decisions | Never use SLM for actuation | +| Hallucination | Clearly label SLM output as "interpretation" | +| Edge constraints | Optimize SLM for edge (quantization) | +| Offline operation | Full local inference capability | + +## Hardware Options + +| Device | SLM Capability | Notes | +| ----------- | ----------------- | ----------------------- | +| Jetson Nano | Phi-3 Mini (int4) | ~5ms inference | +| Jetson Orin | Phi-3 Mini (fp16) | Real-time processing | +| Edge CPU | Gemma 2B | Offline fallback | +| Mobile SoC | Phi-3 Mini (int4) | Phone/tablet deployment | + +## Model Optimization + +```python +# Quantize for edge deployment +from optimum.quanto import quantize + +model = quantize( + original_model, + weights=quantization_type.q4, + 
activations=quantization_type.q8 +) +``` + +## Implementation Checklist + +- [ ] Separate SLM from core detection pipeline +- [ ] Implement alert summarization for operators +- [ ] Add post-mission narrative generation +- [ ] Clearly label SLM output vs sensor facts +- [ ] Optimize for edge deployment +- [ ] Test offline operation diff --git a/docs/guides/README.md b/docs/guides/README.md new file mode 100644 index 0000000..a6ea1d0 --- /dev/null +++ b/docs/guides/README.md @@ -0,0 +1,25 @@ +# Guides + +Implementation guides for various topics. + +## SLM Implementation + +- [README](README.md) - Practical SLM implementation patterns, when to use SLM vs LLM + +## Architecture Reference + +See [docs/architecture](../architecture/) for detailed system documentation: + +- AI Gateway — SLM as admission control +- Cognitive Mesh — Agent orchestration +- PhoenixRooivalk — Edge AI (reports only) +- CodeFlow Engine — CI/CD intelligence +- AgentKit Forge — Agent building + +## Coming Soon + +- AI Gateway deployment guide +- Cognitive Mesh setup guide +- Edge deployment guide (PhoenixRooivalk) +- CodeFlow Engine integration +- AgentKit Forge quickstart diff --git a/docs/planning/request_to_token_attribution.md b/docs/planning/request_to_token_attribution.md index 34a282c..d5790c1 100644 --- a/docs/planning/request_to_token_attribution.md +++ b/docs/planning/request_to_token_attribution.md @@ -65,11 +65,11 @@ LiteLLM's OTEL callback automatically emits spans with: ### Phase 2: Correlation ID Propagation -**Status: In Progress** +**Status: ✅ Done** Correlation IDs flow through the system in two ways: -**Method A: Via Request Metadata (Recommended)** +**Method A: Via Request Metadata (Implemented)** Pass correlation IDs in the request body `metadata` field: ```json @@ -122,7 +122,7 @@ Start with downstream aggregation in pvc-costops-analytics - the cheapest and fa ### 1. cognitive-mesh (Upstream Caller) -**Required:** Must pass correlation headers when calling gateway. 
There are two methods: +**Required:** Pass correlation metadata in request body when calling gateway. There are two methods: **Method A: Via Request Metadata (Recommended)** Pass correlation IDs in the request body `metadata` field: @@ -207,18 +207,18 @@ _Note: Method B requires additional LiteLLM configuration or middleware._ ## Acceptance Criteria -| Criterion | Status | Notes | -| -------------------------------------------- | ---------- | ----------------------------------------- | -| 100% of LLM calls emit token telemetry | ✅ Done | Via OTEL callback | -| 100% include workflow + stage | ⚠️ Partial | Requires upstream to pass metadata | -| Support KQL joins by operation_Id/request_id | ✅ Done | OTEL spans include metadata | -| Request-completion rollup totals | 🔜 Future | Requires Phase 3 (downstream aggregation) | +| Criterion | Status | Notes | +| -------------------------------------------- | --------- | ------------------------------------------------------- | +| 100% of LLM calls emit token telemetry | ✅ Done | Via OTEL callback | +| 100% include workflow + stage | 🔜 Ready | Requires cognitive-mesh to pass metadata to gateway | +| Support KQL joins by operation_Id/request_id | 🔜 Ready | Requires pvc-costops-analytics to implement KQL queries | +| Request-completion rollup totals | 🔜 Future | Requires Phase 3 (downstream aggregation) | ## Dependencies -- cognitive-mesh: Must pass correlation headers to gateway +- cognitive-mesh: Pass correlation metadata in request body - pvc-costops-analytics: Must create KQL queries for new event shape -- infra: May need custom LiteLLM container image or OTEL collector +- infra: Application Insights resource + APPLICATIONINSIGHTS_CONNECTION_STRING wiring added; trace export requires custom LiteLLM image (with azure-monitor-opentelemetry) or explicit OTEL_EXPORTER_OTLP_ENDPOINT configuration (currently empty by default) ## Action Items @@ -226,12 +226,13 @@ _Note: Method B requires additional LiteLLM configuration or 
middleware._ 1. ✅ ai-gateway: Add OTEL callback for token telemetry (Phase 1) 2. ✅ ai-gateway: Document correlation ID requirements (Phase 2) +3. ✅ ai-gateway: Add Application Insights connection string wiring (Phase 1b - trace export requires custom image or OTLP collector) ### Pending -3. cognitive-mesh: Pass correlation IDs in request metadata -4. pvc-costops-analytics: Create KQL queries for OTEL span joins -5. pvc-costops-analytics: Implement request rollup aggregation (Phase 3) +4. cognitive-mesh: Pass correlation metadata in request body +5. pvc-costops-analytics: Create KQL queries for OTEL span joins +6. pvc-costops-analytics: Implement request rollup aggregation (Phase 3) --- diff --git a/infra/env/dev/variables.tf b/infra/env/dev/variables.tf index cf18df1..92d7c13 100644 --- a/infra/env/dev/variables.tf +++ b/infra/env/dev/variables.tf @@ -1,8 +1,8 @@ variable "env" { type = string validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." } } variable "location" { type = string } diff --git a/infra/env/prod/variables.tf b/infra/env/prod/variables.tf index 1a9a003..efcd1be 100644 --- a/infra/env/prod/variables.tf +++ b/infra/env/prod/variables.tf @@ -1,8 +1,8 @@ variable "env" { type = string validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." 
} } variable "location" { type = string } diff --git a/infra/env/uat/.terraform.lock.hcl b/infra/env/staging/.terraform.lock.hcl similarity index 100% rename from infra/env/uat/.terraform.lock.hcl rename to infra/env/staging/.terraform.lock.hcl diff --git a/infra/env/uat/main.tf b/infra/env/staging/main.tf similarity index 100% rename from infra/env/uat/main.tf rename to infra/env/staging/main.tf diff --git a/infra/env/uat/terraform.tfvars b/infra/env/staging/terraform.tfvars similarity index 89% rename from infra/env/uat/terraform.tfvars rename to infra/env/staging/terraform.tfvars index 566fd2e..2f80c99 100644 --- a/infra/env/uat/terraform.tfvars +++ b/infra/env/staging/terraform.tfvars @@ -1,4 +1,4 @@ -env = "uat" +env = "staging" projname = "aigateway" location = "southafricanorth" location_short = "san" @@ -7,7 +7,7 @@ location_short = "san" # NOTE: The TF_VAR_azure_openai_endpoint environment variable (set via the # GitHub Environment secret AZURE_OPENAI_ENDPOINT) takes precedence over this # value during CI/CD runs. For local development, either set that env var or -# update this file with the correct UAT endpoint. +# update this file with the correct staging endpoint. azure_openai_endpoint = "https://mys-shared-ai-san.cognitiveservices.azure.com" codex_model = "gpt-5.3-codex" @@ -22,7 +22,7 @@ secrets_expiration_date = "2027-03-31T00:00:00Z" tags = { owner = "ai-gateway-team" project = "aigateway" - env = "uat" + env = "staging" } enable_redis_cache = true diff --git a/infra/env/uat/variables.tf b/infra/env/staging/variables.tf similarity index 97% rename from infra/env/uat/variables.tf rename to infra/env/staging/variables.tf index ff4b0e5..2bab6ad 100644 --- a/infra/env/uat/variables.tf +++ b/infra/env/staging/variables.tf @@ -1,8 +1,8 @@ variable "env" { type = string validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." 
+ condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." } } variable "location" { type = string } diff --git a/infra/modules/aigateway_aca/main.tf b/infra/modules/aigateway_aca/main.tf index 412723c..9216a67 100644 --- a/infra/modules/aigateway_aca/main.tf +++ b/infra/modules/aigateway_aca/main.tf @@ -128,6 +128,15 @@ resource "azurerm_log_analytics_workspace" "law" { tags = local.tags } +resource "azurerm_application_insights" "ai" { + name = "${local.prefix}-ai-${var.location_short}" + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + application_type = "web" + retention_in_days = var.env == "prod" ? 90 : 30 + tags = local.tags +} + resource "azurerm_container_app_environment" "cae" { name = local.cae_name location = azurerm_resource_group.rg.location @@ -245,6 +254,15 @@ resource "azurerm_key_vault_secret" "langfuse_secret_key" { depends_on = [azurerm_key_vault_access_policy.terraform_client] } +resource "azurerm_key_vault_secret" "appinsights_connection_string" { + name = "appinsights-connection-string" + value = azurerm_application_insights.ai.connection_string + key_vault_id = azurerm_key_vault.kv.id + expiration_date = var.secrets_expiration_date + + depends_on = [azurerm_key_vault_access_policy.terraform_client] +} + resource "azurerm_user_assigned_identity" "ca" { name = "${local.ca_name}-id" resource_group_name = azurerm_resource_group.rg.name @@ -330,6 +348,12 @@ resource "azurerm_container_app" "ca" { } } + secret { + name = "appinsights-connection-string" + key_vault_secret_id = azurerm_key_vault_secret.appinsights_connection_string.versionless_id + identity = azurerm_user_assigned_identity.ca.id + } + template { min_replicas = var.min_replicas max_replicas = var.max_replicas @@ -429,6 +453,13 @@ resource "azurerm_container_app" "ca" { } } + # Azure Application Insights connection string (for azure-monitor-opentelemetry exporter) 
+ # Use with custom LiteLLM image that includes azure-monitor-opentelemetry package + env { + name = "APPLICATIONINSIGHTS_CONNECTION_STRING" + secret_name = "appinsights-connection-string" + } + # LiteLLM commonly listens on 4000; set port as needed } } diff --git a/infra/modules/aigateway_aca/outputs.tf b/infra/modules/aigateway_aca/outputs.tf index f00e81a..a8dfe6b 100644 --- a/infra/modules/aigateway_aca/outputs.tf +++ b/infra/modules/aigateway_aca/outputs.tf @@ -29,3 +29,9 @@ output "container_app_environment_id" { description = "ID of the Container App Environment — used by sibling modules (e.g. dashboard_aca) to deploy into the same environment." value = azurerm_container_app_environment.cae.id } + +output "application_insights_connection_string" { + value = azurerm_application_insights.ai.connection_string + description = "Application Insights connection string for OTEL export." + sensitive = true +} diff --git a/infra/modules/aigateway_aca/variables.tf b/infra/modules/aigateway_aca/variables.tf index 377b408..7fce498 100644 --- a/infra/modules/aigateway_aca/variables.tf +++ b/infra/modules/aigateway_aca/variables.tf @@ -3,10 +3,10 @@ variable "env" { type = string - description = "Environment name (dev|uat|prod)" + description = "Environment name (dev|staging|prod)" validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." 
} } diff --git a/infra/modules/dashboard_aca/variables.tf b/infra/modules/dashboard_aca/variables.tf index 6aa0eea..7fbb116 100644 --- a/infra/modules/dashboard_aca/variables.tf +++ b/infra/modules/dashboard_aca/variables.tf @@ -1,9 +1,9 @@ variable "env" { type = string - description = "Environment name (dev|uat|prod)" + description = "Environment name (dev|staging|prod)" validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." } } diff --git a/infra/modules/state_service_aca/variables.tf b/infra/modules/state_service_aca/variables.tf index 51febf4..d11da7d 100644 --- a/infra/modules/state_service_aca/variables.tf +++ b/infra/modules/state_service_aca/variables.tf @@ -1,9 +1,9 @@ variable "env" { type = string - description = "Environment name (dev|uat|prod)" + description = "Environment name (dev|staging|prod)" validation { - condition = contains(["dev", "uat", "prod"], var.env) - error_message = "Environment must be one of: dev, uat, prod." + condition = contains(["dev", "staging", "prod"], var.env) + error_message = "Environment must be one of: dev, staging, prod." } } diff --git a/infra/scripts/terraform-init.ps1 b/infra/scripts/terraform-init.ps1 index 6b3dee5..9ba1588 100644 --- a/infra/scripts/terraform-init.ps1 +++ b/infra/scripts/terraform-init.ps1 @@ -1,7 +1,7 @@ # Load .env.local and run terraform init -upgrade -# Usage: .\infra\scripts\terraform-init.ps1 [dev|uat|prod] +# Usage: .\infra\scripts\terraform-init.ps1 [dev|staging|prod] -param([Parameter(Mandatory=$true)][ValidateSet("dev","uat","prod")][string]$Env) +param([Parameter(Mandatory=$true)][ValidateSet("dev","staging","prod")][string]$Env) $envFile = Join-Path $PSScriptRoot ".." 
".env.local" if (-not (Test-Path $envFile)) { diff --git a/infra/scripts/terraform-init.sh b/infra/scripts/terraform-init.sh index 52f3300..c7f5798 100644 --- a/infra/scripts/terraform-init.sh +++ b/infra/scripts/terraform-init.sh @@ -1,15 +1,15 @@ #!/bin/bash # Load .env.local and run terraform init -upgrade -# Usage: ./infra/scripts/terraform-init.sh [dev|uat|prod] +# Usage: ./infra/scripts/terraform-init.sh [dev|staging|prod] set -e -ENV="${1:?Usage: $0 dev|uat|prod}" +ENV="${1:?Usage: $0 dev|staging|prod}" case "$ENV" in - dev|uat|prod) ;; + dev|staging|prod) ;; *) - echo "Usage: $0 dev|uat|prod" - echo "Error: ENV must be dev, uat, or prod; got: $ENV" + echo "Usage: $0 dev|staging|prod" + echo "Error: ENV must be dev, staging, or prod; got: $ENV" exit 1 ;; esac diff --git a/scripts/add-federated-credentials.sh b/scripts/add-federated-credentials.sh index f7740bc..9969b0c 100644 --- a/scripts/add-federated-credentials.sh +++ b/scripts/add-federated-credentials.sh @@ -3,7 +3,7 @@ set -e # Add Federated Credentials for GitHub Actions Environments # Use this script if you already ran bootstrap and got AADSTS700213 because -# the workflow uses environment: dev/uat/prod but Azure only had branch-based credentials. +# the workflow uses environment: dev/staging/prod but Azure only had branch-based credentials. # # Usage: $0 # Example: $0 abc123-def456 phoenixvc ai-gateway @@ -11,7 +11,7 @@ set -e if [ "$#" -ne 3 ]; then echo "Usage: $0 " echo "" - echo "Adds federated identity credentials for dev, uat, prod environments" + echo "Adds federated identity credentials for dev, staging, prod environments" echo "to an existing Azure AD app registration (fixes AADSTS700213)." echo "" echo "Example: $0 \$(az ad app list --display-name pvc-shared-github-actions-oidc --query [0].appId -o tsv) phoenixvc ai-gateway" @@ -30,8 +30,8 @@ fi command -v jq >/dev/null 2>&1 || { echo "Error: jq is required for safe JSON construction. 
Install jq and retry."; exit 1; } -echo "Ensuring federated credentials for environments (dev, uat, prod) on app $APP_ID..." -for ENV in dev uat prod; do +echo "Ensuring federated credentials for environments (dev, staging, prod) on app $APP_ID..." +for ENV in dev staging prod; do SUBJECT="repo:$GITHUB_ORG/$GITHUB_REPO:environment:$ENV" EXISTING_SUBJECT=$(az ad app federated-credential list --id "$OBJECT_ID" --query "[?name=='github-actions-$ENV'].subject" -o tsv 2>/dev/null | head -n1) if [ -n "$EXISTING_SUBJECT" ] && [ "$EXISTING_SUBJECT" = "$SUBJECT" ]; then diff --git a/scripts/bootstrap.ps1 b/scripts/bootstrap.ps1 index 19a8e44..0583b05 100644 --- a/scripts/bootstrap.ps1 +++ b/scripts/bootstrap.ps1 @@ -120,8 +120,8 @@ $bytes = New-Object byte[] 32 [System.Security.Cryptography.RandomNumberGenerator]::Create().GetBytes($bytes) $AIGATEWAY_KEY = [Convert]::ToBase64String($bytes) -Write-Host "Ensuring Federated Credentials for GitHub Actions (environments: dev, uat, prod)..." -foreach ($EnvName in @("dev","uat","prod")) { +Write-Host "Ensuring Federated Credentials for GitHub Actions (environments: dev, staging, prod)..." 
+foreach ($EnvName in @("dev","staging","prod")) { $SUBJECT = "repo:" + $GITHUB_ORG + "/" + $GITHUB_REPO + ":environment:" + $EnvName $EXISTING_SUBJECT = az ad app federated-credential list --id $OBJECT_ID --query "[?name=='github-actions-$EnvName'].subject" -o tsv 2>$null | Select-Object -First 1 if ($EXISTING_SUBJECT -and ($EXISTING_SUBJECT -eq $SUBJECT)) { diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index aa093f0..18417dc 100644 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -13,7 +13,7 @@ GITHUB_REPO="$2" SCOPE="$3" # --- Configuration --- -# Shared infra: OIDC app and TF state span dev/uat/prod +# Shared infra: OIDC app and TF state span dev/staging/prod LOCATION="southafricanorth" RG_NAME="pvc-shared-tfstate-rg-san" CONTAINER_NAME="tfstate" @@ -119,9 +119,9 @@ OBJECT_ID=$(az ad app show --id "$APP_ID" --query id --output tsv) AIGATEWAY_KEY=$(openssl rand -base64 32 2>/dev/null || head -c 32 /dev/urandom | base64) -echo "Ensuring Federated Credentials for GitHub Actions (environments: dev, uat, prod)..." +echo "Ensuring Federated Credentials for GitHub Actions (environments: dev, staging, prod)..." command -v jq >/dev/null 2>&1 || { echo "Error: jq is required for safe JSON construction. Install jq and retry."; exit 1; } -for ENV in dev uat prod; do +for ENV in dev staging prod; do SUBJECT="repo:$GITHUB_ORG/$GITHUB_REPO:environment:$ENV" EXISTING_SUBJECT=$(az ad app federated-credential list --id "$OBJECT_ID" --query "[?name=='github-actions-$ENV'].subject" -o tsv 2>/dev/null | head -n1) if [ -n "$EXISTING_SUBJECT" ] && [ "$EXISTING_SUBJECT" = "$SUBJECT" ]; then