From f7c2943b994ddc2d119701c9ee0c5625039440c3 Mon Sep 17 00:00:00 2001 From: "James N." Date: Fri, 13 Feb 2026 08:22:48 -0800 Subject: [PATCH 1/8] feat: per-developer GitHub Environments with OIDC - Create 6 GitHub environments: production, integration-james, integration-nicole, integration-heena, integration-tim, integration-matt - Move all variables from repo-level to environment-level - Update orchestrate.yml: *-dev branch integration- mapping - Uncomment environment: binding in all 7 reusable workflows - Fix TF state key: use environment name instead of branch name - Fix destroy.yml bugs: iteration var and unsanitized state key - Remove auto-destroy (all environments persist) - Add OIDC federated credentials for integration-james and production - Create prod.tfvars for production environment - Update GITHUB_ACTIONS_SETUP.md with developer onboarding guide --- .github/workflows/agent-evaluation.yml | 1 + .github/workflows/destroy.yml | 9 +- .github/workflows/docker-application.yml | 2 +- .github/workflows/docker-mcp.yml | 2 +- .github/workflows/infrastructure.yml | 20 +-- .github/workflows/orchestrate.yml | 55 +++++---- .github/workflows/update-containers.yml | 2 +- infra/GITHUB_ACTIONS_SETUP.md | 147 +++++++++++++---------- infra/terraform/prod.tfvars | 34 ++++++ 9 files changed, 168 insertions(+), 104 deletions(-) create mode 100644 infra/terraform/prod.tfvars diff --git a/.github/workflows/agent-evaluation.yml b/.github/workflows/agent-evaluation.yml index 952507cbe..725fd44c2 100644 --- a/.github/workflows/agent-evaluation.yml +++ b/.github/workflows/agent-evaluation.yml @@ -54,6 +54,7 @@ jobs: agent-evaluation: name: Agent Quality Evaluation runs-on: ubuntu-latest + environment: ${{ inputs.environment || 'integration' }} permissions: contents: read id-token: write # Needed for OIDC → DefaultAzureCredential diff --git a/.github/workflows/destroy.yml b/.github/workflows/destroy.yml index a47111ce3..38bfb6a09 100644 --- a/.github/workflows/destroy.yml +++ 
b/.github/workflows/destroy.yml @@ -30,7 +30,7 @@ jobs: terraform_destroy: name: Terraform Destroy runs-on: ubuntu-latest - # environment: ${{ inputs.environment || 'dev' }} # Commented out to use repo-level variables + environment: ${{ inputs.environment || 'integration' }} permissions: id-token: write contents: read @@ -66,13 +66,14 @@ jobs: -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} \ -var acr_name=${{ vars.ACR_NAME }} \ -var location=${{ vars.AZ_REGION }} \ - -var environment=${{ inputs.environment || 'dev' }} \ + -var environment=${{ inputs.environment || 'integration' }} \ -var docker_image_mcp=${{ vars.DOCKER_IMAGE_MCP }} \ -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} \ - -var iteration=${{ inputs.environment || 'dev' }} + -var iteration=${{ vars.ITERATION }} env: TFSTATE_RG: ${{ vars.TFSTATE_RG }} TFSTATE_ACCOUNT: ${{ vars.TFSTATE_ACCOUNT }} TFSTATE_CONTAINER: ${{ vars.TFSTATE_CONTAINER }} - TFSTATE_KEY: "${{ github.event.repository.name }}-${{ github.ref_name }}.tfstate" + # Use environment name for state key — must match infrastructure.yml + TFSTATE_KEY: "${{ github.event.repository.name }}-${{ inputs.environment || 'integration' }}.tfstate" diff --git a/.github/workflows/docker-application.yml b/.github/workflows/docker-application.yml index 49089cc44..407406041 100644 --- a/.github/workflows/docker-application.yml +++ b/.github/workflows/docker-application.yml @@ -27,7 +27,7 @@ jobs: build: name: Build & Push Backend Image runs-on: ubuntu-latest - # environment: ${{ inputs.environment || 'dev' }} # Commented out to use repo-level variables + environment: ${{ inputs.environment || 'integration' }} permissions: id-token: write contents: read diff --git a/.github/workflows/docker-mcp.yml b/.github/workflows/docker-mcp.yml index 1d995f362..f111351a9 100644 --- a/.github/workflows/docker-mcp.yml +++ b/.github/workflows/docker-mcp.yml @@ -27,7 +27,7 @@ jobs: build: name: Build & Push MCP Image runs-on: ubuntu-latest - # environment: 
${{ inputs.environment || 'dev' }} # Commented out to use repo-level variables + environment: ${{ inputs.environment || 'integration' }} permissions: id-token: write contents: read diff --git a/.github/workflows/infrastructure.yml b/.github/workflows/infrastructure.yml index 25f33238f..3ba7374ff 100644 --- a/.github/workflows/infrastructure.yml +++ b/.github/workflows/infrastructure.yml @@ -41,7 +41,7 @@ jobs: tf: name: Terraform Deployment runs-on: ubuntu-latest - # environment: removed to use repo-level variables + environment: ${{ inputs.environment }} if: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.iac-tool || 'tf') == 'tf' }} permissions: id-token: write @@ -65,13 +65,13 @@ jobs: - name: Terraform Setup uses: hashicorp/setup-terraform@v3 - - name: Sanitize branch name for state key + - name: Sanitize environment name for state key id: sanitize run: | # Replace / and other invalid chars with - for valid Azure blob name - BRANCH="${{ github.head_ref || github.ref_name }}" - SAFE_BRANCH=$(echo "$BRANCH" | sed 's/[^a-zA-Z0-9._-]/-/g') - echo "branch=$SAFE_BRANCH" >> $GITHUB_OUTPUT + ENV="${{ inputs.environment }}" + SAFE_ENV=$(echo "$ENV" | sed 's/[^a-zA-Z0-9._-]/-/g') + echo "env=$SAFE_ENV" >> $GITHUB_OUTPUT - name: Terraform Init/Plan/Apply id: terraform @@ -87,14 +87,14 @@ jobs: -backend-config="container_name=${TFSTATE_CONTAINER}" -backend-config="use_oidc=true" -backend-config="use_azuread_auth=true" terraform plan -out tfplan \ -var project_name=${{ github.event.repository.name }} \ - -var environment=${{ github.event_name == 'workflow_dispatch' && github.event.inputs.environment || (github.base_ref == 'main' && 'prod') || (github.base_ref == 'int-agentic' && 'integration') || 'dev' }} \ + -var environment=${{ inputs.environment }} \ -var tenant_id=${{ vars.AZURE_TENANT_ID }} \ -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} \ -var acr_name=${{ vars.ACR_NAME }} \ -var location=${{ vars.AZ_REGION }} \ -var docker_image_mcp=${{ 
vars.DOCKER_IMAGE_MCP }} \ -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} \ - -var iteration=${{ (github.event_name != 'workflow_dispatch' && github.base_ref != 'main' && github.base_ref != 'int-agentic') && '${GITHUB_SHA:0:7}' || vars.ITERATION }} + -var iteration=${{ vars.ITERATION }} terraform apply -auto-approve tfplan @@ -109,12 +109,12 @@ jobs: TFSTATE_RG: ${{ vars.TFSTATE_RG }} TFSTATE_ACCOUNT: ${{ vars.TFSTATE_ACCOUNT }} TFSTATE_CONTAINER: ${{ vars.TFSTATE_CONTAINER }} - # Use sanitized branch name for valid Azure blob name - TFSTATE_KEY: "${{ github.event.repository.name }}-${{ steps.sanitize.outputs.branch }}.tfstate" + # Use environment name for state key — each env gets its own TF state + TFSTATE_KEY: "${{ github.event.repository.name }}-${{ steps.sanitize.outputs.env }}.tfstate" bicep: runs-on: ubuntu-latest - # environment: removed to use repo-level variables + environment: ${{ inputs.environment }} if: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.iac-tool || 'tf') == 'bicep' }} permissions: id-token: write diff --git a/.github/workflows/orchestrate.yml b/.github/workflows/orchestrate.yml index 38c421497..ec09578fd 100644 --- a/.github/workflows/orchestrate.yml +++ b/.github/workflows/orchestrate.yml @@ -3,18 +3,22 @@ name: Orchestrate Deployment # ───────────────────────────────────────────────────────────────────── # Pipeline modes: # PR → main / int-agentic ➜ tests-only (validate against existing env) -# Push → main ➜ full deploy (deploy to prod after merge) -# Push → tjs-infra-as-code ➜ full deploy (dev, with auto-destroy) +# Push → main ➜ full deploy (deploy to production) +# Push → *-dev ➜ full deploy (deploy to integration-{name}) # Manual dispatch ➜ full deploy (chosen environment) +# +# Per-developer environments: +# Each developer pushes to their own {name}-dev branch. 
+# The pipeline maps {name}-dev → integration-{name} environment, +# which contains that developer's own Azure subscription credentials.
# ───────────────────────────────────────────────────────────────────── on: workflow_dispatch: inputs: target_env: - type: choice - description: Environment to deploy - options: [dev, test, prod] + type: string + description: "Environment to deploy (e.g. integration-james, production)" required: true pull_request: @@ -25,7 +29,7 @@ on: push: branches: - main - - tjs-infra-as-code + - '*-dev' permissions: contents: read @@ -51,19 +55,26 @@ jobs: if [ "$EVENT" = "workflow_dispatch" ]; then ENV="${{ inputs.target_env }}" elif [ "$EVENT" = "pull_request" ]; then + # PRs: resolve from the target (base) branch case "${{ github.base_ref }}" in - main) ENV="prod" ;; + main) ENV="production" ;; int-agentic) ENV="integration" ;; - *) ENV="dev" ;; + *) ENV="integration" ;; esac elif [ "$EVENT" = "push" ]; then - case "${{ github.ref_name }}" in - main) ENV="prod" ;; - tjs-infra-as-code) ENV="dev" ;; - *) ENV="dev" ;; + BRANCH="${{ github.ref_name }}" + case "$BRANCH" in + main) + ENV="production" ;; + *-dev) + # Extract developer name: james-dev → integration-james + DEV_NAME="${BRANCH%-dev}" + ENV="integration-${DEV_NAME}" ;; + *) + ENV="integration" ;; esac else - ENV="dev" + ENV="integration" fi # ── Resolve pipeline mode ── @@ -90,6 +101,7 @@ jobs: needs: pipeline-config if: needs.pipeline-config.outputs.full_deploy == 'true' runs-on: ubuntu-latest + environment: ${{ needs.pipeline-config.outputs.environment }} steps: - name: Azure OIDC Login uses: azure/login@v2 @@ -155,6 +167,7 @@ jobs: needs: pipeline-config if: needs.pipeline-config.outputs.full_deploy == 'false' runs-on: ubuntu-latest + environment: ${{ needs.pipeline-config.outputs.environment }} outputs: backend_endpoint: ${{ steps.lookup.outputs.backend_endpoint }} mcp_endpoint: ${{ steps.lookup.outputs.mcp_endpoint }} @@ -242,17 +255,7 @@ jobs: secrets: inherit # ──────────────────────────────────────────────────────────────────── - # Optional: Destroy infrastructure (dev branches only, after tests pass) + # 
NOTE: Auto-destroy is disabled. All environments (integration-* and + # production) persist their infrastructure. To tear down an environment + # manually, use: workflow_dispatch → destroy.yml with the target env. # ──────────────────────────────────────────────────────────────────── - destroy-infrastructure: - needs: [pipeline-config, integration-tests, agent-evaluation] - if: >- - always() - && needs.pipeline-config.outputs.full_deploy == 'true' - && needs.integration-tests.result == 'success' - && (needs.agent-evaluation.result == 'success' || needs.agent-evaluation.result == 'skipped' || needs.agent-evaluation.result == 'failure') - && (github.ref_name == 'tjs-infra-as-code' || github.ref_name == 'james-dev' || (inputs.target_env && inputs.target_env == 'dev')) - uses: ./.github/workflows/destroy.yml - with: - environment: ${{ needs.pipeline-config.outputs.environment }} - secrets: inherit diff --git a/.github/workflows/update-containers.yml b/.github/workflows/update-containers.yml index 51460d7ff..9137649c6 100644 --- a/.github/workflows/update-containers.yml +++ b/.github/workflows/update-containers.yml @@ -35,7 +35,7 @@ jobs: update-containers: name: Update Container Apps runs-on: ubuntu-latest - # environment: ${{ inputs.environment }} # Commented out to use repo-level variables + environment: ${{ inputs.environment }} permissions: id-token: write contents: read diff --git a/infra/GITHUB_ACTIONS_SETUP.md b/infra/GITHUB_ACTIONS_SETUP.md index 010b9bf67..a8178319a 100644 --- a/infra/GITHUB_ACTIONS_SETUP.md +++ b/infra/GITHUB_ACTIONS_SETUP.md @@ -7,7 +7,8 @@ This guide documents how to configure GitHub Actions for automated infrastructur The CI/CD pipeline uses: - **OIDC Authentication** - No secrets stored in GitHub, uses federated identity - **Remote Terraform State** - Shared state in Azure Storage for team collaboration -- **Environment-based Deployments** - Separate configs for dev, integration, prod +- **Per-developer GitHub Environments** - Each 
developer has their own `integration-{name}` environment backed by their own Azure subscription +- **Environment-scoped Variables** - All Azure credentials and config are stored per-environment, not at repo level ## Architecture @@ -17,6 +18,10 @@ The CI/CD pipeline uses: ├─────────────────────────────────────────────────────────────────────┤ │ orchestrate.yml │ │ ├── pipeline-config (determine mode + environment) │ +│ │ ├── main branch → production environment │ +│ │ ├── james-dev branch → integration-james environment │ +│ │ ├── nicole-dev branch → integration-nicole environment │ +│ │ └── {name}-dev branch → integration-{name} environment │ │ │ │ │ ├── [Full Deploy – push/manual] │ │ │ ├── preflight (enable storage access) │ @@ -25,8 +30,7 @@ The CI/CD pipeline uses: │ │ ├── docker-mcp.yml (build MCP service image) │ │ │ ├── update-containers.yml (refresh running apps) │ │ │ ├── integration-tests.yml (smoke tests) │ -│ │ ├── agent-evaluation.yml (AI quality evaluation) │ -│ │ └── destroy.yml (optional cleanup, dev only) │ +│ │ └── agent-evaluation.yml (AI quality evaluation) │ │ │ │ │ ├── [Tests Only – pull requests] │ │ │ └── resolve-endpoints (az containerapp show) │ @@ -37,10 +41,9 @@ The CI/CD pipeline uses: │ OIDC (no secrets) ▼ ┌─────────────────────────────────────────────────────────────────────┐ -│ Azure │ +│ Azure (per developer subscription) │ ├─────────────────────────────────────────────────────────────────────┤ -│ ├── App Registration (GitHub-Actions-OpenAIWorkshop) │ -│ │ └── Federated Credentials (main, int-agentic, PRs) │ +│ ├── App Registration (federated credential for environment) │ │ ├── Storage Account (Terraform state) │ │ ├── Container Registry (Docker images) │ │ ├── Container Apps (MCP + Backend) │ @@ -90,54 +93,49 @@ Write-Host "Subscription ID: $SubscriptionId" ## Step 2: Configure Federated Credentials -Create federated credentials for each branch/environment.
-> **Important:** GitHub org/repos that have a [customized OIDC subject claim template](https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/about-security-hardening-with-openid-connect#customizing-the-subject-claims-for-an-organization-or-repository) -> use a numeric subject format: `repository_owner_id:{owner_id}:repository_id:{repo_id}:...`. -> You can find these IDs via `gh api repos/{owner}/{repo} --jq '.owner.id, .id'`. -> If your org has NOT customized the template, use the default `repo:ORG/REPO:...` format. +> **Important:** This repo uses a [customized OIDC subject claim template](https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/about-security-hardening-with-openid-connect#customizing-the-subject-claims-for-an-organization-or-repository) +> with `repository_owner_id` and `repository_id` instead of the default `repo:ORG/REPO:...` format. +> All CI jobs bind an `environment:` context, so the OIDC subject includes `environment:{env_name}`. ```powershell $AppId = "YOUR_APP_ID" # From Step 1 -# --- Option A: Default subject format --- -# Main branch (prod) +# ── Per-developer integration environment ── +# Replace {name} with your developer name (e.g., james, nicole, tim) +# The subject must exactly match what GitHub presents in the OIDC token. az ad app federated-credential create --id $AppId --parameters '{ - "name": "github-main", + "name": "github-env-integration-{name}", "issuer": "https://token.actions.githubusercontent.com", - "subject": "repo:YOUR_ORG/YOUR_REPO:ref:refs/heads/main", + "subject": "repository_owner_id:6154722:repository_id:605201834:environment:integration-{name}", "audiences": ["api://AzureADTokenExchange"] }' -# --- Option B: Customized (numeric ID) subject format --- -# Use this if your org has customized the OIDC subject claim template. -# Replace OWNER_ID and REPO_ID with actual values from the GitHub API.
- -# Main branch (prod) -az ad app federated-credential create --id $AppId --parameters '{ - "name": "github-main", - "issuer": "https://token.actions.githubusercontent.com", - "subject": "repository_owner_id:OWNER_ID:repository_id:REPO_ID:ref:refs/heads/main", - "audiences": ["api://AzureADTokenExchange"] -}' - -# Integration branch +# ── Production environment (only needed for the prod subscription owner) ── az ad app federated-credential create --id $AppId --parameters '{ - "name": "github-int-agentic", + "name": "github-env-production", "issuer": "https://token.actions.githubusercontent.com", - "subject": "repository_owner_id:OWNER_ID:repository_id:REPO_ID:ref:refs/heads/int-agentic", + "subject": "repository_owner_id:6154722:repository_id:605201834:environment:production", "audiences": ["api://AzureADTokenExchange"] }' -# Pull Requests +# ── Pull Requests (for PR validation against existing env) ── +# Note: PR jobs also bind environment:, so the subject includes it. +# You may need a credential for the PR context too if your PRs run OIDC. 
az ad app federated-credential create --id $AppId --parameters '{ "name": "github-pullrequests", "issuer": "https://token.actions.githubusercontent.com", "subject": "repository_owner_id:6154722:repository_id:605201834:pull_request", "audiences": ["api://AzureADTokenExchange"] }' ``` +> **How to find your IDs:** +> - Owner ID: `gh api repos/microsoft/OpenAIWorkshop --jq '.owner.id'` → `6154722` +> - Repo ID: `gh api repos/microsoft/OpenAIWorkshop --jq '.id'` → `605201834` +> - Check current OIDC template: `gh api repos/microsoft/OpenAIWorkshop/actions/oidc/customization/sub` + ## Step 3: Assign Azure Roles ```powershell @@ -224,36 +222,44 @@ az role assignment create ` --scope $STORAGE_ID ``` -## Step 5: Configure GitHub Repository Variables +## Step 5: Configure GitHub Environment Variables + +All variables are stored at the **environment level** (not repo level). Each developer's +`integration-{name}` environment contains their own Azure subscription credentials.
-Go to **GitHub → Repository → Settings → Secrets and Variables → Actions → Variables** +Go to **GitHub → Repository → Settings → Environments → `integration-{name}` → Environment variables** -### Required Variables +### Required Variables (per environment) | Variable | Description | Example Value | |----------|-------------|---------------| -| `AZURE_CLIENT_ID` | App Registration Client ID | `1d34c51d-9d49-48f3-9e48-6a0f099c5f03` | -| `AZURE_TENANT_ID` | Azure AD Tenant ID | `0fbe7234-45ea-498b-b7e4-1a8b2d3be4d9` | -| `AZURE_SUBSCRIPTION_ID` | Azure Subscription ID | `840b5c5c-3f4a-459a-94fc-6bad2a969f9d` | +| `AZURE_CLIENT_ID` | App Registration Client ID | `1d34c51d-...` | +| `AZURE_TENANT_ID` | Azure AD Tenant ID | `0fbe7234-...` | +| `AZURE_SUBSCRIPTION_ID` | Azure Subscription ID | `840b5c5c-...` | | `TFSTATE_RG` | Resource group for TF state | `rg-tfstate` | -| `TFSTATE_ACCOUNT` | Storage account name | `sttfstateoaiworkshop` | +| `TFSTATE_ACCOUNT` | Storage account name (globally unique) | `sttfstateoaiworkshop` | | `TFSTATE_CONTAINER` | Blob container name | `tfstate` | -| `ACR_NAME` | Azure Container Registry name | `acropenaiworkshop002` | -| `PROJECT_NAME` | Project identifier | `OpenAIWorkshop` | +| `ACR_NAME` | Azure Container Registry name | `OpenAIWorkshopdevacr002` | +| `PROJECT_NAME` | Project identifier | `openaiworkshop` | | `ITERATION` | Deployment iteration | `002` | | `AZ_REGION` | Azure region | `eastus2` | -| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry project endpoint for evaluation | `https://eastus2oai.services.ai.azure.com/api/projects/eastus2` | -| `AZURE_OPENAI_EVAL_ENDPOINT` | AI Services endpoint for judge models | `https://eastus2oai.services.ai.azure.com/` | +| `DOCKER_IMAGE_MCP` | MCP Docker image name | `mcp-service` | +| `DOCKER_IMAGE_BACKEND` | Backend Docker image name | `backend-service` | +| `REGISTRY_LOGIN_SERVER` | Container registry server | `docker.io` | +| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry project endpoint for evaluation
| `https://...services.ai.azure.com/api/projects/...` | +| `AZURE_OPENAI_EVAL_ENDPOINT` | AI Services endpoint for judge models | `https://...services.ai.azure.com/` | | `AZURE_OPENAI_EVAL_DEPLOYMENT` | Model deployment for LLM-as-judge | `gpt-5.2` | -### Optional Environment-Specific Variables +### Current Environments -Create GitHub Environments (`dev`, `integration`, `prod`) for environment-specific overrides: - -| Environment | Variable | Value | -|-------------|----------|-------| -| `prod` | `AZ_REGION` | `eastus` | -| `prod` | `ITERATION` | `001` | +| Environment | Owner | Branch Mapping | +|-------------|-------|----------------| +| `production` | James | `main` | +| `integration-james` | James | `james-dev` | +| `integration-nicole` | Nicole | `nicole-dev` | +| `integration-heena` | Heena | `heena-dev` | +| `integration-tim` | Tim | `tim-dev` | +| `integration-matt` | Matt | `matt-dev` | --- @@ -263,10 +269,10 @@ The orchestrator has two modes determined by the trigger: | Trigger | Mode | What runs | Environment | |---------|------|-----------|-------------| -| **PR → `main`** | Tests only | `resolve-endpoints` → `integration-tests` | `prod` | +| **PR → `main`** | Tests only | `resolve-endpoints` → `integration-tests` | `production` | | **PR → `int-agentic`** | Tests only | `resolve-endpoints` → `integration-tests` | `integration` | -| **Push to `main`** (after merge) | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | `prod` | -| **Push to `tjs-infra-as-code`** | Full deploy | Preflight → Infra → Build → Update → Tests → Eval → Destroy | `dev` | +| **Push to `main`** (after merge) | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | `production` | +| **Push to `{name}-dev`** | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | `integration-{name}` | | **Manual dispatch** | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | Chosen env | ### Tests-Only Mode (PRs) @@ -288,23 +294,42 @@ environment.
| Workflow | Trigger | What it does | |----------|---------|--------------| -| `orchestrate.yml` | PRs, push to main/tjs-infra-as-code, manual | Orchestrates full or tests-only pipeline | +| `orchestrate.yml` | PRs, push to main/*-dev, manual | Orchestrates full or tests-only pipeline | | `infrastructure.yml` | Called by orchestrate (full deploy) | Terraform plan/apply | | `docker-application.yml` | Called by orchestrate (full deploy) | Build backend container | | `docker-mcp.yml` | Called by orchestrate (full deploy) | Build MCP container | | `update-containers.yml` | Called by orchestrate (full deploy) | Refresh Container Apps | -| `destroy.yml` | Called by orchestrate (dev only) | Terraform destroy | +| `destroy.yml` | Manual dispatch only | Terraform destroy | | `agent-evaluation.yml` | Called by orchestrate (full deploy) | AI quality evaluation via Azure AI Foundry | | `integration-tests.yml` | Called by orchestrate (both modes) | Run pytest integration tests | ## Branch to Environment Mapping -| Branch | Environment | Auto-destroy | -|--------|-------------|--------------| -| `main` | `prod` | ❌ No | -| `int-agentic` | `integration` | ❌ No | -| `tjs-infra-as-code` | `dev` | ✅ Yes | -| Other branches | `dev` | Depends on config | +| Branch | Environment | Persistent | +|--------|-------------|------------| +| `main` | `production` | ✅ Yes | +| `james-dev` | `integration-james` | ✅ Yes | +| `nicole-dev` | `integration-nicole` | ✅ Yes | +| `heena-dev` | `integration-heena` | ✅ Yes | +| `tim-dev` | `integration-tim` | ✅ Yes | +| `matt-dev` | `integration-matt` | ✅ Yes | +| `{name}-dev` | `integration-{name}` | ✅ Yes | + +> All environments persist their infrastructure. To tear down manually, use +> `workflow_dispatch` → `destroy.yml` with the target environment. + +--- + +## Developer Onboarding + +To add a new developer to the pipeline: + +1. **Create an Azure App Registration** in the developer's own Azure tenant (Step 1 above) +2.
**Add a federated credential** with subject `repository_owner_id:6154722:repository_id:605201834:environment:integration-{name}` (Step 2 above) +3. **Assign Azure roles** to the service principal (Steps 3 and 3b above) +4. **Create TF state storage** in the developer's subscription (Step 4 above) +5. **Ask a repo admin** to create the `integration-{name}` GitHub Environment and set the 16 environment variables (Step 5 above) +6. **Developer pushes to `{name}-dev`** branch — the pipeline will pick up the environment automatically --- diff --git a/infra/terraform/prod.tfvars b/infra/terraform/prod.tfvars new file mode 100644 index 000000000..8ee8b17d5 --- /dev/null +++ b/infra/terraform/prod.tfvars @@ -0,0 +1,34 @@ +# Production environment configuration +environment = "production" +location = "eastus2" +project_name = "OpenAIWorkshop" +iteration = "002" +tenant_id = "0fbe7234-45ea-498b-b7e4-1a8b2d3be4d9" +subscription_id = "840b5c5c-3f4a-459a-94fc-6bad2a969f9d" + +# Optional: Set to false if you want to use API keys (not recommended) +use_cosmos_managed_identity = true + +# OpenAI deployment configuration +create_openai_deployment = true +openai_deployment_name = "gpt-5.2-chat" +openai_model_name = "gpt-5.2-chat" +openai_model_version = "2025-12-11" +openai_api_version = "2025-04-01-preview" +openai_deployment_capacity = 200 # 200k tokens/minute + +# OpenAI embedding deployment configuration +create_openai_embedding_deployment = true +openai_embedding_deployment_name = "text-embedding-ada-002" +openai_embedding_model_name = "text-embedding-ada-002" +openai_embedding_model_version = "2" + +# Networking configuration +enable_networking = true +enable_private_endpoint = true +vnet_address_prefix = "10.10.0.0/16" +container_apps_subnet_prefix = "10.10.0.0/23" +private_endpoint_subnet_prefix = "10.10.2.0/24" + +# MCP Service Security +mcp_internal_only = true From 055915b110813295ae1374fe0c9d75ea517962cc Mon Sep 17 00:00:00 2001 From: "James N."
Date: Fri, 13 Feb 2026 08:34:00 -0800 Subject: [PATCH 2/8] fix: strip hyphens from ACR name in Docker workflows --- .github/workflows/docker-application.yml | 3 ++- .github/workflows/docker-mcp.yml | 3 ++- .github/workflows/update-containers.yml | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker-application.yml b/.github/workflows/docker-application.yml index 407406041..3f1ab122a 100644 --- a/.github/workflows/docker-application.yml +++ b/.github/workflows/docker-application.yml @@ -46,10 +46,11 @@ jobs: id: acr run: | # Construct ACR name matching Terraform pattern: {project}{env}acr{iteration} + # ACR names must be alphanumeric — strip hyphens to match Terraform's replace("-", "") PROJECT="${{ vars.PROJECT_NAME || 'OpenAIWorkshop' }}" ENV="${{ inputs.environment || 'dev' }}" ITERATION="${{ vars.ITERATION || '002' }}" - ACR_NAME="${PROJECT}${ENV}acr${ITERATION}" + ACR_NAME=$(echo "${PROJECT}${ENV}acr${ITERATION}" | tr -d '-') echo "name=${ACR_NAME}" >> $GITHUB_OUTPUT echo "server=${ACR_NAME}.azurecr.io" >> $GITHUB_OUTPUT echo "Using ACR: ${ACR_NAME}" diff --git a/.github/workflows/docker-mcp.yml b/.github/workflows/docker-mcp.yml index f111351a9..3242142a8 100644 --- a/.github/workflows/docker-mcp.yml +++ b/.github/workflows/docker-mcp.yml @@ -46,10 +46,11 @@ jobs: id: acr run: | # Construct ACR name matching Terraform pattern: {project}{env}acr{iteration} + # ACR names must be alphanumeric — strip hyphens to match Terraform's replace("-", "") PROJECT="${{ vars.PROJECT_NAME || 'OpenAIWorkshop' }}" ENV="${{ inputs.environment || 'dev' }}" ITERATION="${{ vars.ITERATION || '002' }}" - ACR_NAME="${PROJECT}${ENV}acr${ITERATION}" + ACR_NAME=$(echo "${PROJECT}${ENV}acr${ITERATION}" | tr -d '-') echo "name=${ACR_NAME}" >> $GITHUB_OUTPUT echo "server=${ACR_NAME}.azurecr.io" >> $GITHUB_OUTPUT echo "Using ACR: ${ACR_NAME}" diff --git a/.github/workflows/update-containers.yml b/.github/workflows/update-containers.yml index 
9137649c6..294d3aee7 100644 --- a/.github/workflows/update-containers.yml +++ b/.github/workflows/update-containers.yml @@ -64,7 +64,8 @@ jobs: echo "backend_app=ca-be-${ITERATION}" >> $GITHUB_OUTPUT # ACR name follows Terraform pattern: {project}{env}acr{iteration} - ACR_NAME="${PROJECT}${ENV}acr${ITERATION}" + # ACR names must be alphanumeric — strip hyphens to match Terraform's replace("-", "") + ACR_NAME=$(echo "${PROJECT}${ENV}acr${ITERATION}" | tr -d '-') echo "acr_name=${ACR_NAME}" >> $GITHUB_OUTPUT echo "acr_server=${ACR_NAME}.azurecr.io" >> $GITHUB_OUTPUT echo "Using ACR: ${ACR_NAME}" From 0f10403dd1353584d10db7c3dcdd21510762bdd1 Mon Sep 17 00:00:00 2001 From: "James N." Date: Fri, 13 Feb 2026 10:15:58 -0800 Subject: [PATCH 3/8] Add auto-import recovery for Terraform 'already exists' errors When a Terraform apply fails midway (e.g., timeout, quota), resources may exist in Azure but not in TF state. On retry, Terraform fails with 'already exists'. This change adds a retry loop (max 3 attempts) that: 1. Detects 'already exists' errors in apply output 2. Parses the TF resource address and Azure resource ID 3. Auto-imports orphaned resources into state 4. Retries the apply Eliminates need for manual deletion via Azure Portal. 
--- .github/workflows/infrastructure.yml | 74 +++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/.github/workflows/infrastructure.yml b/.github/workflows/infrastructure.yml index 3ba7374ff..00a4f579b 100644 --- a/.github/workflows/infrastructure.yml +++ b/.github/workflows/infrastructure.yml @@ -82,21 +82,73 @@ jobs: export ARM_TENANT_ID="${{ vars.AZURE_TENANT_ID }}" export ARM_SUBSCRIPTION_ID="${{ vars.AZURE_SUBSCRIPTION_ID }}" + # Common -var flags used by plan and import + TF_VARS=( + -var project_name=${{ github.event.repository.name }} + -var environment=${{ inputs.environment }} + -var tenant_id=${{ vars.AZURE_TENANT_ID }} + -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} + -var acr_name=${{ vars.ACR_NAME }} + -var location=${{ vars.AZ_REGION }} + -var docker_image_mcp=${{ vars.DOCKER_IMAGE_MCP }} + -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} + -var iteration=${{ vars.ITERATION }} + ) + terraform init -backend-config="resource_group_name=${TFSTATE_RG}" \ -backend-config="key=${TFSTATE_KEY}" -backend-config="storage_account_name=${TFSTATE_ACCOUNT}" \ -backend-config="container_name=${TFSTATE_CONTAINER}" -backend-config="use_oidc=true" -backend-config="use_azuread_auth=true" - terraform plan -out tfplan \ - -var project_name=${{ github.event.repository.name }} \ - -var environment=${{ inputs.environment }} \ - -var tenant_id=${{ vars.AZURE_TENANT_ID }} \ - -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} \ - -var acr_name=${{ vars.ACR_NAME }} \ - -var location=${{ vars.AZ_REGION }} \ - -var docker_image_mcp=${{ vars.DOCKER_IMAGE_MCP }} \ - -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} \ - -var iteration=${{ vars.ITERATION }} - terraform apply -auto-approve tfplan + # ── Apply with auto-import on "already exists" errors ── + # If a prior run partially created resources but crashed before recording + # them in state, Terraform will fail with "already exists". 
This loop + # detects those errors, auto-imports the orphaned resources, and retries. + MAX_ATTEMPTS=3 + for attempt in $(seq 1 $MAX_ATTEMPTS); do + echo "🔄 Terraform apply attempt $attempt/$MAX_ATTEMPTS" + + terraform plan -out tfplan "${TF_VARS[@]}" + + if terraform apply -auto-approve tfplan 2>&1 | tee /tmp/tf_apply.log; then + echo "✅ Terraform apply succeeded" + break + fi + + # Check if the failure is due to "already exists" errors + if ! grep -q "already exists" /tmp/tf_apply.log; then + echo "❌ Terraform failed with a non-import error" + cat /tmp/tf_apply.log + exit 1 + fi + + if [ "$attempt" -eq "$MAX_ATTEMPTS" ]; then + echo "❌ Terraform failed after $MAX_ATTEMPTS attempts" + cat /tmp/tf_apply.log + exit 1 + fi + + echo "⚠️ Detected 'already exists' errors — auto-importing orphaned resources..." + + # Parse error output: extract terraform address and Azure resource ID pairs + # Error format: with azurerm_container_app.mcp, + # followed by: a resource with the ID "/.../containerApps/ca-mcp-002" already exists + while IFS= read -r line; do + # Extract the TF resource address (e.g. azurerm_container_app.mcp) + tf_addr=$(echo "$line" | grep -oP 'with \K[a-zA-Z0-9_.]+(?=,)') + # Extract the Azure resource ID + azure_id=$(echo "$line" | grep -oP 'the ID "\K[^"]+') + + if [ -n "$tf_addr" ] && [ -n "$azure_id" ]; then + echo " 📥 Importing $tf_addr → $azure_id" + terraform import "${TF_VARS[@]}" "$tf_addr" "$azure_id" || true + fi + done < <( + # Combine consecutive lines so address + ID are on the same logical line + cat /tmp/tf_apply.log | tr '\n' '§' | sed 's/§│/│/g' | tr '§' '\n' | grep "already exists" + ) + + echo "🔁 Retrying terraform apply..." + done output=$(terraform output -raw openai_endpoint 2>/dev/null || true) echo "MODEL_ENDPOINT=$output" >> $GITHUB_OUTPUT From 1a93fc32fb28ada132d857de70112110a79de236 Mon Sep 17 00:00:00 2001 From: "James N." 
Date: Fri, 13 Feb 2026 10:18:32 -0800 Subject: [PATCH 4/8] Rename workflow to CI/CD Pipeline; fix PR trigger for int-agentic - Rename 'Orchestrate Deployment' -> 'CI/CD Pipeline' - Remove int-agentic from pull_request trigger PRs to int-agentic were failing because environment 'integration' has no OIDC federated credential. PR validation only needed for main (production gate). - Simplify base_ref case statement --- .github/workflows/orchestrate.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/orchestrate.yml b/.github/workflows/orchestrate.yml index ec09578fd..93716859d 100644 --- a/.github/workflows/orchestrate.yml +++ b/.github/workflows/orchestrate.yml @@ -1,4 +1,4 @@ -name: Orchestrate Deployment +name: CI/CD Pipeline # ───────────────────────────────────────────────────────────────────── # Pipeline modes: @@ -24,7 +24,6 @@ on: pull_request: branches: - main - - int-agentic push: branches: @@ -58,7 +57,6 @@ jobs: # PRs: resolve from the target (base) branch case "${{ github.base_ref }}" in main) ENV="production" ;; - int-agentic) ENV="integration" ;; *) ENV="integration" ;; esac elif [ "$EVENT" = "push" ]; then From 2aa25a0261ea14b38367a876094ec0478da68821 Mon Sep 17 00:00:00 2001 From: "James N." Date: Fri, 13 Feb 2026 13:12:06 -0800 Subject: [PATCH 5/8] fix: gracefully skip PR tests when target environment not yet deployed resolve-endpoints now sets deployed=false instead of exit 1 when Container Apps don't exist. integration-tests job checks this flag and skips when the environment hasn't been deployed yet (e.g., first PR to production). 
--- .github/workflows/orchestrate.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/orchestrate.yml b/.github/workflows/orchestrate.yml index 93716859d..56a34d48b 100644 --- a/.github/workflows/orchestrate.yml +++ b/.github/workflows/orchestrate.yml @@ -169,6 +169,7 @@ jobs: outputs: backend_endpoint: ${{ steps.lookup.outputs.backend_endpoint }} mcp_endpoint: ${{ steps.lookup.outputs.mcp_endpoint }} + deployed: ${{ steps.lookup.outputs.deployed }} steps: - name: Azure OIDC Login uses: azure/login@v2 @@ -199,11 +200,13 @@ jobs: if [ -n "$BE_FQDN" ]; then echo "backend_endpoint=https://${BE_FQDN}" >> $GITHUB_OUTPUT + echo "deployed=true" >> $GITHUB_OUTPUT echo "✅ Backend: https://${BE_FQDN}" else - echo "::error::Backend Container App not found in $RG – is the environment deployed?" + echo "::warning::Backend Container App not found in $RG – environment not yet deployed. Skipping PR tests." echo "backend_endpoint=" >> $GITHUB_OUTPUT - exit 1 + echo "deployed=false" >> $GITHUB_OUTPUT + exit 0 fi if [ -n "$MCP_FQDN" ]; then @@ -224,7 +227,7 @@ jobs: if: >- always() && ( needs.update-containers.result == 'success' - || needs.resolve-endpoints.result == 'success' + || (needs.resolve-endpoints.result == 'success' && needs.resolve-endpoints.outputs.deployed == 'true') ) uses: ./.github/workflows/integration-tests.yml with: From 21e09be4693a9e91bb01e5bf8a10e052f30fe412 Mon Sep 17 00:00:00 2001 From: "James N." Date: Fri, 13 Feb 2026 13:16:25 -0800 Subject: [PATCH 6/8] feat: auto-promotion pipeline (dev int-agentic main) - New workflow: promote-to-main.yml Triggered on push to int-agentic, creates/updates a single rolling PR to main with latest commit summary. Human review required. - New job: auto-merge in orchestrate.yml After successful full pipeline on *-dev branch, auto-merges the open PR from that dev branch into int-agentic (squash merge). 
- Updated permissions: contents:write, pull-requests:write Flow: dev push full pipeline auto-merge to int-agentic auto-create PR to main human review merge prod deploy --- .github/workflows/orchestrate.yml | 57 +++++++++++++++- .github/workflows/promote-to-main.yml | 94 +++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/promote-to-main.yml diff --git a/.github/workflows/orchestrate.yml b/.github/workflows/orchestrate.yml index 56a34d48b..19d01b36d 100644 --- a/.github/workflows/orchestrate.yml +++ b/.github/workflows/orchestrate.yml @@ -2,11 +2,16 @@ name: CI/CD Pipeline # ───────────────────────────────────────────────────────────────────── # Pipeline modes: -# PR → main / int-agentic ➜ tests-only (validate against existing env) +# PR → main ➜ tests-only (validate against existing env) # Push → main ➜ full deploy (deploy to production) # Push → *-dev ➜ full deploy (deploy to integration-) # Manual dispatch ➜ full deploy (chosen environment) # +# Promotion flow: +# *-dev push → full pipeline → auto-merge PR to int-agentic +# int-agentic push → promote-to-main.yml → creates/updates PR to main +# main PR → human review + tests-only → merge → full deploy to production +# # Per-developer environments: # Each developer pushes to their own -dev branch. # The pipeline maps -dev → integration- environment, @@ -31,7 +36,8 @@ on: - '*-dev' permissions: - contents: read + contents: write + pull-requests: write id-token: write @@ -255,6 +261,53 @@ jobs: eval_limit: 5 secrets: inherit + # ──────────────────────────────────────────────────────────────────── + # Step 7: Auto-merge dev branch PR into int-agentic + # After a successful full deploy from a *-dev branch, automatically + # merge the open PR from that branch into int-agentic. This triggers + # the promote-to-main workflow which creates/updates a PR to main. 
+ # ──────────────────────────────────────────────────────────────────── + auto-merge: + needs: [pipeline-config, integration-tests, agent-evaluation] + if: >- + always() + && needs.pipeline-config.outputs.full_deploy == 'true' + && needs.integration-tests.result == 'success' + && (needs.agent-evaluation.result == 'success' || needs.agent-evaluation.result == 'skipped') + && github.event_name == 'push' + && endsWith(github.ref_name, '-dev') + runs-on: ubuntu-latest + steps: + - name: Merge dev PR into int-agentic + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + BRANCH="${{ github.ref_name }}" + echo "🔍 Looking for open PR: ${BRANCH} → int-agentic" + + PR_NUMBER=$(gh pr list \ + --repo "${{ github.repository }}" \ + --base int-agentic \ + --head "$BRANCH" \ + --state open \ + --json number \ + --jq '.[0].number // empty') + + if [ -n "$PR_NUMBER" ]; then + echo "✅ Found PR #${PR_NUMBER}" + echo "🔀 Merging ${BRANCH} → int-agentic..." + gh pr merge "$PR_NUMBER" \ + --repo "${{ github.repository }}" \ + --squash \ + --auto \ + --subject "chore: merge ${BRANCH} into int-agentic (auto)" \ + --body "Auto-merged after successful CI/CD pipeline run ${{ github.run_id }}" + echo "✅ PR #${PR_NUMBER} merge initiated" + else + echo "ℹ️ No open PR found from ${BRANCH} → int-agentic" + echo " Create one with: gh pr create --base int-agentic --head ${BRANCH}" + fi + # ──────────────────────────────────────────────────────────────────── # NOTE: Auto-destroy is disabled. All environments (integration-* and # production) persist their infrastructure. To tear down an environment diff --git a/.github/workflows/promote-to-main.yml b/.github/workflows/promote-to-main.yml new file mode 100644 index 000000000..cb2c93faf --- /dev/null +++ b/.github/workflows/promote-to-main.yml @@ -0,0 +1,94 @@ +name: Promote to Main + +# ───────────────────────────────────────────────────────────────────── +# Triggered when int-agentic receives a merge (push). 
+# Creates or updates a single rolling PR from int-agentic → main. +# The PR accumulates all developer changes and requires human review +# before merging to production. +# ───────────────────────────────────────────────────────────────────── + +on: + push: + branches: + - int-agentic + +permissions: + contents: read + pull-requests: write + +jobs: + promote: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Create or update PR to main + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Check if there's already an open PR from int-agentic → main + EXISTING_PR=$(gh pr list \ + --base main \ + --head int-agentic \ + --state open \ + --json number \ + --jq '.[0].number // empty') + + if [ -n "$EXISTING_PR" ]; then + echo "✅ PR #${EXISTING_PR} already exists (int-agentic → main)" + echo " Updating PR body with latest commit info..." + + # Get recent commits since the PR was created + RECENT_COMMITS=$(git log origin/main..HEAD --oneline --no-merges | head -20) + + gh pr edit "$EXISTING_PR" --body "## Promotion: int-agentic → main + + This is an auto-maintained PR that promotes changes from \`int-agentic\` to \`main\` (production). + + **Review required** before merging to production. 
+ + ### Recent Changes + \`\`\` + ${RECENT_COMMITS} + \`\`\` + + ### Pipeline Status + - Integration tests passed on each developer's environment before merge to int-agentic + - Merging this PR will trigger a full production deployment + + --- + _Last updated: $(date -u '+%Y-%m-%d %H:%M UTC') by commit ${{ github.sha }}_" + + echo "✅ PR #${EXISTING_PR} body updated" + else + echo "📝 Creating new PR: int-agentic → main" + + RECENT_COMMITS=$(git log origin/main..HEAD --oneline --no-merges | head -20) + + gh pr create \ + --base main \ + --head int-agentic \ + --title "Promote: int-agentic → main (production)" \ + --body "## Promotion: int-agentic → main + + This is an auto-maintained PR that promotes changes from \`int-agentic\` to \`main\` (production). + + **Review required** before merging to production. + + ### Recent Changes + \`\`\` + ${RECENT_COMMITS} + \`\`\` + + ### Pipeline Status + - Integration tests passed on each developer's environment before merge to int-agentic + - Merging this PR will trigger a full production deployment + + --- + _Created: $(date -u '+%Y-%m-%d %H:%M UTC') by commit ${{ github.sha }}_" + + echo "✅ New PR created" + fi From 632597869475dade91849cc10bba915271b2a531 Mon Sep 17 00:00:00 2001 From: "James N." 
Date: Fri, 13 Feb 2026 14:37:45 -0800 Subject: [PATCH 7/8] ci: skip pipeline on doc-only changes; update workflows readme --- .github/workflows/orchestrate.yml | 10 +++ .github/workflows/promote-to-main.yml | 4 + .github/workflows/readme.md | 101 ++++++++++++++++++-------- 3 files changed, 85 insertions(+), 30 deletions(-) diff --git a/.github/workflows/orchestrate.yml b/.github/workflows/orchestrate.yml index 19d01b36d..52632b9b8 100644 --- a/.github/workflows/orchestrate.yml +++ b/.github/workflows/orchestrate.yml @@ -29,11 +29,21 @@ on: pull_request: branches: - main + paths-ignore: + - '**/*.md' + - 'docs/**' + - 'LICENSE' + - '.github/workflows/readme.md' push: branches: - main - '*-dev' + paths-ignore: + - '**/*.md' + - 'docs/**' + - 'LICENSE' + - '.github/workflows/readme.md' permissions: contents: write diff --git a/.github/workflows/promote-to-main.yml b/.github/workflows/promote-to-main.yml index cb2c93faf..5eef368a3 100644 --- a/.github/workflows/promote-to-main.yml +++ b/.github/workflows/promote-to-main.yml @@ -11,6 +11,10 @@ on: push: branches: - int-agentic + paths-ignore: + - '**/*.md' + - 'docs/**' + - 'LICENSE' permissions: contents: read diff --git a/.github/workflows/readme.md b/.github/workflows/readme.md index 2b79de81d..2f7cb8baf 100644 --- a/.github/workflows/readme.md +++ b/.github/workflows/readme.md @@ -1,40 +1,81 @@ -# Workflows +# CI/CD Pipeline -## infra-plan-apply.yml +## Flow -### Summary +``` +*-dev ──push──▶ CI/CD Pipeline (8 stages) ──pass──▶ auto-merge PR → int-agentic + │ +int-agentic ◀──────────────────────────────────────────────────┘ + │ + └──push──▶ promote-to-main.yml ──▶ creates/updates PR → main + │ +main ◀────────── human review + merge ◀──────────────────────┘ + │ + └──push──▶ CI/CD Pipeline (production deploy) +``` -The infra plan and apply pipeline is a pipeline to deploy the infrastructure necessary for the Azure Open AI Workshop ot run. 
It is currently configured to do a workflow dispatch that expects you to choose whether you want bicep or terraform as well as a target environment. Terraform is currently tested. +**Doc-only changes** (`.md`, `docs/`, `LICENSE`) are ignored — no pipeline runs. -### Requirements +## Workflows -#### Environment Variables in GitHub +| File | Trigger | Purpose | +|------|---------|---------| +| `orchestrate.yml` | push to `*-dev`/`main`, PR to `main` | Main CI/CD: infra → build → deploy → test → eval → auto-merge | +| `promote-to-main.yml` | push to `int-agentic` | Creates/updates a rolling PR from `int-agentic` → `main` | +| `infrastructure.yml` | called by orchestrate | Terraform plan + apply with auto-import recovery | +| `docker-application.yml` | called by orchestrate | Build & push backend container to ACR | +| `docker-mcp.yml` | called by orchestrate | Build & push MCP container to ACR | +| `update-containers.yml` | called by orchestrate | Deploy new images to Container Apps | +| `integration-tests.yml` | called by orchestrate | API tests against live environment | +| `agent-evaluation.yml` | called by orchestrate | Agent quality eval → Azure AI Foundry | +| `destroy.yml` | manual dispatch | Terraform destroy for a target environment | -Configure your repo to have necessary variables for your environments. At a minimum, the following are needed: -- AZ_REGION: azure region you plan to deploy to -- AZURE_CLIENT_ID: the deployment client. Currently, this is used with an OIDC process so we don't need to set the secrets. Because of the way we are deploying, needs the ability to assign RBAC in Azure as well as creating resources. -- AZURE_SUBSCRIPTION_ID: the subscription to deploy into. -- AZURE_TENANT_ID: the tenant the client was created in -- DOCKER_IMAGE_BACKEND: docker image repo/name:tag from docker hub for backend FastAPI service. Still need to test with ACR. Also need to test with dynamic build from the repo. 
-- DOCKER_IMAGE_MCP: docker image repo/name:tag from docker hub for MCP service. Still need to test with ACR. Also need to test with dynamic build from the repo. +## Pipeline Stages -Required for terraform: -- TFSTATE_ACCOUNT: We expect an Azure Storage account for the backend. This is the account name. -- TFSTATE_CONTAINER: the blob container within the storage account where we will hold the state. -- TFSTATE_RG: resource group holding the storage account. +| # | Stage | Push | PR | +|---|-------|------|----| +| 0 | **pipeline-config** — resolve environment & mode | ✅ | ✅ | +| 1 | **preflight** — unlock TF state storage | ✅ | — | +| 2 | **deploy-infrastructure** — Terraform | ✅ | — | +| 3 | **build containers** (backend + MCP, parallel) | ✅ | — | +| 4 | **update-containers** — deploy to Container Apps | ✅ | — | +| — | **resolve-endpoints** — look up existing env | — | ✅ | +| 5 | **integration-tests** | ✅ | ✅* | +| 6 | **agent-evaluation** → Foundry | ✅ | — | +| 7 | **auto-merge** — squash-merge dev PR → int-agentic | ✅† | — | -#### Azure Set Up +\* Skipped if target environment not yet deployed +† Only on `*-dev` branches -- Azure Subscription -- Resource group with a storage account for terraform -- Azure Service Principal (app registration) configured with federated credentials: +## Per-Developer Environments -``` -az ad app federated-credential create --id "$APP_ID" --parameters "$(jq -cn \ ---arg org "$ORG" --arg repo "$REPO_NAME" '{ -name: ("github-"+$repo+"-env-dev"), -issuer: "https://token.actions.githubusercontent.com", -subject: ("repo:"+$org+"/"+$repo+":environment:dev"), -audiences: ["api://AzureADTokenExchange"] -}')" -``` \ No newline at end of file +Each developer has their own GitHub Environment (`integration-<name>`) with their own Azure subscription and OIDC credentials. All config is stored as **environment-level variables** (zero repo-level variables).
+ +Branch mapping: `james-dev` → `integration-james`, `main` → `production` + +## Required Environment Variables + +| Variable | Description | +|----------|-------------| +| `AZURE_CLIENT_ID` | App registration client ID (OIDC) | +| `AZURE_TENANT_ID` | Entra ID tenant | +| `AZURE_SUBSCRIPTION_ID` | Target subscription | +| `AZ_REGION` | Azure region | +| `PROJECT_NAME` | Project name (e.g. `OpenAIWorkshop`) | +| `ITERATION` | Deployment iteration (e.g. `002`) | +| `TFSTATE_ACCOUNT` | TF state storage account | +| `TFSTATE_CONTAINER` | TF state blob container | +| `TFSTATE_RG` | TF state resource group | +| `MCP_SERVER_URI` | MCP service URI | +| `AZURE_OPENAI_CHAT_DEPLOYMENT` | Chat model deployment | +| `AZURE_OPENAI_EVAL_DEPLOYMENT` | Eval model deployment | +| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry project endpoint | +| `AZURE_OPENAI_API_VERSION` | OpenAI API version | + +## Azure Setup + +1. Azure subscription with a resource group + storage account for Terraform state +2. App registration with OIDC federated credentials for each GitHub Environment: + ``` + Subject: repo:microsoft/OpenAIWorkshop:environment:<environment-name> + ``` \ No newline at end of file From e61f15087a14e73322472cdc209ec7355856b069 Mon Sep 17 00:00:00 2001 From: "James N." Date: Fri, 13 Feb 2026 14:41:13 -0800 Subject: [PATCH 8/8] docs: consolidate CI/CD docs, link infra README to workflows readme --- infra/README.md | 95 ++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 65 deletions(-) diff --git a/infra/README.md b/infra/README.md index 389cfa0cd..687cf67fe 100644 --- a/infra/README.md +++ b/infra/README.md @@ -336,78 +336,43 @@ az containerapp logs show --name ca-be-002 --resource-group rg-OpenAIWorkshop-de ## Automated CI/CD (GitHub Actions) -For enterprise deployments, we recommend using GitHub Actions with OIDC authentication for secure, automated deployments.
+The project uses a fully automated CI/CD pipeline with **per-developer environments** and **OIDC authentication** (no stored secrets). -### 📖 Complete Setup Guide +### Pipeline Flow -See **[GITHUB_ACTIONS_SETUP.md](./GITHUB_ACTIONS_SETUP.md)** for detailed instructions on: +``` +*-dev push → CI/CD Pipeline → auto-merge → int-agentic → PR to main → human review → production deploy +``` -- Creating Azure App Registration with federated credentials -- Configuring GitHub repository variables and secrets -- Setting up Terraform remote state in Azure Storage -- Granting required Azure RBAC roles +Doc-only changes (`.md`, `docs/`, `LICENSE`) are ignored and do not trigger the pipeline. -### Quick Overview +### Setup -```mermaid -flowchart TB - subgraph GitHub["GitHub Repository"] - Push["Git Push"] - Orchestrate["orchestrate.yml"] - Infra["infrastructure.yml"] - DockerApp["docker-application.yml"] - DockerMCP["docker-mcp.yml"] - Update["update-containers.yml"] - Tests["integration-tests.yml"] - end - - subgraph Azure["Azure"] - OIDC["OIDC Federation"] - TFState["Terraform State"] - ACR["Container Registry"] - Resources["Azure Resources"] - end - - Push --> Orchestrate - Orchestrate --> OIDC - Orchestrate --> Infra - Infra --> TFState - Infra --> Resources - Orchestrate --> DockerApp - Orchestrate --> DockerMCP - DockerApp --> ACR - DockerMCP --> ACR - Orchestrate --> Update - Update --> Resources - Orchestrate --> Tests -``` +1. **Azure**: App Registration with OIDC federated credentials — see [GITHUB_ACTIONS_SETUP.md](./GITHUB_ACTIONS_SETUP.md) +2. **GitHub**: Create an Environment (`integration-<name>`) with environment-level variables (no repo-level vars) +3.
**Terraform state**: Storage account in Azure — see [GITHUB_ACTIONS_SETUP.md](./GITHUB_ACTIONS_SETUP.md) -### GitHub Actions Features +### Required Environment Variables -| Feature | Description | -|---------|-------------| -| **OIDC Authentication** | No secrets stored in GitHub - uses federated identity | -| **Remote State** | Terraform state stored in Azure Storage for team collaboration | -| **Multi-Environment** | Automatic environment detection based on branch | -| **Parallel Builds** | Backend and MCP containers build simultaneously | -| **Integration Tests** | Automated tests run after deployment | -| **Auto Cleanup** | Optional infrastructure destruction for dev branches | - -### Required GitHub Variables - -Set these in your repository settings (Settings → Secrets and variables → Actions → Variables): - -| Variable | Description | Example | -|----------|-------------|---------| -| `AZURE_CLIENT_ID` | App Registration Client ID | `1d34c51d-...` | -| `AZURE_TENANT_ID` | Azure AD Tenant ID | `0fbe7234-...` | -| `AZURE_SUBSCRIPTION_ID` | Azure Subscription ID | `840b5c5c-...` | -| `TFSTATE_RG` | Resource group for Terraform state | `rg-tfstate` | -| `TFSTATE_ACCOUNT` | Storage account for Terraform state | `sttfstateoaiworkshop` | -| `TFSTATE_CONTAINER` | Blob container for state files | `tfstate` | -| `PROJECT_NAME` | Project name for resource naming | `OpenAIWorkshop` | -| `ITERATION` | Iteration suffix | `002` | -| `AZ_REGION` | Azure region | `eastus2` | +| Variable | Example | +|----------|---------| +| `AZURE_CLIENT_ID` | `1d34c51d-...` | +| `AZURE_TENANT_ID` | `0fbe7234-...` | +| `AZURE_SUBSCRIPTION_ID` | `840b5c5c-...` | +| `AZ_REGION` | `eastus2` | +| `PROJECT_NAME` | `OpenAIWorkshop` | +| `ITERATION` | `002` | +| `TFSTATE_RG` / `TFSTATE_ACCOUNT` / `TFSTATE_CONTAINER` | TF state storage | +| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry endpoint | +| `AZURE_OPENAI_EVAL_DEPLOYMENT` | Eval model name | + +### 📖 Full Pipeline Documentation + +See 
**[../.github/workflows/readme.md](../.github/workflows/readme.md)** for complete details on: +- Pipeline stages and promotion flow +- Workflow file reference +- Per-developer environment architecture +- Path filtering rules ---