From f7c2943b994ddc2d119701c9ee0c5625039440c3 Mon Sep 17 00:00:00 2001 From: "James N." Date: Fri, 13 Feb 2026 08:22:48 -0800 Subject: [PATCH 1/4] feat: per-developer GitHub Environments with OIDC - Create 6 GitHub environments: production, integration-james, integration-nicole, integration-heena, integration-tim, integration-matt - Move all variables from repo-level to environment-level - Update orchestrate.yml: *-dev branch integration- mapping - Uncomment environment: binding in all 7 reusable workflows - Fix TF state key: use environment name instead of branch name - Fix destroy.yml bugs: iteration var and unsanitized state key - Remove auto-destroy (all environments persist) - Add OIDC federated credentials for integration-james and production - Create prod.tfvars for production environment - Update GITHUB_ACTIONS_SETUP.md with developer onboarding guide --- .github/workflows/agent-evaluation.yml | 1 + .github/workflows/destroy.yml | 9 +- .github/workflows/docker-application.yml | 2 +- .github/workflows/docker-mcp.yml | 2 +- .github/workflows/infrastructure.yml | 20 +-- .github/workflows/orchestrate.yml | 55 +++++---- .github/workflows/update-containers.yml | 2 +- infra/GITHUB_ACTIONS_SETUP.md | 147 +++++++++++++---------- infra/terraform/prod.tfvars | 34 ++++++ 9 files changed, 168 insertions(+), 104 deletions(-) create mode 100644 infra/terraform/prod.tfvars diff --git a/.github/workflows/agent-evaluation.yml b/.github/workflows/agent-evaluation.yml index 952507cbe..725fd44c2 100644 --- a/.github/workflows/agent-evaluation.yml +++ b/.github/workflows/agent-evaluation.yml @@ -54,6 +54,7 @@ jobs: agent-evaluation: name: Agent Quality Evaluation runs-on: ubuntu-latest + environment: ${{ inputs.environment || 'integration' }} permissions: contents: read id-token: write # Needed for OIDC → DefaultAzureCredential diff --git a/.github/workflows/destroy.yml b/.github/workflows/destroy.yml index a47111ce3..38bfb6a09 100644 --- a/.github/workflows/destroy.yml +++ 
b/.github/workflows/destroy.yml @@ -30,7 +30,7 @@ jobs: terraform_destroy: name: Terraform Destroy runs-on: ubuntu-latest - # environment: ${{ inputs.environment || 'dev' }} # Commented out to use repo-level variables + environment: ${{ inputs.environment || 'integration' }} permissions: id-token: write contents: read @@ -66,13 +66,14 @@ jobs: -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} \ -var acr_name=${{ vars.ACR_NAME }} \ -var location=${{ vars.AZ_REGION }} \ - -var environment=${{ inputs.environment || 'dev' }} \ + -var environment=${{ inputs.environment || 'integration' }} \ -var docker_image_mcp=${{ vars.DOCKER_IMAGE_MCP }} \ -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} \ - -var iteration=${{ inputs.environment || 'dev' }} + -var iteration=${{ vars.ITERATION }} env: TFSTATE_RG: ${{ vars.TFSTATE_RG }} TFSTATE_ACCOUNT: ${{ vars.TFSTATE_ACCOUNT }} TFSTATE_CONTAINER: ${{ vars.TFSTATE_CONTAINER }} - TFSTATE_KEY: "${{ github.event.repository.name }}-${{ github.ref_name }}.tfstate" + # Use environment name for state key — must match infrastructure.yml + TFSTATE_KEY: "${{ github.event.repository.name }}-${{ inputs.environment || 'integration' }}.tfstate" diff --git a/.github/workflows/docker-application.yml b/.github/workflows/docker-application.yml index 49089cc44..407406041 100644 --- a/.github/workflows/docker-application.yml +++ b/.github/workflows/docker-application.yml @@ -27,7 +27,7 @@ jobs: build: name: Build & Push Backend Image runs-on: ubuntu-latest - # environment: ${{ inputs.environment || 'dev' }} # Commented out to use repo-level variables + environment: ${{ inputs.environment || 'integration' }} permissions: id-token: write contents: read diff --git a/.github/workflows/docker-mcp.yml b/.github/workflows/docker-mcp.yml index 1d995f362..f111351a9 100644 --- a/.github/workflows/docker-mcp.yml +++ b/.github/workflows/docker-mcp.yml @@ -27,7 +27,7 @@ jobs: build: name: Build & Push MCP Image runs-on: ubuntu-latest - # environment: 
${{ inputs.environment || 'dev' }} # Commented out to use repo-level variables + environment: ${{ inputs.environment || 'integration' }} permissions: id-token: write contents: read diff --git a/.github/workflows/infrastructure.yml b/.github/workflows/infrastructure.yml index 25f33238f..3ba7374ff 100644 --- a/.github/workflows/infrastructure.yml +++ b/.github/workflows/infrastructure.yml @@ -41,7 +41,7 @@ jobs: tf: name: Terraform Deployment runs-on: ubuntu-latest - # environment: removed to use repo-level variables + environment: ${{ inputs.environment }} if: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.iac-tool || 'tf') == 'tf' }} permissions: id-token: write @@ -65,13 +65,13 @@ jobs: - name: Terraform Setup uses: hashicorp/setup-terraform@v3 - - name: Sanitize branch name for state key + - name: Sanitize environment name for state key id: sanitize run: | # Replace / and other invalid chars with - for valid Azure blob name - BRANCH="${{ github.head_ref || github.ref_name }}" - SAFE_BRANCH=$(echo "$BRANCH" | sed 's/[^a-zA-Z0-9._-]/-/g') - echo "branch=$SAFE_BRANCH" >> $GITHUB_OUTPUT + ENV="${{ inputs.environment }}" + SAFE_ENV=$(echo "$ENV" | sed 's/[^a-zA-Z0-9._-]/-/g') + echo "env=$SAFE_ENV" >> $GITHUB_OUTPUT - name: Terraform Init/Plan/Apply id: terraform @@ -87,14 +87,14 @@ jobs: -backend-config="container_name=${TFSTATE_CONTAINER}" -backend-config="use_oidc=true" -backend-config="use_azuread_auth=true" terraform plan -out tfplan \ -var project_name=${{ github.event.repository.name }} \ - -var environment=${{ github.event_name == 'workflow_dispatch' && github.event.inputs.environment || (github.base_ref == 'main' && 'prod') || (github.base_ref == 'int-agentic' && 'integration') || 'dev' }} \ + -var environment=${{ inputs.environment }} \ -var tenant_id=${{ vars.AZURE_TENANT_ID }} \ -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} \ -var acr_name=${{ vars.ACR_NAME }} \ -var location=${{ vars.AZ_REGION }} \ -var docker_image_mcp=${{ 
vars.DOCKER_IMAGE_MCP }} \ -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} \ - -var iteration=${{ (github.event_name != 'workflow_dispatch' && github.base_ref != 'main' && github.base_ref != 'int-agentic') && '${GITHUB_SHA:0:7}' || vars.ITERATION }} + -var iteration=${{ vars.ITERATION }} terraform apply -auto-approve tfplan @@ -109,12 +109,12 @@ jobs: TFSTATE_RG: ${{ vars.TFSTATE_RG }} TFSTATE_ACCOUNT: ${{ vars.TFSTATE_ACCOUNT }} TFSTATE_CONTAINER: ${{ vars.TFSTATE_CONTAINER }} - # Use sanitized branch name for valid Azure blob name - TFSTATE_KEY: "${{ github.event.repository.name }}-${{ steps.sanitize.outputs.branch }}.tfstate" + # Use environment name for state key — each env gets its own TF state + TFSTATE_KEY: "${{ github.event.repository.name }}-${{ steps.sanitize.outputs.env }}.tfstate" bicep: runs-on: ubuntu-latest - # environment: removed to use repo-level variables + environment: ${{ inputs.environment }} if: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.iac-tool || 'tf') == 'bicep' }} permissions: id-token: write diff --git a/.github/workflows/orchestrate.yml b/.github/workflows/orchestrate.yml index 38c421497..ec09578fd 100644 --- a/.github/workflows/orchestrate.yml +++ b/.github/workflows/orchestrate.yml @@ -3,18 +3,22 @@ name: Orchestrate Deployment # ───────────────────────────────────────────────────────────────────── # Pipeline modes: # PR → main / int-agentic ➜ tests-only (validate against existing env) -# Push → main ➜ full deploy (deploy to prod after merge) -# Push → tjs-infra-as-code ➜ full deploy (dev, with auto-destroy) +# Push → main ➜ full deploy (deploy to production) +# Push → *-dev ➜ full deploy (deploy to integration-<name>) # Manual dispatch ➜ full deploy (chosen environment) +# +# Per-developer environments: +# Each developer pushes to their own <name>-dev branch. +# The pipeline maps <name>-dev → integration-<name> environment, +# which contains that developer's own Azure subscription credentials.
# ───────────────────────────────────────────────────────────────────── on: workflow_dispatch: inputs: target_env: - type: choice - description: Environment to deploy - options: [dev, test, prod] + type: string + description: "Environment to deploy (e.g. integration-james, production)" required: true pull_request: @@ -25,7 +29,7 @@ on: push: branches: - main - - tjs-infra-as-code + - '*-dev' permissions: contents: read @@ -51,19 +55,26 @@ jobs: if [ "$EVENT" = "workflow_dispatch" ]; then ENV="${{ inputs.target_env }}" elif [ "$EVENT" = "pull_request" ]; then + # PRs: resolve from the target (base) branch case "${{ github.base_ref }}" in - main) ENV="prod" ;; + main) ENV="production" ;; int-agentic) ENV="integration" ;; - *) ENV="dev" ;; + *) ENV="integration" ;; esac elif [ "$EVENT" = "push" ]; then - case "${{ github.ref_name }}" in - main) ENV="prod" ;; - tjs-infra-as-code) ENV="dev" ;; - *) ENV="dev" ;; + BRANCH="${{ github.ref_name }}" + case "$BRANCH" in + main) + ENV="production" ;; + *-dev) + # Extract developer name: james-dev → integration-james + DEV_NAME="${BRANCH%-dev}" + ENV="integration-${DEV_NAME}" ;; + *) + ENV="integration" ;; esac else - ENV="dev" + ENV="integration" fi # ── Resolve pipeline mode ── @@ -90,6 +101,7 @@ jobs: needs: pipeline-config if: needs.pipeline-config.outputs.full_deploy == 'true' runs-on: ubuntu-latest + environment: ${{ needs.pipeline-config.outputs.environment }} steps: - name: Azure OIDC Login uses: azure/login@v2 @@ -155,6 +167,7 @@ jobs: needs: pipeline-config if: needs.pipeline-config.outputs.full_deploy == 'false' runs-on: ubuntu-latest + environment: ${{ needs.pipeline-config.outputs.environment }} outputs: backend_endpoint: ${{ steps.lookup.outputs.backend_endpoint }} mcp_endpoint: ${{ steps.lookup.outputs.mcp_endpoint }} @@ -242,17 +255,7 @@ jobs: secrets: inherit # ──────────────────────────────────────────────────────────────────── - # Optional: Destroy infrastructure (dev branches only, after tests pass) + # 
NOTE: Auto-destroy is disabled. All environments (integration-* and + # production) persist their infrastructure. To tear down an environment + # manually, use: workflow_dispatch → destroy.yml with the target env. # ──────────────────────────────────────────────────────────────────── - destroy-infrastructure: - needs: [pipeline-config, integration-tests, agent-evaluation] - if: >- - always() - && needs.pipeline-config.outputs.full_deploy == 'true' - && needs.integration-tests.result == 'success' - && (needs.agent-evaluation.result == 'success' || needs.agent-evaluation.result == 'skipped' || needs.agent-evaluation.result == 'failure') - && (github.ref_name == 'tjs-infra-as-code' || github.ref_name == 'james-dev' || (inputs.target_env && inputs.target_env == 'dev')) - uses: ./.github/workflows/destroy.yml - with: - environment: ${{ needs.pipeline-config.outputs.environment }} - secrets: inherit diff --git a/.github/workflows/update-containers.yml b/.github/workflows/update-containers.yml index 51460d7ff..9137649c6 100644 --- a/.github/workflows/update-containers.yml +++ b/.github/workflows/update-containers.yml @@ -35,7 +35,7 @@ jobs: update-containers: name: Update Container Apps runs-on: ubuntu-latest - # environment: ${{ inputs.environment }} # Commented out to use repo-level variables + environment: ${{ inputs.environment }} permissions: id-token: write contents: read diff --git a/infra/GITHUB_ACTIONS_SETUP.md b/infra/GITHUB_ACTIONS_SETUP.md index 010b9bf67..a8178319a 100644 --- a/infra/GITHUB_ACTIONS_SETUP.md +++ b/infra/GITHUB_ACTIONS_SETUP.md @@ -7,7 +7,8 @@ This guide documents how to configure GitHub Actions for automated infrastructur The CI/CD pipeline uses: - **OIDC Authentication** - No secrets stored in GitHub, uses federated identity - **Remote Terraform State** - Shared state in Azure Storage for team collaboration -- **Environment-based Deployments** - Separate configs for dev, integration, prod +- **Per-developer GitHub Environments** - Each 
developer has their own `integration-<name>` environment backed by their own Azure subscription +- **Environment-scoped Variables** - All Azure credentials and config are stored per-environment, not at repo level ## Architecture @@ -17,6 +18,10 @@ The CI/CD pipeline uses: ├─────────────────────────────────────────────────────────────────────┤ │ orchestrate.yml │ │ ├── pipeline-config (determine mode + environment) │ +│ │ ├── main branch → production environment │ +│ │ ├── james-dev branch → integration-james environment │ +│ │ ├── nicole-dev branch → integration-nicole environment │ +│ │ └── <name>-dev branch → integration-<name> environment │ │ │ │ │ ├── [Full Deploy – push/manual] │ │ │ ├── preflight (enable storage access) │ @@ -25,8 +30,7 @@ The CI/CD pipeline uses: │ │ ├── docker-mcp.yml (build MCP service image) │ │ │ ├── update-containers.yml (refresh running apps) │ │ │ ├── integration-tests.yml (smoke tests) │ -│ │ ├── agent-evaluation.yml (AI quality evaluation) │ -│ │ └── destroy.yml (optional cleanup, dev only) │ +│ │ └── agent-evaluation.yml (AI quality evaluation) │ │ │ │ │ ├── [Tests Only – pull requests] │ │ │ └── resolve-endpoints (az containerapp show) │ @@ -37,10 +41,9 @@ The CI/CD pipeline uses: │ OIDC (no secrets) ▼ ┌─────────────────────────────────────────────────────────────────────┐ -│ Azure │ +│ Azure (per developer subscription) │ ├─────────────────────────────────────────────────────────────────────┤ -│ ├── App Registration (GitHub-Actions-OpenAIWorkshop) │ -│ │ └── Federated Credentials (main, int-agentic, PRs) │ +│ ├── App Registration (federated credential for environment) │ │ ├── Storage Account (Terraform state) │ │ ├── Container Registry (Docker images) │ │ ├── Container Apps (MCP + Backend) │ @@ -90,54 +93,49 @@ Write-Host "Subscription ID: $SubscriptionId" ## Step 2: Configure Federated Credentials -Create federated credentials for each branch/environment. +Create federated credentials for the GitHub environment that maps to this developer.
-> **Important:** GitHub org/repos that have a [customized OIDC subject claim template](https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/about-security-hardening-with-openid-connect#customizing-the-subject-claims-for-an-organization-or-repository) -> use a numeric subject format: `repository_owner_id::repository_id::...`. -> You can find these IDs via `gh api repos/{owner}/{repo} --jq '.owner.id, .id'`. -> If your org has NOT customized the template, use the default `repo:ORG/REPO:...` format. +> **Important:** This repo uses a [customized OIDC subject claim template](https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/about-security-hardening-with-openid-connect#customizing-the-subject-claims-for-an-organization-or-repository) +> with `repository_owner_id` and `repository_id` instead of the default `repo:ORG/REPO:...` format. +> All CI jobs bind an `environment:` context, so the OIDC subject includes `environment:<name>`. ```powershell $AppId = "YOUR_APP_ID" # From Step 1 -# --- Option A: Default subject format --- -# Main branch (prod) +# ── Per-developer integration environment ── +# Replace <name> with your developer name (e.g., james, nicole, tim) +# The subject must exactly match what GitHub presents in the OIDC token. az ad app federated-credential create --id $AppId --parameters '{ - "name": "github-main", + "name": "github-env-integration-<name>", "issuer": "https://token.actions.githubusercontent.com", - "subject": "repo:YOUR_ORG/YOUR_REPO:ref:refs/heads/main", + "subject": "repository_owner_id:6154722:repository_id:605201834:environment:integration-<name>", "audiences": ["api://AzureADTokenExchange"] }' -# --- Option B: Customized (numeric ID) subject format --- -# Use this if your org has customized the OIDC subject claim template. -# Replace OWNER_ID and REPO_ID with actual values from the GitHub API.
- -# Main branch (prod) -az ad app federated-credential create --id $AppId --parameters '{ - "name": "github-main", - "issuer": "https://token.actions.githubusercontent.com", - "subject": "repository_owner_id:OWNER_ID:repository_id:REPO_ID:ref:refs/heads/main", - "audiences": ["api://AzureADTokenExchange"] -}' - -# Integration branch +# ── Production environment (only needed for the prod subscription owner) ── az ad app federated-credential create --id $AppId --parameters '{ - "name": "github-int-agentic", + "name": "github-env-production", "issuer": "https://token.actions.githubusercontent.com", - "subject": "repository_owner_id:OWNER_ID:repository_id:REPO_ID:ref:refs/heads/int-agentic", + "subject": "repository_owner_id:6154722:repository_id:605201834:environment:production", "audiences": ["api://AzureADTokenExchange"] }' -# Pull Requests +# ── Pull Requests (for PR validation against existing env) ── +# Note: PR jobs also bind environment:<name>, so the subject includes it. +# You may need a credential for the PR context too if your PRs run OIDC.
az ad app federated-credential create --id $AppId --parameters '{ "name": "github-pullrequests", "issuer": "https://token.actions.githubusercontent.com", - "subject": "repository_owner_id:OWNER_ID:repository_id:REPO_ID:pull_request", + "subject": "repository_owner_id:6154722:repository_id:605201834:pull_request", "audiences": ["api://AzureADTokenExchange"] }' ``` +> **How to find your IDs:** +> - Owner ID: `gh api repos/microsoft/OpenAIWorkshop --jq '.owner.id'` → `6154722` +> - Repo ID: `gh api repos/microsoft/OpenAIWorkshop --jq '.id'` → `605201834` +> - Check current OIDC template: `gh api repos/microsoft/OpenAIWorkshop/actions/oidc/customization/sub` + ## Step 3: Assign Azure Roles ```powershell @@ -224,36 +222,44 @@ az role assignment create ` --scope $STORAGE_ID ``` -## Step 5: Configure GitHub Repository Variables +## Step 5: Configure GitHub Environment Variables + +All variables are stored at the **environment level** (not repo level). Each developer's +`integration-<name>` environment contains their own Azure subscription credentials.
-Go to **GitHub → Repository → Settings → Secrets and Variables → Actions → Variables** +Go to **GitHub → Repository → Settings → Environments → `integration-<name>` → Environment variables** -### Required Variables +### Required Variables (per environment) | Variable | Description | Example Value | |----------|-------------|---------------| -| `AZURE_CLIENT_ID` | App Registration Client ID | `1d34c51d-9d49-48f3-9e48-6a0f099c5f03` | -| `AZURE_TENANT_ID` | Azure AD Tenant ID | `0fbe7234-45ea-498b-b7e4-1a8b2d3be4d9` | -| `AZURE_SUBSCRIPTION_ID` | Azure Subscription ID | `840b5c5c-3f4a-459a-94fc-6bad2a969f9d` | +| `AZURE_CLIENT_ID` | App Registration Client ID | `1d34c51d-...` | +| `AZURE_TENANT_ID` | Azure AD Tenant ID | `0fbe7234-...` | +| `AZURE_SUBSCRIPTION_ID` | Azure Subscription ID | `840b5c5c-...` | | `TFSTATE_RG` | Resource group for TF state | `rg-tfstate` | -| `TFSTATE_ACCOUNT` | Storage account name | `sttfstateoaiworkshop` | +| `TFSTATE_ACCOUNT` | Storage account name (globally unique) | `sttfstateoaiworkshop` | | `TFSTATE_CONTAINER` | Blob container name | `tfstate` | -| `ACR_NAME` | Azure Container Registry name | `acropenaiworkshop002` | -| `PROJECT_NAME` | Project identifier | `OpenAIWorkshop` | +| `ACR_NAME` | Azure Container Registry name | `OpenAIWorkshopdevacr002` | +| `PROJECT_NAME` | Project identifier | `openaiworkshop` | | `ITERATION` | Deployment iteration | `002` | | `AZ_REGION` | Azure region | `eastus2` | -| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry project endpoint for evaluation | `https://eastus2oai.services.ai.azure.com/api/projects/eastus2` | -| `AZURE_OPENAI_EVAL_ENDPOINT` | AI Services endpoint for judge models | `https://eastus2oai.services.ai.azure.com/` | +| `DOCKER_IMAGE_MCP` | MCP Docker image name | `mcp-service` | +| `DOCKER_IMAGE_BACKEND` | Backend Docker image name | `backend-service` | +| `REGISTRY_LOGIN_SERVER` | Container registry server | `docker.io` | +| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry project endpoint for evaluation
| `https://...services.ai.azure.com/api/projects/...` | +| `AZURE_OPENAI_EVAL_ENDPOINT` | AI Services endpoint for judge models | `https://...services.ai.azure.com/` | | `AZURE_OPENAI_EVAL_DEPLOYMENT` | Model deployment for LLM-as-judge | `gpt-5.2` | -### Optional Environment-Specific Variables +### Current Environments -Create GitHub Environments (`dev`, `integration`, `prod`) for environment-specific overrides: - -| Environment | Variable | Value | -|-------------|----------|-------| -| `prod` | `AZ_REGION` | `eastus` | -| `prod` | `ITERATION` | `001` | +| Environment | Owner | Branch Mapping | +|-------------|-------|----------------| +| `production` | James | `main` | +| `integration-james` | James | `james-dev` | +| `integration-nicole` | Nicole | `nicole-dev` | +| `integration-heena` | Heena | `heena-dev` | +| `integration-tim` | Tim | `tim-dev` | +| `integration-matt` | Matt | `matt-dev` | --- @@ -263,10 +269,10 @@ The orchestrator has two modes determined by the trigger: | Trigger | Mode | What runs | Environment | |---------|------|-----------|-------------| -| **PR → `main`** | Tests only | `resolve-endpoints` → `integration-tests` | `prod` | +| **PR → `main`** | Tests only | `resolve-endpoints` → `integration-tests` | `production` | | **PR → `int-agentic`** | Tests only | `resolve-endpoints` → `integration-tests` | `integration` | -| **Push to `main`** (after merge) | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | `prod` | -| **Push to `tjs-infra-as-code`** | Full deploy | Preflight → Infra → Build → Update → Tests → Eval → Destroy | `dev` | +| **Push to `main`** (after merge) | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | `production` | +| **Push to `<name>-dev`** | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | `integration-<name>` | | **Manual dispatch** | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | Chosen env | ### Tests-Only Mode (PRs) @@ -288,23 +294,42 @@ environment.
| Workflow | Trigger | What it does | |----------|---------|--------------| -| `orchestrate.yml` | PRs, push to main/tjs-infra-as-code, manual | Orchestrates full or tests-only pipeline | +| `orchestrate.yml` | PRs, push to main/*-dev, manual | Orchestrates full or tests-only pipeline | | `infrastructure.yml` | Called by orchestrate (full deploy) | Terraform plan/apply | | `docker-application.yml` | Called by orchestrate (full deploy) | Build backend container | | `docker-mcp.yml` | Called by orchestrate (full deploy) | Build MCP container | | `update-containers.yml` | Called by orchestrate (full deploy) | Refresh Container Apps | -| `destroy.yml` | Called by orchestrate (dev only) | Terraform destroy | +| `destroy.yml` | Manual dispatch only | Terraform destroy | | `agent-evaluation.yml` | Called by orchestrate (full deploy) | AI quality evaluation via Azure AI Foundry | | `integration-tests.yml` | Called by orchestrate (both modes) | Run pytest integration tests | ## Branch to Environment Mapping -| Branch | Environment | Auto-destroy | -|--------|-------------|--------------| -| `main` | `prod` | ❌ No | -| `int-agentic` | `integration` | ❌ No | -| `tjs-infra-as-code` | `dev` | ✅ Yes | -| Other branches | `dev` | Depends on config | +| Branch | Environment | Persistent | +|--------|-------------|------------| +| `main` | `production` | ✅ Yes | +| `james-dev` | `integration-james` | ✅ Yes | +| `nicole-dev` | `integration-nicole` | ✅ Yes | +| `heena-dev` | `integration-heena` | ✅ Yes | +| `tim-dev` | `integration-tim` | ✅ Yes | +| `matt-dev` | `integration-matt` | ✅ Yes | +| `<name>-dev` | `integration-<name>` | ✅ Yes | + +> All environments persist their infrastructure. To tear down manually, use +> `workflow_dispatch` → `destroy.yml` with the target environment. + +--- + +## Developer Onboarding + +To add a new developer to the pipeline: + +1. **Create an Azure App Registration** in the developer's own Azure tenant (Step 1 above) +2.
**Add a federated credential** with subject `repository_owner_id:6154722:repository_id:605201834:environment:integration-<name>` (Step 2 above) +3. **Assign Azure roles** to the service principal (Steps 3 and 3b above) +4. **Create TF state storage** in the developer's subscription (Step 4 above) +5. **Ask a repo admin** to create the `integration-<name>` GitHub Environment and set the 16 environment variables (Step 5 above) +6. **Developer pushes to `<name>-dev`** branch — the pipeline will pick up the environment automatically --- diff --git a/infra/terraform/prod.tfvars b/infra/terraform/prod.tfvars new file mode 100644 index 000000000..8ee8b17d5 --- /dev/null +++ b/infra/terraform/prod.tfvars @@ -0,0 +1,34 @@ +# Production environment configuration +environment = "production" +location = "eastus2" +project_name = "OpenAIWorkshop" +iteration = "002" +tenant_id = "0fbe7234-45ea-498b-b7e4-1a8b2d3be4d9" +subscription_id = "840b5c5c-3f4a-459a-94fc-6bad2a969f9d" + +# Optional: Set to false if you want to use API keys (not recommended) +use_cosmos_managed_identity = true + +# OpenAI deployment configuration +create_openai_deployment = true +openai_deployment_name = "gpt-5.2-chat" +openai_model_name = "gpt-5.2-chat" +openai_model_version = "2025-12-11" +openai_api_version = "2025-04-01-preview" +openai_deployment_capacity = 200 # 200k tokens/minute + +# OpenAI embedding deployment configuration +create_openai_embedding_deployment = true +openai_embedding_deployment_name = "text-embedding-ada-002" +openai_embedding_model_name = "text-embedding-ada-002" +openai_embedding_model_version = "2" + +# Networking configuration +enable_networking = true +enable_private_endpoint = true +vnet_address_prefix = "10.10.0.0/16" +container_apps_subnet_prefix = "10.10.0.0/23" +private_endpoint_subnet_prefix = "10.10.2.0/24" + +# MCP Service Security +mcp_internal_only = true From 055915b110813295ae1374fe0c9d75ea517962cc Mon Sep 17 00:00:00 2001 From: "James N."
Date: Fri, 13 Feb 2026 08:34:00 -0800 Subject: [PATCH 2/4] fix: strip hyphens from ACR name in Docker workflows --- .github/workflows/docker-application.yml | 3 ++- .github/workflows/docker-mcp.yml | 3 ++- .github/workflows/update-containers.yml | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker-application.yml b/.github/workflows/docker-application.yml index 407406041..3f1ab122a 100644 --- a/.github/workflows/docker-application.yml +++ b/.github/workflows/docker-application.yml @@ -46,10 +46,11 @@ jobs: id: acr run: | # Construct ACR name matching Terraform pattern: {project}{env}acr{iteration} + # ACR names must be alphanumeric — strip hyphens to match Terraform's replace("-", "") PROJECT="${{ vars.PROJECT_NAME || 'OpenAIWorkshop' }}" ENV="${{ inputs.environment || 'dev' }}" ITERATION="${{ vars.ITERATION || '002' }}" - ACR_NAME="${PROJECT}${ENV}acr${ITERATION}" + ACR_NAME=$(echo "${PROJECT}${ENV}acr${ITERATION}" | tr -d '-') echo "name=${ACR_NAME}" >> $GITHUB_OUTPUT echo "server=${ACR_NAME}.azurecr.io" >> $GITHUB_OUTPUT echo "Using ACR: ${ACR_NAME}" diff --git a/.github/workflows/docker-mcp.yml b/.github/workflows/docker-mcp.yml index f111351a9..3242142a8 100644 --- a/.github/workflows/docker-mcp.yml +++ b/.github/workflows/docker-mcp.yml @@ -46,10 +46,11 @@ jobs: id: acr run: | # Construct ACR name matching Terraform pattern: {project}{env}acr{iteration} + # ACR names must be alphanumeric — strip hyphens to match Terraform's replace("-", "") PROJECT="${{ vars.PROJECT_NAME || 'OpenAIWorkshop' }}" ENV="${{ inputs.environment || 'dev' }}" ITERATION="${{ vars.ITERATION || '002' }}" - ACR_NAME="${PROJECT}${ENV}acr${ITERATION}" + ACR_NAME=$(echo "${PROJECT}${ENV}acr${ITERATION}" | tr -d '-') echo "name=${ACR_NAME}" >> $GITHUB_OUTPUT echo "server=${ACR_NAME}.azurecr.io" >> $GITHUB_OUTPUT echo "Using ACR: ${ACR_NAME}" diff --git a/.github/workflows/update-containers.yml b/.github/workflows/update-containers.yml index 
9137649c6..294d3aee7 100644 --- a/.github/workflows/update-containers.yml +++ b/.github/workflows/update-containers.yml @@ -64,7 +64,8 @@ jobs: echo "backend_app=ca-be-${ITERATION}" >> $GITHUB_OUTPUT # ACR name follows Terraform pattern: {project}{env}acr{iteration} - ACR_NAME="${PROJECT}${ENV}acr${ITERATION}" + # ACR names must be alphanumeric — strip hyphens to match Terraform's replace("-", "") + ACR_NAME=$(echo "${PROJECT}${ENV}acr${ITERATION}" | tr -d '-') echo "acr_name=${ACR_NAME}" >> $GITHUB_OUTPUT echo "acr_server=${ACR_NAME}.azurecr.io" >> $GITHUB_OUTPUT echo "Using ACR: ${ACR_NAME}" From 0f10403dd1353584d10db7c3dcdd21510762bdd1 Mon Sep 17 00:00:00 2001 From: "James N." Date: Fri, 13 Feb 2026 10:15:58 -0800 Subject: [PATCH 3/4] Add auto-import recovery for Terraform 'already exists' errors When a Terraform apply fails midway (e.g., timeout, quota), resources may exist in Azure but not in TF state. On retry, Terraform fails with 'already exists'. This change adds a retry loop (max 3 attempts) that: 1. Detects 'already exists' errors in apply output 2. Parses the TF resource address and Azure resource ID 3. Auto-imports orphaned resources into state 4. Retries the apply Eliminates need for manual deletion via Azure Portal. 
--- .github/workflows/infrastructure.yml | 74 +++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/.github/workflows/infrastructure.yml b/.github/workflows/infrastructure.yml index 3ba7374ff..00a4f579b 100644 --- a/.github/workflows/infrastructure.yml +++ b/.github/workflows/infrastructure.yml @@ -82,21 +82,73 @@ jobs: export ARM_TENANT_ID="${{ vars.AZURE_TENANT_ID }}" export ARM_SUBSCRIPTION_ID="${{ vars.AZURE_SUBSCRIPTION_ID }}" + # Common -var flags used by plan and import + TF_VARS=( + -var project_name=${{ github.event.repository.name }} + -var environment=${{ inputs.environment }} + -var tenant_id=${{ vars.AZURE_TENANT_ID }} + -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} + -var acr_name=${{ vars.ACR_NAME }} + -var location=${{ vars.AZ_REGION }} + -var docker_image_mcp=${{ vars.DOCKER_IMAGE_MCP }} + -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} + -var iteration=${{ vars.ITERATION }} + ) + terraform init -backend-config="resource_group_name=${TFSTATE_RG}" \ -backend-config="key=${TFSTATE_KEY}" -backend-config="storage_account_name=${TFSTATE_ACCOUNT}" \ -backend-config="container_name=${TFSTATE_CONTAINER}" -backend-config="use_oidc=true" -backend-config="use_azuread_auth=true" - terraform plan -out tfplan \ - -var project_name=${{ github.event.repository.name }} \ - -var environment=${{ inputs.environment }} \ - -var tenant_id=${{ vars.AZURE_TENANT_ID }} \ - -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} \ - -var acr_name=${{ vars.ACR_NAME }} \ - -var location=${{ vars.AZ_REGION }} \ - -var docker_image_mcp=${{ vars.DOCKER_IMAGE_MCP }} \ - -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} \ - -var iteration=${{ vars.ITERATION }} - terraform apply -auto-approve tfplan + # ── Apply with auto-import on "already exists" errors ── + # If a prior run partially created resources but crashed before recording + # them in state, Terraform will fail with "already exists". 
This loop + # detects those errors, auto-imports the orphaned resources, and retries. + MAX_ATTEMPTS=3 + for attempt in $(seq 1 $MAX_ATTEMPTS); do + echo "🔄 Terraform apply attempt $attempt/$MAX_ATTEMPTS" + + terraform plan -out tfplan "${TF_VARS[@]}" + + if terraform apply -auto-approve tfplan 2>&1 | tee /tmp/tf_apply.log; then + echo "✅ Terraform apply succeeded" + break + fi + + # Check if the failure is due to "already exists" errors + if ! grep -q "already exists" /tmp/tf_apply.log; then + echo "❌ Terraform failed with a non-import error" + cat /tmp/tf_apply.log + exit 1 + fi + + if [ "$attempt" -eq "$MAX_ATTEMPTS" ]; then + echo "❌ Terraform failed after $MAX_ATTEMPTS attempts" + cat /tmp/tf_apply.log + exit 1 + fi + + echo "⚠️ Detected 'already exists' errors — auto-importing orphaned resources..." + + # Parse error output: extract terraform address and Azure resource ID pairs + # Error format: with azurerm_container_app.mcp, + # followed by: a resource with the ID "/.../containerApps/ca-mcp-002" already exists + while IFS= read -r line; do + # Extract the TF resource address (e.g. azurerm_container_app.mcp) + tf_addr=$(echo "$line" | grep -oP 'with \K[a-zA-Z0-9_.]+(?=,)') + # Extract the Azure resource ID + azure_id=$(echo "$line" | grep -oP 'the ID "\K[^"]+') + + if [ -n "$tf_addr" ] && [ -n "$azure_id" ]; then + echo " 📥 Importing $tf_addr → $azure_id" + terraform import "${TF_VARS[@]}" "$tf_addr" "$azure_id" || true + fi + done < <( + # Combine consecutive lines so address + ID are on the same logical line + cat /tmp/tf_apply.log | tr '\n' '§' | sed 's/§│/│/g' | tr '§' '\n' | grep "already exists" + ) + + echo "🔁 Retrying terraform apply..." + done output=$(terraform output -raw openai_endpoint 2>/dev/null || true) echo "MODEL_ENDPOINT=$output" >> $GITHUB_OUTPUT From 1a93fc32fb28ada132d857de70112110a79de236 Mon Sep 17 00:00:00 2001 From: "James N." 
Date: Fri, 13 Feb 2026 10:18:32 -0800 Subject: [PATCH 4/4] Rename workflow to CI/CD Pipeline; fix PR trigger for int-agentic - Rename 'Orchestrate Deployment' -> 'CI/CD Pipeline' - Remove int-agentic from pull_request trigger PRs to int-agentic were failing because environment 'integration' has no OIDC federated credential. PR validation only needed for main (production gate). - Simplify base_ref case statement --- .github/workflows/orchestrate.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/orchestrate.yml b/.github/workflows/orchestrate.yml index ec09578fd..93716859d 100644 --- a/.github/workflows/orchestrate.yml +++ b/.github/workflows/orchestrate.yml @@ -1,4 +1,4 @@ -name: Orchestrate Deployment +name: CI/CD Pipeline # ───────────────────────────────────────────────────────────────────── # Pipeline modes: @@ -24,7 +24,6 @@ on: pull_request: branches: - main - - int-agentic push: branches: @@ -58,7 +57,6 @@ jobs: # PRs: resolve from the target (base) branch case "${{ github.base_ref }}" in main) ENV="production" ;; - int-agentic) ENV="integration" ;; *) ENV="integration" ;; esac elif [ "$EVENT" = "push" ]; then