diff --git a/.github/workflows/agent-evaluation.yml b/.github/workflows/agent-evaluation.yml index 952507cb..725fd44c 100644 --- a/.github/workflows/agent-evaluation.yml +++ b/.github/workflows/agent-evaluation.yml @@ -54,6 +54,7 @@ jobs: agent-evaluation: name: Agent Quality Evaluation runs-on: ubuntu-latest + environment: ${{ inputs.environment || 'integration' }} permissions: contents: read id-token: write # Needed for OIDC → DefaultAzureCredential diff --git a/.github/workflows/destroy.yml b/.github/workflows/destroy.yml index a47111ce..38bfb6a0 100644 --- a/.github/workflows/destroy.yml +++ b/.github/workflows/destroy.yml @@ -30,7 +30,7 @@ jobs: terraform_destroy: name: Terraform Destroy runs-on: ubuntu-latest - # environment: ${{ inputs.environment || 'dev' }} # Commented out to use repo-level variables + environment: ${{ inputs.environment || 'integration' }} permissions: id-token: write contents: read @@ -66,13 +66,14 @@ jobs: -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} \ -var acr_name=${{ vars.ACR_NAME }} \ -var location=${{ vars.AZ_REGION }} \ - -var environment=${{ inputs.environment || 'dev' }} \ + -var environment=${{ inputs.environment || 'integration' }} \ -var docker_image_mcp=${{ vars.DOCKER_IMAGE_MCP }} \ -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} \ - -var iteration=${{ inputs.environment || 'dev' }} + -var iteration=${{ vars.ITERATION }} env: TFSTATE_RG: ${{ vars.TFSTATE_RG }} TFSTATE_ACCOUNT: ${{ vars.TFSTATE_ACCOUNT }} TFSTATE_CONTAINER: ${{ vars.TFSTATE_CONTAINER }} - TFSTATE_KEY: "${{ github.event.repository.name }}-${{ github.ref_name }}.tfstate" + # Use environment name for state key — must match infrastructure.yml + TFSTATE_KEY: "${{ github.event.repository.name }}-${{ inputs.environment || 'integration' }}.tfstate" diff --git a/.github/workflows/docker-application.yml b/.github/workflows/docker-application.yml index 49089cc4..3f1ab122 100644 --- a/.github/workflows/docker-application.yml +++ 
b/.github/workflows/docker-application.yml @@ -27,7 +27,7 @@ jobs: build: name: Build & Push Backend Image runs-on: ubuntu-latest - # environment: ${{ inputs.environment || 'dev' }} # Commented out to use repo-level variables + environment: ${{ inputs.environment || 'integration' }} permissions: id-token: write contents: read @@ -46,10 +46,11 @@ jobs: id: acr run: | # Construct ACR name matching Terraform pattern: {project}{env}acr{iteration} + # ACR names must be alphanumeric — strip hyphens to match Terraform's replace("-", "") PROJECT="${{ vars.PROJECT_NAME || 'OpenAIWorkshop' }}" - ENV="${{ inputs.environment || 'dev' }}" + ENV="${{ inputs.environment || 'integration' }}" ITERATION="${{ vars.ITERATION || '002' }}" - ACR_NAME="${PROJECT}${ENV}acr${ITERATION}" + ACR_NAME=$(echo "${PROJECT}${ENV}acr${ITERATION}" | tr -d '-') echo "name=${ACR_NAME}" >> $GITHUB_OUTPUT echo "server=${ACR_NAME}.azurecr.io" >> $GITHUB_OUTPUT echo "Using ACR: ${ACR_NAME}" diff --git a/.github/workflows/docker-mcp.yml b/.github/workflows/docker-mcp.yml index 1d995f36..3242142a 100644 --- a/.github/workflows/docker-mcp.yml +++ b/.github/workflows/docker-mcp.yml @@ -27,7 +27,7 @@ jobs: build: name: Build & Push MCP Image runs-on: ubuntu-latest - # environment: ${{ inputs.environment || 'dev' }} # Commented out to use repo-level variables + environment: ${{ inputs.environment || 'integration' }} permissions: id-token: write contents: read @@ -46,10 +46,11 @@ jobs: id: acr run: | # Construct ACR name matching Terraform pattern: {project}{env}acr{iteration} + # ACR names must be alphanumeric — strip hyphens to match Terraform's replace("-", "") PROJECT="${{ vars.PROJECT_NAME || 'OpenAIWorkshop' }}" - ENV="${{ inputs.environment || 'dev' }}" + ENV="${{ inputs.environment || 'integration' }}" ITERATION="${{ vars.ITERATION || '002' }}" - ACR_NAME="${PROJECT}${ENV}acr${ITERATION}" + ACR_NAME=$(echo "${PROJECT}${ENV}acr${ITERATION}" | tr -d '-') echo "name=${ACR_NAME}" >> $GITHUB_OUTPUT echo "server=${ACR_NAME}.azurecr.io" >> $GITHUB_OUTPUT echo "Using ACR: ${ACR_NAME}" diff --git 
a/.github/workflows/infrastructure.yml b/.github/workflows/infrastructure.yml index 25f33238..00a4f579 100644 --- a/.github/workflows/infrastructure.yml +++ b/.github/workflows/infrastructure.yml @@ -41,7 +41,7 @@ jobs: tf: name: Terraform Deployment runs-on: ubuntu-latest - # environment: removed to use repo-level variables + environment: ${{ inputs.environment }} if: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.iac-tool || 'tf') == 'tf' }} permissions: id-token: write @@ -65,13 +65,13 @@ jobs: - name: Terraform Setup uses: hashicorp/setup-terraform@v3 - - name: Sanitize branch name for state key + - name: Sanitize environment name for state key id: sanitize run: | # Replace / and other invalid chars with - for valid Azure blob name - BRANCH="${{ github.head_ref || github.ref_name }}" - SAFE_BRANCH=$(echo "$BRANCH" | sed 's/[^a-zA-Z0-9._-]/-/g') - echo "branch=$SAFE_BRANCH" >> $GITHUB_OUTPUT + ENV="${{ inputs.environment }}" + SAFE_ENV=$(echo "$ENV" | sed 's/[^a-zA-Z0-9._-]/-/g') + echo "env=$SAFE_ENV" >> $GITHUB_OUTPUT - name: Terraform Init/Plan/Apply id: terraform @@ -82,21 +82,73 @@ jobs: export ARM_TENANT_ID="${{ vars.AZURE_TENANT_ID }}" export ARM_SUBSCRIPTION_ID="${{ vars.AZURE_SUBSCRIPTION_ID }}" + # Common -var flags used by plan and import + TF_VARS=( + -var project_name=${{ github.event.repository.name }} + -var environment=${{ inputs.environment }} + -var tenant_id=${{ vars.AZURE_TENANT_ID }} + -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} + -var acr_name=${{ vars.ACR_NAME }} + -var location=${{ vars.AZ_REGION }} + -var docker_image_mcp=${{ vars.DOCKER_IMAGE_MCP }} + -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} + -var iteration=${{ vars.ITERATION }} + ) + terraform init -backend-config="resource_group_name=${TFSTATE_RG}" \ -backend-config="key=${TFSTATE_KEY}" -backend-config="storage_account_name=${TFSTATE_ACCOUNT}" \ -backend-config="container_name=${TFSTATE_CONTAINER}" -backend-config="use_oidc=true" 
-backend-config="use_azuread_auth=true" - terraform plan -out tfplan \ - -var project_name=${{ github.event.repository.name }} \ - -var environment=${{ github.event_name == 'workflow_dispatch' && github.event.inputs.environment || (github.base_ref == 'main' && 'prod') || (github.base_ref == 'int-agentic' && 'integration') || 'dev' }} \ - -var tenant_id=${{ vars.AZURE_TENANT_ID }} \ - -var subscription_id=${{ vars.AZURE_SUBSCRIPTION_ID }} \ - -var acr_name=${{ vars.ACR_NAME }} \ - -var location=${{ vars.AZ_REGION }} \ - -var docker_image_mcp=${{ vars.DOCKER_IMAGE_MCP }} \ - -var docker_image_backend=${{ vars.DOCKER_IMAGE_BACKEND }} \ - -var iteration=${{ (github.event_name != 'workflow_dispatch' && github.base_ref != 'main' && github.base_ref != 'int-agentic') && '${GITHUB_SHA:0:7}' || vars.ITERATION }} - - terraform apply -auto-approve tfplan + + # ── Apply with auto-import on "already exists" errors ── + # If a prior run partially created resources but crashed before recording + # them in state, Terraform will fail with "already exists". This loop + # detects those errors, auto-imports the orphaned resources, and retries. + MAX_ATTEMPTS=3 + for attempt in $(seq 1 $MAX_ATTEMPTS); do + echo "🔄 Terraform apply attempt $attempt/$MAX_ATTEMPTS" + + terraform plan -out tfplan "${TF_VARS[@]}" + # tee always exits 0 and would mask a failed apply when pipefail is off, so test terraform's own status via PIPESTATUS + if terraform apply -auto-approve tfplan 2>&1 | tee /tmp/tf_apply.log; [ "${PIPESTATUS[0]}" -eq 0 ]; then + echo "✅ Terraform apply succeeded" + break + fi + + # Check if the failure is due to "already exists" errors + if ! grep -q "already exists" /tmp/tf_apply.log; then + echo "❌ Terraform failed with a non-import error" + cat /tmp/tf_apply.log + exit 1 + fi + + if [ "$attempt" -eq "$MAX_ATTEMPTS" ]; then + echo "❌ Terraform failed after $MAX_ATTEMPTS attempts" + cat /tmp/tf_apply.log + exit 1 + fi + + echo "⚠️ Detected 'already exists' errors — auto-importing orphaned resources..." 
+ + # Parse error output: extract terraform address and Azure resource ID pairs + # Error format: with azurerm_container_app.mcp, + # followed by: a resource with the ID "/.../containerApps/ca-mcp-002" already exists + while IFS= read -r line; do + # Extract the TF resource address (e.g. azurerm_container_app.mcp); "|| true" keeps bash -e from aborting the step when a line has no match + tf_addr=$(echo "$line" | grep -oP 'with \K[a-zA-Z0-9_.]+(?=,)' || true) + # Extract the Azure resource ID + azure_id=$(echo "$line" | grep -oP 'the ID "\K[^"]+' || true) + + if [ -n "$tf_addr" ] && [ -n "$azure_id" ]; then + echo " 📥 Importing $tf_addr → $azure_id" + terraform import "${TF_VARS[@]}" "$tf_addr" "$azure_id" || true + fi + done < <( + # Combine consecutive lines so address + ID are on the same logical line (single-byte \001 delimiter: GNU tr is byte-oriented and cannot map to a multi-byte character) + cat /tmp/tf_apply.log | tr '\n' '\001' | sed 's/\x01│/│/g' | tr '\001' '\n' | grep "already exists" + ) + + echo "🔁 Retrying terraform apply..." + done output=$(terraform output -raw openai_endpoint 2>/dev/null || true) echo "MODEL_ENDPOINT=$output" >> $GITHUB_OUTPUT @@ -109,12 +161,12 @@ jobs: TFSTATE_RG: ${{ vars.TFSTATE_RG }} TFSTATE_ACCOUNT: ${{ vars.TFSTATE_ACCOUNT }} TFSTATE_CONTAINER: ${{ vars.TFSTATE_CONTAINER }} - # Use sanitized branch name for valid Azure blob name - TFSTATE_KEY: "${{ github.event.repository.name }}-${{ steps.sanitize.outputs.branch }}.tfstate" + # Use environment name for state key — each env gets its own TF state + TFSTATE_KEY: "${{ github.event.repository.name }}-${{ steps.sanitize.outputs.env }}.tfstate" bicep: runs-on: ubuntu-latest - # environment: removed to use repo-level variables + environment: ${{ inputs.environment }} if: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.iac-tool || 'tf') == 'bicep' }} permissions: id-token: write diff --git a/.github/workflows/orchestrate.yml b/.github/workflows/orchestrate.yml index 00eba75a..ec08ff61 100644 --- a/.github/workflows/orchestrate.yml +++ b/.github/workflows/orchestrate.yml @@ -1,34 +1,53 @@ -name: Orchestrate Deployment +name: CI/CD Pipeline # 
───────────────────────────────────────────────────────────────────── # Pipeline modes: -# PR → main / int-agentic ➜ tests-only (validate against existing env) -# Push → main ➜ full deploy (deploy to prod after merge) -# Push → tjs-infra-as-code ➜ full deploy (dev, with auto-destroy) +# PR → main ➜ tests-only (validate against existing env) +# Push → main ➜ full deploy (deploy to production) +# Push → *-dev ➜ full deploy (deploy to integration-<name>) # Manual dispatch ➜ full deploy (chosen environment) +# +# Promotion flow: +# *-dev push → full pipeline → auto-merge PR to int-agentic +# int-agentic push → promote-to-main.yml → creates/updates PR to main +# main PR → human review + tests-only → merge → full deploy to production +# +# Per-developer environments: +# Each developer pushes to their own <name>-dev branch. +# The pipeline maps <name>-dev → integration-<name> environment, +# which contains that developer's own Azure subscription credentials. # ───────────────────────────────────────────────────────────────────── on: workflow_dispatch: inputs: target_env: - type: choice - description: Environment to deploy - options: [dev, test, prod] + type: string + description: "Environment to deploy (e.g. 
integration-james, production)" required: true pull_request: branches: - main - - int-agentic + paths-ignore: + - '**/*.md' + - 'docs/**' + - 'LICENSE' + - '.github/workflows/readme.md' push: branches: - main - - tjs-infra-as-code + - '*-dev' + paths-ignore: + - '**/*.md' + - 'docs/**' + - 'LICENSE' + - '.github/workflows/readme.md' permissions: - contents: read + contents: write + pull-requests: write id-token: write @@ -51,19 +70,25 @@ jobs: if [ "$EVENT" = "workflow_dispatch" ]; then ENV="${{ inputs.target_env }}" elif [ "$EVENT" = "pull_request" ]; then + # PRs: resolve from the target (base) branch case "${{ github.base_ref }}" in - main) ENV="prod" ;; - int-agentic) ENV="integration" ;; - *) ENV="dev" ;; + main) ENV="production" ;; + *) ENV="integration" ;; esac elif [ "$EVENT" = "push" ]; then - case "${{ github.ref_name }}" in - main) ENV="prod" ;; - tjs-infra-as-code) ENV="dev" ;; - *) ENV="dev" ;; + BRANCH="${{ github.ref_name }}" + case "$BRANCH" in + main) + ENV="production" ;; + *-dev) + # Extract developer name: james-dev → integration-james + DEV_NAME="${BRANCH%-dev}" + ENV="integration-${DEV_NAME}" ;; + *) + ENV="integration" ;; esac else - ENV="dev" + ENV="integration" fi # ── Resolve pipeline mode ── @@ -90,6 +115,7 @@ jobs: needs: pipeline-config if: needs.pipeline-config.outputs.full_deploy == 'true' runs-on: ubuntu-latest + environment: ${{ needs.pipeline-config.outputs.environment }} steps: - name: Azure OIDC Login uses: azure/login@v2 @@ -164,9 +190,11 @@ jobs: needs: pipeline-config if: needs.pipeline-config.outputs.full_deploy == 'false' runs-on: ubuntu-latest + environment: ${{ needs.pipeline-config.outputs.environment }} outputs: backend_endpoint: ${{ steps.lookup.outputs.backend_endpoint }} mcp_endpoint: ${{ steps.lookup.outputs.mcp_endpoint }} + deployed: ${{ steps.lookup.outputs.deployed }} steps: - name: Azure OIDC Login uses: azure/login@v2 @@ -197,11 +225,13 @@ jobs: if [ -n "$BE_FQDN" ]; then echo 
"backend_endpoint=https://${BE_FQDN}" >> $GITHUB_OUTPUT + echo "deployed=true" >> $GITHUB_OUTPUT echo "✅ Backend: https://${BE_FQDN}" else - echo "::error::Backend Container App not found in $RG – is the environment deployed?" + echo "::warning::Backend Container App not found in $RG – environment not yet deployed. Skipping PR tests." echo "backend_endpoint=" >> $GITHUB_OUTPUT - exit 1 + echo "deployed=false" >> $GITHUB_OUTPUT + exit 0 fi if [ -n "$MCP_FQDN" ]; then @@ -222,7 +252,7 @@ jobs: if: >- always() && ( needs.update-containers.result == 'success' - || needs.resolve-endpoints.result == 'success' + || (needs.resolve-endpoints.result == 'success' && needs.resolve-endpoints.outputs.deployed == 'true') ) uses: ./.github/workflows/integration-tests.yml with: @@ -251,17 +281,54 @@ jobs: secrets: inherit # ──────────────────────────────────────────────────────────────────── - # Optional: Destroy infrastructure (dev branches only, after tests pass) + # Step 7: Auto-merge dev branch PR into int-agentic + # After a successful full deploy from a *-dev branch, automatically + # merge the open PR from that branch into int-agentic. This triggers + # the promote-to-main workflow which creates/updates a PR to main. 
# ──────────────────────────────────────────────────────────────────── - destroy-infrastructure: + auto-merge: needs: [pipeline-config, integration-tests, agent-evaluation] if: >- always() && needs.pipeline-config.outputs.full_deploy == 'true' && needs.integration-tests.result == 'success' - && (needs.agent-evaluation.result == 'success' || needs.agent-evaluation.result == 'skipped' || needs.agent-evaluation.result == 'failure') - && (github.ref_name == 'tjs-infra-as-code' || github.ref_name == 'james-dev' || (inputs.target_env && inputs.target_env == 'dev')) - uses: ./.github/workflows/destroy.yml - with: - environment: ${{ needs.pipeline-config.outputs.environment }} - secrets: inherit + && (needs.agent-evaluation.result == 'success' || needs.agent-evaluation.result == 'skipped') + && github.event_name == 'push' + && endsWith(github.ref_name, '-dev') + runs-on: ubuntu-latest + steps: + - name: Merge dev PR into int-agentic + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + BRANCH="${{ github.ref_name }}" + echo "🔍 Looking for open PR: ${BRANCH} → int-agentic" + + PR_NUMBER=$(gh pr list \ + --repo "${{ github.repository }}" \ + --base int-agentic \ + --head "$BRANCH" \ + --state open \ + --json number \ + --jq '.[0].number // empty') + + if [ -n "$PR_NUMBER" ]; then + echo "✅ Found PR #${PR_NUMBER}" + echo "🔀 Merging ${BRANCH} → int-agentic..." + gh pr merge "$PR_NUMBER" \ + --repo "${{ github.repository }}" \ + --squash \ + --auto \ + --subject "chore: merge ${BRANCH} into int-agentic (auto)" \ + --body "Auto-merged after successful CI/CD pipeline run ${{ github.run_id }}" + echo "✅ PR #${PR_NUMBER} merge initiated" + else + echo "ℹ️ No open PR found from ${BRANCH} → int-agentic" + echo " Create one with: gh pr create --base int-agentic --head ${BRANCH}" + fi + + # ──────────────────────────────────────────────────────────────────── + # NOTE: Auto-destroy is disabled. All environments (integration-* and + # production) persist their infrastructure. 
To tear down an environment + # manually, use: workflow_dispatch → destroy.yml with the target env. + # ──────────────────────────────────────────────────────────────────── diff --git a/.github/workflows/promote-to-main.yml b/.github/workflows/promote-to-main.yml new file mode 100644 index 00000000..5eef368a --- /dev/null +++ b/.github/workflows/promote-to-main.yml @@ -0,0 +1,98 @@ +name: Promote to Main + +# ───────────────────────────────────────────────────────────────────── +# Triggered when int-agentic receives a merge (push). +# Creates or updates a single rolling PR from int-agentic → main. +# The PR accumulates all developer changes and requires human review +# before merging to production. +# ───────────────────────────────────────────────────────────────────── + +on: + push: + branches: + - int-agentic + paths-ignore: + - '**/*.md' + - 'docs/**' + - 'LICENSE' + +permissions: + contents: read + pull-requests: write + +jobs: + promote: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Create or update PR to main + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Check if there's already an open PR from int-agentic → main + EXISTING_PR=$(gh pr list \ + --base main \ + --head int-agentic \ + --state open \ + --json number \ + --jq '.[0].number // empty') + + if [ -n "$EXISTING_PR" ]; then + echo "✅ PR #${EXISTING_PR} already exists (int-agentic → main)" + echo " Updating PR body with latest commit info..." + + # Get recent commits since the PR was created + RECENT_COMMITS=$(git log origin/main..HEAD --oneline --no-merges | head -20) + + gh pr edit "$EXISTING_PR" --body "## Promotion: int-agentic → main + + This is an auto-maintained PR that promotes changes from \`int-agentic\` to \`main\` (production). + + **Review required** before merging to production. 
+ + ### Recent Changes + \`\`\` + ${RECENT_COMMITS} + \`\`\` + + ### Pipeline Status + - Integration tests passed on each developer's environment before merge to int-agentic + - Merging this PR will trigger a full production deployment + + --- + _Last updated: $(date -u '+%Y-%m-%d %H:%M UTC') by commit ${{ github.sha }}_" + + echo "✅ PR #${EXISTING_PR} body updated" + else + echo "📝 Creating new PR: int-agentic → main" + + RECENT_COMMITS=$(git log origin/main..HEAD --oneline --no-merges | head -20) + + gh pr create \ + --base main \ + --head int-agentic \ + --title "Promote: int-agentic → main (production)" \ + --body "## Promotion: int-agentic → main + + This is an auto-maintained PR that promotes changes from \`int-agentic\` to \`main\` (production). + + **Review required** before merging to production. + + ### Recent Changes + \`\`\` + ${RECENT_COMMITS} + \`\`\` + + ### Pipeline Status + - Integration tests passed on each developer's environment before merge to int-agentic + - Merging this PR will trigger a full production deployment + + --- + _Created: $(date -u '+%Y-%m-%d %H:%M UTC') by commit ${{ github.sha }}_" + + echo "✅ New PR created" + fi diff --git a/.github/workflows/readme.md b/.github/workflows/readme.md index 2b79de81..2f7cb8ba 100644 --- a/.github/workflows/readme.md +++ b/.github/workflows/readme.md @@ -1,40 +1,81 @@ -# Workflows +# CI/CD Pipeline -## infra-plan-apply.yml +## Flow -### Summary +``` +*-dev ──push──▶ CI/CD Pipeline (8 stages) ──pass──▶ auto-merge PR → int-agentic + │ +int-agentic ◀──────────────────────────────────────────────────┘ + │ + └──push──▶ promote-to-main.yml ──▶ creates/updates PR → main + │ +main ◀────────── human review + merge ◀──────────────────────┘ + │ + └──push──▶ CI/CD Pipeline (production deploy) +``` -The infra plan and apply pipeline is a pipeline to deploy the infrastructure necessary for the Azure Open AI Workshop ot run. 
It is currently configured to do a workflow dispatch that expects you to choose whether you want bicep or terraform as well as a target environment. Terraform is currently tested. +**Doc-only changes** (`.md`, `docs/`, `LICENSE`) are ignored — no pipeline runs. -### Requirements +## Workflows -#### Environment Variables in GitHub +| File | Trigger | Purpose | +|------|---------|---------| +| `orchestrate.yml` | push to `*-dev`/`main`, PR to `main` | Main CI/CD: infra → build → deploy → test → eval → auto-merge | +| `promote-to-main.yml` | push to `int-agentic` | Creates/updates a rolling PR from `int-agentic` → `main` | +| `infrastructure.yml` | called by orchestrate | Terraform plan + apply with auto-import recovery | +| `docker-application.yml` | called by orchestrate | Build & push backend container to ACR | +| `docker-mcp.yml` | called by orchestrate | Build & push MCP container to ACR | +| `update-containers.yml` | called by orchestrate | Deploy new images to Container Apps | +| `integration-tests.yml` | called by orchestrate | API tests against live environment | +| `agent-evaluation.yml` | called by orchestrate | Agent quality eval → Azure AI Foundry | +| `destroy.yml` | manual dispatch | Terraform destroy for a target environment | -Configure your repo to have necessary variables for your environments. At a minimum, the following are needed: -- AZ_REGION: azure region you plan to deploy to -- AZURE_CLIENT_ID: the deployment client. Currently, this is used with an OIDC process so we don't need to set the secrets. Because of the way we are deploying, needs the ability to assign RBAC in Azure as well as creating resources. -- AZURE_SUBSCRIPTION_ID: the subscription to deploy into. -- AZURE_TENANT_ID: the tenant the client was created in -- DOCKER_IMAGE_BACKEND: docker image repo/name:tag from docker hub for backend FastAPI service. Still need to test with ACR. Also need to test with dynamic build from the repo. 
-- DOCKER_IMAGE_MCP: docker image repo/name:tag from docker hub for MCP service. Still need to test with ACR. Also need to test with dynamic build from the repo. +## Pipeline Stages -Required for terraform: -- TFSTATE_ACCOUNT: We expect an Azure Storage account for the backend. This is the account name. -- TFSTATE_CONTAINER: the blob container within the storage account where we will hold the state. -- TFSTATE_RG: resource group holding the storage account. +| # | Stage | Push | PR | +|---|-------|------|----| +| 0 | **pipeline-config** — resolve environment & mode | ✅ | ✅ | +| 1 | **preflight** — unlock TF state storage | ✅ | — | +| 2 | **deploy-infrastructure** — Terraform | ✅ | — | +| 3 | **build containers** (backend + MCP, parallel) | ✅ | — | +| 4 | **update-containers** — deploy to Container Apps | ✅ | — | +| — | **resolve-endpoints** — look up existing env | — | ✅ | +| 5 | **integration-tests** | ✅ | ✅* | +| 6 | **agent-evaluation** → Foundry | ✅ | — | +| 7 | **auto-merge** — squash-merge dev PR → int-agentic | ✅† | — | -#### Azure Set Up +\* Skipped if target environment not yet deployed +† Only on `*-dev` branches -- Azure Subscription -- Resource group with a storage account for terraform -- Azure Service Principal (app registration) configured with federated credentials: +## Per-Developer Environments -``` -az ad app federated-credential create --id "$APP_ID" --parameters "$(jq -cn \ ---arg org "$ORG" --arg repo "$REPO_NAME" '{ -name: ("github-"+$repo+"-env-dev"), -issuer: "https://token.actions.githubusercontent.com", -subject: ("repo:"+$org+"/"+$repo+":environment:dev"), -audiences: ["api://AzureADTokenExchange"] -}')" -``` \ No newline at end of file +Each developer has their own GitHub Environment (`integration-<name>`) with their own Azure subscription and OIDC credentials. All config is stored as **environment-level variables** (zero repo-level variables). 
+ +Branch mapping: `james-dev` → `integration-james`, `main` → `production` + +## Required Environment Variables + +| Variable | Description | +|----------|-------------| +| `AZURE_CLIENT_ID` | App registration client ID (OIDC) | +| `AZURE_TENANT_ID` | Entra ID tenant | +| `AZURE_SUBSCRIPTION_ID` | Target subscription | +| `AZ_REGION` | Azure region | +| `PROJECT_NAME` | Project name (e.g. `OpenAIWorkshop`) | +| `ITERATION` | Deployment iteration (e.g. `002`) | +| `TFSTATE_ACCOUNT` | TF state storage account | +| `TFSTATE_CONTAINER` | TF state blob container | +| `TFSTATE_RG` | TF state resource group | +| `MCP_SERVER_URI` | MCP service URI | +| `AZURE_OPENAI_CHAT_DEPLOYMENT` | Chat model deployment | +| `AZURE_OPENAI_EVAL_DEPLOYMENT` | Eval model deployment | +| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry project endpoint | +| `AZURE_OPENAI_API_VERSION` | OpenAI API version | + +## Azure Setup + +1. Azure subscription with a resource group + storage account for Terraform state +2. App registration with OIDC federated credentials for each GitHub Environment: + ``` + Subject: repo:microsoft/OpenAIWorkshop:environment: + ``` \ No newline at end of file diff --git a/.github/workflows/update-containers.yml b/.github/workflows/update-containers.yml index 51460d7f..294d3aee 100644 --- a/.github/workflows/update-containers.yml +++ b/.github/workflows/update-containers.yml @@ -35,7 +35,7 @@ jobs: update-containers: name: Update Container Apps runs-on: ubuntu-latest - # environment: ${{ inputs.environment }} # Commented out to use repo-level variables + environment: ${{ inputs.environment }} permissions: id-token: write contents: read @@ -64,7 +64,8 @@ jobs: echo "backend_app=ca-be-${ITERATION}" >> $GITHUB_OUTPUT # ACR name follows Terraform pattern: {project}{env}acr{iteration} - ACR_NAME="${PROJECT}${ENV}acr${ITERATION}" + # ACR names must be alphanumeric — strip hyphens to match Terraform's replace("-", "") + ACR_NAME=$(echo "${PROJECT}${ENV}acr${ITERATION}" | tr -d '-') 
echo "acr_name=${ACR_NAME}" >> $GITHUB_OUTPUT echo "acr_server=${ACR_NAME}.azurecr.io" >> $GITHUB_OUTPUT echo "Using ACR: ${ACR_NAME}" diff --git a/infra/GITHUB_ACTIONS_SETUP.md b/infra/GITHUB_ACTIONS_SETUP.md index 010b9bf6..a8178319 100644 --- a/infra/GITHUB_ACTIONS_SETUP.md +++ b/infra/GITHUB_ACTIONS_SETUP.md @@ -7,7 +7,8 @@ This guide documents how to configure GitHub Actions for automated infrastructur The CI/CD pipeline uses: - **OIDC Authentication** - No secrets stored in GitHub, uses federated identity - **Remote Terraform State** - Shared state in Azure Storage for team collaboration -- **Environment-based Deployments** - Separate configs for dev, integration, prod +- **Per-developer GitHub Environments** - Each developer has their own `integration-` environment backed by their own Azure subscription +- **Environment-scoped Variables** - All Azure credentials and config are stored per-environment, not at repo level ## Architecture @@ -17,6 +18,10 @@ The CI/CD pipeline uses: ├─────────────────────────────────────────────────────────────────────┤ │ orchestrate.yml │ │ ├── pipeline-config (determine mode + environment) │ +│ │ ├── main branch → production environment │ +│ │ ├── james-dev branch → integration-james environment │ +│ │ ├── nicole-dev branch → integration-nicole environment │ +│ │ └── -dev branch → integration- environment │ │ │ │ │ ├── [Full Deploy – push/manual] │ │ │ ├── preflight (enable storage access) │ @@ -25,8 +30,7 @@ The CI/CD pipeline uses: │ │ ├── docker-mcp.yml (build MCP service image) │ │ │ ├── update-containers.yml (refresh running apps) │ │ │ ├── integration-tests.yml (smoke tests) │ -│ │ ├── agent-evaluation.yml (AI quality evaluation) │ -│ │ └── destroy.yml (optional cleanup, dev only) │ +│ │ └── agent-evaluation.yml (AI quality evaluation) │ │ │ │ │ ├── [Tests Only – pull requests] │ │ │ └── resolve-endpoints (az containerapp show) │ @@ -37,10 +41,9 @@ The CI/CD pipeline uses: │ OIDC (no secrets) ▼ 
┌─────────────────────────────────────────────────────────────────────┐ -│ Azure │ +│ Azure (per developer subscription) │ ├─────────────────────────────────────────────────────────────────────┤ -│ ├── App Registration (GitHub-Actions-OpenAIWorkshop) │ -│ │ └── Federated Credentials (main, int-agentic, PRs) │ +│ ├── App Registration (federated credential for environment) │ │ ├── Storage Account (Terraform state) │ │ ├── Container Registry (Docker images) │ │ ├── Container Apps (MCP + Backend) │ @@ -90,54 +93,49 @@ Write-Host "Subscription ID: $SubscriptionId" ## Step 2: Configure Federated Credentials -Create federated credentials for each branch/environment. +Create federated credentials for the GitHub environment that maps to this developer. -> **Important:** GitHub org/repos that have a [customized OIDC subject claim template](https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/about-security-hardening-with-openid-connect#customizing-the-subject-claims-for-an-organization-or-repository) -> use a numeric subject format: `repository_owner_id::repository_id::...`. -> You can find these IDs via `gh api repos/{owner}/{repo} --jq '.owner.id, .id'`. -> If your org has NOT customized the template, use the default `repo:ORG/REPO:...` format. +> **Important:** This repo uses a [customized OIDC subject claim template](https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/about-security-hardening-with-openid-connect#customizing-the-subject-claims-for-an-organization-or-repository) +> with `repository_owner_id` and `repository_id` instead of the default `repo:ORG/REPO:...` format. +> All CI jobs bind an `environment:` context, so the OIDC subject includes `environment:`. 
```powershell $AppId = "YOUR_APP_ID" # From Step 1 -# --- Option A: Default subject format --- -# Main branch (prod) +# ── Per-developer integration environment ── +# Replace <name> with your developer name (e.g., james, nicole, tim) +# The subject must exactly match what GitHub presents in the OIDC token. az ad app federated-credential create --id $AppId --parameters '{ - "name": "github-main", + "name": "github-env-integration-<name>", "issuer": "https://token.actions.githubusercontent.com", - "subject": "repo:YOUR_ORG/YOUR_REPO:ref:refs/heads/main", + "subject": "repository_owner_id:6154722:repository_id:605201834:environment:integration-<name>", "audiences": ["api://AzureADTokenExchange"] }' -# --- Option B: Customized (numeric ID) subject format --- -# Use this if your org has customized the OIDC subject claim template. -# Replace OWNER_ID and REPO_ID with actual values from the GitHub API. - -# Main branch (prod) -az ad app federated-credential create --id $AppId --parameters '{ - "name": "github-main", - "issuer": "https://token.actions.githubusercontent.com", - "subject": "repository_owner_id:OWNER_ID:repository_id:REPO_ID:ref:refs/heads/main", - "audiences": ["api://AzureADTokenExchange"] -}' - -# Integration branch +# ── Production environment (only needed for the prod subscription owner) ── az ad app federated-credential create --id $AppId --parameters '{ - "name": "github-int-agentic", + "name": "github-env-production", "issuer": "https://token.actions.githubusercontent.com", - "subject": "repository_owner_id:OWNER_ID:repository_id:REPO_ID:ref:refs/heads/int-agentic", + "subject": "repository_owner_id:6154722:repository_id:605201834:environment:production", "audiences": ["api://AzureADTokenExchange"] }' -# Pull Requests +# ── Pull Requests (for PR validation against existing env) ── +# Note: PR jobs also bind environment:<name>, so the subject includes it. +# You may need a credential for the PR context too if your PRs run OIDC. 
az ad app federated-credential create --id $AppId --parameters '{ "name": "github-pullrequests", "issuer": "https://token.actions.githubusercontent.com", - "subject": "repository_owner_id:OWNER_ID:repository_id:REPO_ID:pull_request", + "subject": "repository_owner_id:6154722:repository_id:605201834:pull_request", "audiences": ["api://AzureADTokenExchange"] }' ``` +> **How to find your IDs:** +> - Owner ID: `gh api repos/microsoft/OpenAIWorkshop --jq '.owner.id'` → `6154722` +> - Repo ID: `gh api repos/microsoft/OpenAIWorkshop --jq '.id'` → `605201834` +> - Check current OIDC template: `gh api repos/microsoft/OpenAIWorkshop/actions/oidc/customization/sub` + ## Step 3: Assign Azure Roles ```powershell @@ -224,36 +222,44 @@ az role assignment create ` --scope $STORAGE_ID ``` -## Step 5: Configure GitHub Repository Variables +## Step 5: Configure GitHub Environment Variables + +All variables are stored at the **environment level** (not repo level). Each developer's +`integration-<name>` environment contains their own Azure subscription credentials. 
-Go to **GitHub → Repository → Settings → Secrets and Variables → Actions → Variables** +Go to **GitHub → Repository → Settings → Environments → `integration-<name>` → Environment variables** -### Required Variables +### Required Variables (per environment) | Variable | Description | Example Value | |----------|-------------|---------------| -| `AZURE_CLIENT_ID` | App Registration Client ID | `1d34c51d-9d49-48f3-9e48-6a0f099c5f03` | -| `AZURE_TENANT_ID` | Azure AD Tenant ID | `0fbe7234-45ea-498b-b7e4-1a8b2d3be4d9` | -| `AZURE_SUBSCRIPTION_ID` | Azure Subscription ID | `840b5c5c-3f4a-459a-94fc-6bad2a969f9d` | +| `AZURE_CLIENT_ID` | App Registration Client ID | `1d34c51d-...` | +| `AZURE_TENANT_ID` | Azure AD Tenant ID | `0fbe7234-...` | +| `AZURE_SUBSCRIPTION_ID` | Azure Subscription ID | `840b5c5c-...` | | `TFSTATE_RG` | Resource group for TF state | `rg-tfstate` | -| `TFSTATE_ACCOUNT` | Storage account name | `sttfstateoaiworkshop` | +| `TFSTATE_ACCOUNT` | Storage account name (globally unique) | `sttfstateoaiworkshop` | | `TFSTATE_CONTAINER` | Blob container name | `tfstate` | -| `ACR_NAME` | Azure Container Registry name | `acropenaiworkshop002` | -| `PROJECT_NAME` | Project identifier | `OpenAIWorkshop` | +| `ACR_NAME` | Azure Container Registry name | `OpenAIWorkshopdevacr002` | +| `PROJECT_NAME` | Project identifier | `openaiworkshop` | | `ITERATION` | Deployment iteration | `002` | | `AZ_REGION` | Azure region | `eastus2` | -| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry project endpoint for evaluation | `https://eastus2oai.services.ai.azure.com/api/projects/eastus2` | -| `AZURE_OPENAI_EVAL_ENDPOINT` | AI Services endpoint for judge models | `https://eastus2oai.services.ai.azure.com/` | +| `DOCKER_IMAGE_MCP` | MCP Docker image name | `mcp-service` | +| `DOCKER_IMAGE_BACKEND` | Backend Docker image name | `backend-service` | +| `REGISTRY_LOGIN_SERVER` | Container registry server | `docker.io` | +| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry project endpoint for evaluation
| `https://...services.ai.azure.com/api/projects/...` | +| `AZURE_OPENAI_EVAL_ENDPOINT` | AI Services endpoint for judge models | `https://...services.ai.azure.com/` | | `AZURE_OPENAI_EVAL_DEPLOYMENT` | Model deployment for LLM-as-judge | `gpt-5.2` | -### Optional Environment-Specific Variables +### Current Environments -Create GitHub Environments (`dev`, `integration`, `prod`) for environment-specific overrides: - -| Environment | Variable | Value | -|-------------|----------|-------| -| `prod` | `AZ_REGION` | `eastus` | -| `prod` | `ITERATION` | `001` | +| Environment | Owner | Branch Mapping | +|-------------|-------|----------------| +| `production` | James | `main` | +| `integration-james` | James | `james-dev` | +| `integration-nicole` | Nicole | `nicole-dev` | +| `integration-heena` | Heena | `heena-dev` | +| `integration-tim` | Tim | `tim-dev` | +| `integration-matt` | Matt | `matt-dev` | --- @@ -263,10 +269,10 @@ The orchestrator has two modes determined by the trigger: | Trigger | Mode | What runs | Environment | |---------|------|-----------|-------------| -| **PR → `main`** | Tests only | `resolve-endpoints` → `integration-tests` | `prod` | +| **PR → `main`** | Tests only | `resolve-endpoints` → `integration-tests` | `production` | | **PR → `int-agentic`** | Tests only | `resolve-endpoints` → `integration-tests` | `integration` | -| **Push to `main`** (after merge) | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | `prod` | -| **Push to `tjs-infra-as-code`** | Full deploy | Preflight → Infra → Build → Update → Tests → Eval → Destroy | `dev` | +| **Push to `main`** (after merge) | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | `production` | +| **Push to `<name>-dev`** | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | `integration-<name>` | | **Manual dispatch** | Full deploy | Preflight → Infra → Build → Update → Tests → Eval | Chosen env | ### Tests-Only Mode (PRs) @@ -288,23 +294,42 @@ environment.
| Workflow | Trigger | What it does | |----------|---------|--------------| -| `orchestrate.yml` | PRs, push to main/tjs-infra-as-code, manual | Orchestrates full or tests-only pipeline | +| `orchestrate.yml` | PRs, push to main/*-dev, manual | Orchestrates full or tests-only pipeline | | `infrastructure.yml` | Called by orchestrate (full deploy) | Terraform plan/apply | | `docker-application.yml` | Called by orchestrate (full deploy) | Build backend container | | `docker-mcp.yml` | Called by orchestrate (full deploy) | Build MCP container | | `update-containers.yml` | Called by orchestrate (full deploy) | Refresh Container Apps | -| `destroy.yml` | Called by orchestrate (dev only) | Terraform destroy | +| `destroy.yml` | Manual dispatch only | Terraform destroy | | `agent-evaluation.yml` | Called by orchestrate (full deploy) | AI quality evaluation via Azure AI Foundry | | `integration-tests.yml` | Called by orchestrate (both modes) | Run pytest integration tests | ## Branch to Environment Mapping -| Branch | Environment | Auto-destroy | -|--------|-------------|--------------| -| `main` | `prod` | ❌ No | -| `int-agentic` | `integration` | ❌ No | -| `tjs-infra-as-code` | `dev` | ✅ Yes | -| Other branches | `dev` | Depends on config | +| Branch | Environment | Persistent | +|--------|-------------|------------| +| `main` | `production` | ✅ Yes | +| `james-dev` | `integration-james` | ✅ Yes | +| `nicole-dev` | `integration-nicole` | ✅ Yes | +| `heena-dev` | `integration-heena` | ✅ Yes | +| `tim-dev` | `integration-tim` | ✅ Yes | +| `matt-dev` | `integration-matt` | ✅ Yes | +| `<name>-dev` | `integration-<name>` | ✅ Yes | + +> All environments persist their infrastructure. To tear down manually, use +> `workflow_dispatch` → `destroy.yml` with the target environment. + +--- + +## Developer Onboarding + +To add a new developer to the pipeline: + +1. **Create an Azure App Registration** in the developer's own Azure tenant (Step 1 above) +2.
**Add a federated credential** with subject `repository_owner_id:6154722:repository_id:605201834:environment:integration-<name>` (Step 2 above) +3. **Assign Azure roles** to the service principal (Steps 3 and 3b above) +4. **Create TF state storage** in the developer's subscription (Step 4 above) +5. **Ask a repo admin** to create the `integration-<name>` GitHub Environment and set the 16 environment variables (Step 5 above) +6. **Developer pushes to `<name>-dev`** branch — the pipeline will pick up the environment automatically --- diff --git a/infra/README.md b/infra/README.md index 389cfa0c..687cf67f 100644 --- a/infra/README.md +++ b/infra/README.md @@ -336,78 +336,43 @@ az containerapp logs show --name ca-be-002 --resource-group rg-OpenAIWorkshop-de ## Automated CI/CD (GitHub Actions) -For enterprise deployments, we recommend using GitHub Actions with OIDC authentication for secure, automated deployments. +The project uses a fully automated CI/CD pipeline with **per-developer environments** and **OIDC authentication** (no stored secrets). -### 📖 Complete Setup Guide +### Pipeline Flow -See **[GITHUB_ACTIONS_SETUP.md](./GITHUB_ACTIONS_SETUP.md)** for detailed instructions on: +``` +*-dev push → CI/CD Pipeline → auto-merge → int-agentic → PR to main → human review → production deploy +``` -- Creating Azure App Registration with federated credentials -- Configuring GitHub repository variables and secrets -- Setting up Terraform remote state in Azure Storage -- Granting required Azure RBAC roles +Doc-only changes (`.md`, `docs/`, `LICENSE`) are ignored and do not trigger the pipeline.
-### Quick Overview +### Setup -```mermaid -flowchart TB - subgraph GitHub["GitHub Repository"] - Push["Git Push"] - Orchestrate["orchestrate.yml"] - Infra["infrastructure.yml"] - DockerApp["docker-application.yml"] - DockerMCP["docker-mcp.yml"] - Update["update-containers.yml"] - Tests["integration-tests.yml"] - end - - subgraph Azure["Azure"] - OIDC["OIDC Federation"] - TFState["Terraform State"] - ACR["Container Registry"] - Resources["Azure Resources"] - end - - Push --> Orchestrate - Orchestrate --> OIDC - Orchestrate --> Infra - Infra --> TFState - Infra --> Resources - Orchestrate --> DockerApp - Orchestrate --> DockerMCP - DockerApp --> ACR - DockerMCP --> ACR - Orchestrate --> Update - Update --> Resources - Orchestrate --> Tests -``` +1. **Azure**: App Registration with OIDC federated credentials — see [GITHUB_ACTIONS_SETUP.md](./GITHUB_ACTIONS_SETUP.md) +2. **GitHub**: Create an Environment (`integration-<name>`) with environment-level variables (no repo-level vars) +3. **Terraform state**: Storage account in Azure — see [GITHUB_ACTIONS_SETUP.md](./GITHUB_ACTIONS_SETUP.md) -### GitHub Actions Features +### Required Environment Variables -| Feature | Description | -|---------|-------------| -| **OIDC Authentication** | No secrets stored in GitHub - uses federated identity | -| **Remote State** | Terraform state stored in Azure Storage for team collaboration | -| **Multi-Environment** | Automatic environment detection based on branch | -| **Parallel Builds** | Backend and MCP containers build simultaneously | -| **Integration Tests** | Automated tests run after deployment | -| **Auto Cleanup** | Optional infrastructure destruction for dev branches | - -### Required GitHub Variables - -Set these in your repository settings (Settings → Secrets and variables → Actions → Variables): - -| Variable | Description | Example | -|----------|-------------|---------| -| `AZURE_CLIENT_ID` | App Registration Client ID | `1d34c51d-...` | -| `AZURE_TENANT_ID` | Azure AD Tenant
ID | `0fbe7234-...` | -| `AZURE_SUBSCRIPTION_ID` | Azure Subscription ID | `840b5c5c-...` | -| `TFSTATE_RG` | Resource group for Terraform state | `rg-tfstate` | -| `TFSTATE_ACCOUNT` | Storage account for Terraform state | `sttfstateoaiworkshop` | -| `TFSTATE_CONTAINER` | Blob container for state files | `tfstate` | -| `PROJECT_NAME` | Project name for resource naming | `OpenAIWorkshop` | -| `ITERATION` | Iteration suffix | `002` | -| `AZ_REGION` | Azure region | `eastus2` | +| Variable | Example | +|----------|---------| +| `AZURE_CLIENT_ID` | `1d34c51d-...` | +| `AZURE_TENANT_ID` | `0fbe7234-...` | +| `AZURE_SUBSCRIPTION_ID` | `840b5c5c-...` | +| `AZ_REGION` | `eastus2` | +| `PROJECT_NAME` | `OpenAIWorkshop` | +| `ITERATION` | `002` | +| `TFSTATE_RG` / `TFSTATE_ACCOUNT` / `TFSTATE_CONTAINER` | TF state storage | +| `AZURE_AI_PROJECT_ENDPOINT` | AI Foundry endpoint | +| `AZURE_OPENAI_EVAL_DEPLOYMENT` | Eval model name | + +### 📖 Full Pipeline Documentation + +See **[../.github/workflows/readme.md](../.github/workflows/readme.md)** for complete details on: +- Pipeline stages and promotion flow +- Workflow file reference +- Per-developer environment architecture +- Path filtering rules --- diff --git a/infra/terraform/prod.tfvars b/infra/terraform/prod.tfvars new file mode 100644 index 00000000..8ee8b17d --- /dev/null +++ b/infra/terraform/prod.tfvars @@ -0,0 +1,34 @@ +# Production environment configuration +environment = "production" +location = "eastus2" +project_name = "OpenAIWorkshop" +iteration = "002" +tenant_id = "0fbe7234-45ea-498b-b7e4-1a8b2d3be4d9" +subscription_id = "840b5c5c-3f4a-459a-94fc-6bad2a969f9d" + +# Optional: Set to false if you want to use API keys (not recommended) +use_cosmos_managed_identity = true + +# OpenAI deployment configuration +create_openai_deployment = true +openai_deployment_name = "gpt-5.2-chat" +openai_model_name = "gpt-5.2-chat" +openai_model_version = "2025-12-11" +openai_api_version = "2025-04-01-preview" 
+openai_deployment_capacity = 200 # 200k tokens/minute + +# OpenAI embedding deployment configuration +create_openai_embedding_deployment = true +openai_embedding_deployment_name = "text-embedding-ada-002" +openai_embedding_model_name = "text-embedding-ada-002" +openai_embedding_model_version = "2" + +# Networking configuration +enable_networking = true +enable_private_endpoint = true +vnet_address_prefix = "10.10.0.0/16" +container_apps_subnet_prefix = "10.10.0.0/23" +private_endpoint_subnet_prefix = "10.10.2.0/24" + +# MCP Service Security +mcp_internal_only = true