From 4a0ce1830ebe6ba3c618abd98d3386043684340f Mon Sep 17 00:00:00 2001
From: Motty Chen <motty@xirius.com>
Date: Wed, 4 Mar 2026 16:39:30 -0600
Subject: [PATCH] fix(deploy): harden hotfix promotion reliability

Support production deploys from merged hotfix PRs, add an automatic hotfix-to-preprod deployment workflow, and fail fast when Lambda is not Active after deploy. Stop managing the shared ECR repository from the coaching service stack to prevent image churn breaking Lambda.

Made-with: Cursor
---
 .github/workflows/deploy-preprod-hotfix.yml | 181 ++++++++++++++++++++
 .github/workflows/deploy-production.yml     |  29 +++-
 coaching/pulumi/__main__.py                 |  19 +-
 3 files changed, 212 insertions(+), 17 deletions(-)
 create mode 100644 .github/workflows/deploy-preprod-hotfix.yml

diff --git a/.github/workflows/deploy-preprod-hotfix.yml b/.github/workflows/deploy-preprod-hotfix.yml
new file mode 100644
index 0000000..1857f4f
--- /dev/null
+++ b/.github/workflows/deploy-preprod-hotfix.yml
@@ -0,0 +1,181 @@
+name: Deploy Preprod Hotfix
+
+on:
+  push:
+    branches:
+      - "hotfix/**"
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: "Branch to deploy (defaults to current ref)"
+        required: false
+        type: string
+      skip_tests:
+        description: "Skip tests before deployment"
+        required: false
+        default: "false"
+        type: choice
+        options:
+          - "true"
+          - "false"
+
+concurrency:
+  group: deploy-preprod-hotfix-${{ github.ref_name }}
+  cancel-in-progress: true
+
+jobs:
+  pre-deployment-checks:
+    name: Pre-Deployment Validation
+    runs-on: ubuntu-latest
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.skip_tests != 'true' }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.ref }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Set up uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          version: "latest"
+          enable-cache: true
+
+      - name: Create virtual environment
+        run: uv venv .venv
+
+      - name: Install dependencies
+        run: |
+          source .venv/bin/activate
+          uv pip install -r coaching/requirements.txt
+          uv pip install -r coaching/requirements-dev.txt
+        shell: bash
+
+      - name: Run Ruff Linting
+        run: |
+          source .venv/bin/activate
+          python -m ruff check . --exclude=".venv,venv,__pycache__,.pytest_cache"
+        shell: bash
+
+      - name: Run MyPy Type Checking
+        run: |
+          source .venv/bin/activate
+          python -m mypy coaching/src/ shared/ --config-file=pyproject.toml
+        shell: bash
+
+      - name: Run Unit Tests
+        run: |
+          source .venv/bin/activate
+          python -m pytest coaching/tests/unit/ -v --cov=coaching/src --cov-fail-under=70
+        shell: bash
+        env:
+          PYTHONPATH: coaching:shared:.
+
+  deploy-coaching:
+    name: Deploy to Preprod
+    runs-on: ubuntu-latest
+    needs: [pre-deployment-checks]
+    if: ${{ always() && (needs.pre-deployment-checks.result == 'success' || (github.event_name == 'workflow_dispatch' && github.event.inputs.skip_tests == 'true')) }}
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.ref }}
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install Pulumi Python dependencies
+        working-directory: coaching/pulumi
+        run: pip install -r requirements.txt
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: us-east-1
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+      - name: Deploy Coaching Service
+        uses: pulumi/actions@v5
+        with:
+          command: up
+          stack-name: preprod
+          work-dir: coaching/pulumi
+        env:
+          PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_ACCESS_TOKEN }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_REGION: us-east-1
+
+  smoke-tests:
+    name: Post-Deployment Smoke Tests
+    runs-on: ubuntu-latest
+    needs: [deploy-coaching]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.ref }}
+
+      - name: Install Pulumi CLI
+        uses: pulumi/actions@v5
+        with:
+          pulumi-version: "latest"
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: us-east-1
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+      - name: Get API Gateway URL
+        id: api-url
+        working-directory: coaching/pulumi
+        run: |
+          URL=$(pulumi stack output customDomainUrl --stack preprod)
+          echo "url=$URL" >> $GITHUB_OUTPUT
+        env:
+          PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_ACCESS_TOKEN }}
+
+      - name: Health Check
+        run: |
+          HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${{ steps.api-url.outputs.url }}/health" || echo "000")
+          if [ "$HTTP_CODE" != "200" ] && [ "$HTTP_CODE" != "404" ]; then
+            echo "❌ Health check failed with HTTP $HTTP_CODE"
+            exit 1
+          fi
+          echo "✅ Health check passed ($HTTP_CODE)"
+
+      - name: CORS Preflight Check
+        run: |
+          ORIGIN="https://preprod.purposepath.app"
+          TARGET="${{ steps.api-url.outputs.url }}/api/v1/ai/execute-async"
+          CORS_HEADERS=$(curl -s -D - -o /dev/null -X OPTIONS "$TARGET" \
+            -H "Origin: $ORIGIN" \
+            -H "Access-Control-Request-Method: POST" \
+            -H "Access-Control-Request-Headers: Authorization,Content-Type,X-Tenant-Id")
+
+          ALLOW_ORIGIN=$(echo "$CORS_HEADERS" | tr -d '\r' | awk -F': ' 'tolower($1)=="access-control-allow-origin"{print $2}' | tail -n 1)
+          ALLOW_CREDENTIALS=$(echo "$CORS_HEADERS" | tr -d '\r' | awk -F': ' 'tolower($1)=="access-control-allow-credentials"{print $2}' | tail -n 1)
+
+          if [ "$ALLOW_ORIGIN" != "$ORIGIN" ]; then
+            echo "❌ Invalid Access-Control-Allow-Origin: '$ALLOW_ORIGIN' (expected '$ORIGIN')"
+            exit 1
+          fi
+          if [ "$ALLOW_CREDENTIALS" != "true" ]; then
+            echo "❌ Invalid Access-Control-Allow-Credentials: '$ALLOW_CREDENTIALS' (expected 'true')"
+            exit 1
+          fi
+
+          echo "✅ CORS preflight returned expected headers"
diff --git a/.github/workflows/deploy-production.yml b/.github/workflows/deploy-production.yml
index a7b3e66..616c653 100644
--- a/.github/workflows/deploy-production.yml
+++ b/.github/workflows/deploy-production.yml
@@ -34,13 +34,20 @@ jobs:
         run: |
           SHOULD_DEPLOY="false"
           TRIGGER_REASON="not-eligible"
+          HEAD_REF="${{ github.event.pull_request.head.ref }}"
 
           if [ "${{ github.event_name }}" == "pull_request" ] && \
              [ "${{ github.event.pull_request.merged }}" == "true" ] && \
              [ "${{ github.event.pull_request.base.ref }}" == "master" ] && \
-             [ "${{ github.event.pull_request.head.ref }}" == "staging" ]; then
+             [ "$HEAD_REF" == "staging" ]; then
             SHOULD_DEPLOY="true"
             TRIGGER_REASON="merged-staging-to-master-pr"
+          elif [ "${{ github.event_name }}" == "pull_request" ] && \
+               [ "${{ github.event.pull_request.merged }}" == "true" ] && \
+               [ "${{ github.event.pull_request.base.ref }}" == "master" ] && \
+               [[ "$HEAD_REF" == hotfix/* ]]; then
+            SHOULD_DEPLOY="true"
+            TRIGGER_REASON="merged-hotfix-to-master-pr"
           elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
             SHOULD_DEPLOY="true"
             TRIGGER_REASON="manual-dispatch"
@@ -56,7 +63,7 @@ jobs:
         if: steps.promotion-check.outputs.should_deploy != 'true'
         run: |
           echo "No production deployment triggered."
-          echo "This workflow only deploys on merged PRs from staging -> master, or manual dispatch."
+          echo "This workflow deploys on merged PRs from staging/hotfix -> master, or manual dispatch."
 
   pre-deployment-checks:
     name: Pre-Deployment Validation
@@ -199,6 +206,24 @@ jobs:
         env:
           PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_ACCESS_TOKEN }}
 
+      - name: Lambda Runtime State Check
+        run: |
+          echo "Validating Lambda runtime state..."
+          LAMBDA_ARN=$(pulumi stack output lambdaArn --stack prod)
+          LAMBDA_NAME=${LAMBDA_ARN##*:function:}
+
+          STATE=$(aws lambda get-function --function-name "$LAMBDA_NAME" --region us-east-1 --query "Configuration.State" --output text)
+          REASON=$(aws lambda get-function --function-name "$LAMBDA_NAME" --region us-east-1 --query "Configuration.StateReason" --output text)
+
+          if [ "$STATE" != "Active" ]; then
+            echo "❌ Lambda is not Active (state=$STATE, reason=$REASON)"
+            exit 1
+          fi
+
+          echo "✅ Lambda state is Active"
+        env:
+          PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_ACCESS_TOKEN }}
+
       - name: Create GitHub Release
         uses: actions/create-release@v1
         env:
diff --git a/coaching/pulumi/__main__.py b/coaching/pulumi/__main__.py
index ea6ebb8..51dd22f 100644
--- a/coaching/pulumi/__main__.py
+++ b/coaching/pulumi/__main__.py
@@ -309,21 +309,10 @@
     ),
 )
 
-# Reuse shared ECR repository when it already exists.
-# This avoids cross-stack repository creation conflicts in production.
-try:
-    existing_ecr_repo = aws.ecr.get_repository(name="purposepath-coaching")
-    ecr_repository_url = pulumi.Output.from_input(existing_ecr_repo.repository_url)
-except Exception:
-    ecr_repo = aws.ecr.Repository(
-        "coaching-repo",
-        name="purposepath-coaching",
-        image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs(
-            scan_on_push=True
-        ),
-        force_delete=True,
-    )
-    ecr_repository_url = ecr_repo.repository_url
+# Use the shared ECR repository managed outside this stack.
+# Do not create/delete this repository from service deployments.
+existing_ecr_repo = aws.ecr.get_repository(name="purposepath-coaching")
+ecr_repository_url = pulumi.Output.from_input(existing_ecr_repo.repository_url)
 
 # Build and push Docker image
 auth_token = aws.ecr.get_authorization_token()