# Source: endurix/.github/workflows/deploy.yml at main · Mohith1612/endurix · GitHub
name: Deploy to Production

# Dispatched by hand only: `workflow_dispatch` is the sole trigger declared
# here, so nothing starts this workflow automatically.
# The operator passes the git SHA published by build-and-push.yml as the image
# tag and must type "DEPLOY" to confirm the intent to deploy.
#
# Safety gates in front of the SSH deployment:
#   1. The `confirm` input has to equal "DEPLOY" (guards against accidental
#      dispatches).
#   2. The jobs target the GitHub Environment "production", so any required
#      reviewer configured on that environment must approve first.
#      NOTE(review): whether the approver may be the person who triggered the
#      run depends on the environment's "Prevent self-review" setting —
#      confirm it is enabled.
on:
  workflow_dispatch:
    inputs:
      image_tag:
        type: string
        required: true
        description: 'Full git SHA to deploy — copy from the build-and-push.yml job summary'
      confirm:
        type: string
        required: true
        description: 'Type DEPLOY (all caps) to confirm production deployment'

# All runs share one concurrency group, so deploys are serialised.
# A deploy that is already running is left to finish rather than cancelled.
concurrency:
  cancel-in-progress: false
  group: 'deploy-production'

permissions:
  # Repository contents are only read.
  contents: read
  # Read images from GHCR to verify they exist.
  packages: read
  # Create GitHub deployment records.
  deployments: write

jobs:
  # ──────────────────────────────────────────────────────────────────────────
  # JOB: preflight
  # Validates all inputs and confirms both images exist in GHCR before any
  # SSH connection is opened. Failing fast here is cheap; failing on the VM
  # mid-deploy is expensive.
  #
  # The `environment: production` declaration here triggers the GitHub
  # Environment reviewer gate — a human must approve before this job runs.
  # ──────────────────────────────────────────────────────────────────────────
  preflight:
    name: Preflight — validate inputs and images
    runs-on: ubuntu-24.04
    environment: production
    steps:
      # Every operator-supplied value below reaches the shell through an
      # environment variable rather than being expanded into the script text.
      # Expression expansion happens before the shell parses the script, so an
      # inline input containing quotes, `$(...)` or backticks would otherwise
      # be executed as code on the runner.
      - name: Validate confirmation string
        env:
          CONFIRM: ${{ github.event.inputs.confirm }}
        run: |
          if [ "${CONFIRM}" != "DEPLOY" ]; then
            echo "ERROR: Confirmation string must be exactly 'DEPLOY'."
            echo "Got: '${CONFIRM}'"
            exit 1
          fi
          echo "Confirmation accepted."

      - name: Validate image tag is a plausible git SHA
        env:
          IMAGE_TAG: ${{ github.event.inputs.image_tag }}
        run: |
          # Hard gate: the value must be a syntactically valid Docker tag
          # (letters, digits, '_', '.', '-'; no leading '.' or '-'; at most
          # 128 characters). Anything else can never resolve to an image, and
          # the value is later substituted into a sed expression on the VM.
          docker_tag_re='^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$'
          if ! [[ "${IMAGE_TAG}" =~ ${docker_tag_re} ]]; then
            echo "ERROR: '${IMAGE_TAG}' is not a valid Docker image tag."
            exit 1
          fi

          # Soft gate: anything that is not a full SHA only produces a warning;
          # the GHCR lookup in the next step is the authoritative check.
          full_sha_re='^[0-9a-f]{40}$'
          if ! [[ "${IMAGE_TAG}" =~ ${full_sha_re} ]]; then
            echo "WARNING: '${IMAGE_TAG}' does not look like a full 40-character git SHA."
            echo "Short SHAs and other strings will be checked in GHCR next,"
            echo "but full SHAs are strongly recommended to avoid ambiguity."
          fi
          echo "Deploying tag: ${IMAGE_TAG}"

      - name: Confirm images exist in GHCR
        # Uses GITHUB_TOKEN (runner-scoped) for a read-only manifest check.
        # This is sufficient for the CI runner; the VM uses GHCR_TOKEN to pull.
        env:
          IMAGE_TAG: ${{ github.event.inputs.image_tag }}
          REPO_OWNER: ${{ github.repository_owner }}
          REGISTRY_USER: ${{ github.actor }}
          REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          TAG="${IMAGE_TAG}"
          # Docker rejects image references whose repository path contains
          # uppercase letters, while GitHub owner names may contain them.
          # Lowercase the owner so the lookup does not fail with a misleading
          # "image not found" for owners with capitals in their name.
          OWNER="$(printf '%s' "${REPO_OWNER}" | tr '[:upper:]' '[:lower:]')"

          echo "${REGISTRY_TOKEN}" | \
            docker login ghcr.io -u "${REGISTRY_USER}" --password-stdin

          echo "Checking backend image: ghcr.io/${OWNER}/endurix-backend:${TAG}"
          docker manifest inspect "ghcr.io/${OWNER}/endurix-backend:${TAG}" || {
            echo "ERROR: Backend image not found in GHCR for tag '${TAG}'."
            echo "Run build-and-push.yml on the target commit first."
            exit 1
          }

          echo "Checking frontend image: ghcr.io/${OWNER}/endurix-frontend:${TAG}"
          docker manifest inspect "ghcr.io/${OWNER}/endurix-frontend:${TAG}" || {
            echo "ERROR: Frontend image not found in GHCR for tag '${TAG}'."
            exit 1
          }

          echo "Both images confirmed present in GHCR. Proceeding to deploy."

  # ──────────────────────────────────────────────────────────────────────────
  # JOB: deploy
  # SSH into the production VM and execute the deployment sequence
  # (numbering matches the "[n/7]" markers printed by the script):
  #   1. Authenticate docker with GHCR
  #   2. Change into the deploy directory on the VM
  #   3. Write IMAGE_TAG to the VM's .env (docker-compose reads it)
  #   4. Pull the new images (old containers still serving traffic)
  #   5. Run Alembic migrations using the NEW image (before restart)
  #   6. Restart only the application containers (not postgres, not nginx)
  #   7. Health-check loop — marks the deploy failed if the app never responds
  #
  # Migration timing rationale:
  #   Migrations run with the OLD app still serving requests. This requires
  #   migrations to be backward-compatible with the previous app version
  #   (additive-only: new columns must be nullable or have defaults; no
  #   renames or drops in the same migration as an app change). This is the
  #   only safe approach on a single VM without blue/green infrastructure.
  # ──────────────────────────────────────────────────────────────────────────
  deploy:
    name: Deploy via SSH
    needs: preflight
    runs-on: ubuntu-24.04
    # This job declares the production environment as well, so that
    # environment's protection rules are evaluated for this job too.
    environment: production
    steps:
      - name: Execute deployment on production VM
        # NOTE(review): `@v1` is a moving tag; consider pinning this
        # third-party action to a full commit SHA, since it receives the SSH
        # key and the GHCR token.
        uses: appleboy/ssh-action@v1
        env:
          IMAGE_TAG: ${{ github.event.inputs.image_tag }}
          GHCR_OWNER: ${{ github.repository_owner }}
          GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }}
          DEPLOY_DIR: ${{ secrets.VM_DEPLOY_DIR }}
        with:
          host: ${{ secrets.VM_HOST }}
          username: ${{ secrets.VM_USER }}
          key: ${{ secrets.VM_SSH_KEY }}
          port: ${{ secrets.VM_SSH_PORT }}
          # Pass the env vars above into the remote shell session.
          envs: IMAGE_TAG,GHCR_OWNER,GHCR_TOKEN,DEPLOY_DIR
          # script_stop: true causes the SSH action to exit 1 if any remote
          # command fails (equivalent to `set -e` for the action wrapper).
          script_stop: true
          script: |
            set -euo pipefail

            # Guard: IMAGE_TAG is substituted into a sed expression below, so
            # refuse anything that is not a syntactically valid Docker tag
            # (letters, digits, '_', '.', '-'; max 128 chars) before touching
            # the VM's configuration.
            if ! printf '%s\n' "${IMAGE_TAG}" | grep -qE '^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$'; then
              echo "ERROR: IMAGE_TAG is not a valid Docker image tag; aborting before any change."
              exit 1
            fi

            echo "=== [1/7] Authenticate with GHCR ==="
            echo "${GHCR_TOKEN}" | docker login ghcr.io \
              -u "${GHCR_OWNER}" \
              --password-stdin

            echo "=== [2/7] Move to deploy directory ==="
            cd "${DEPLOY_DIR}"

            echo "=== [3/7] Write IMAGE_TAG to .env ==="
            # docker-compose automatically reads .env from the project directory.
            # We update only the IMAGE_TAG line, leaving all other variables intact.
            # Those other variables (DB passwords, API keys, etc.) are set manually
            # on the VM and are never managed by this pipeline.
            if grep -q '^IMAGE_TAG=' .env 2>/dev/null; then
              sed -i "s|^IMAGE_TAG=.*|IMAGE_TAG=${IMAGE_TAG}|" .env
            else
              echo "IMAGE_TAG=${IMAGE_TAG}" >> .env
            fi

            echo "=== [4/7] Pull new images ==="
            # Old containers are still serving requests at this point.
            # Pulling before stopping minimises downtime.
            docker compose pull backend frontend

            echo "=== [5/7] Run Alembic migrations ==="
            # Runs the NEW migration code in a throwaway container against
            # the live database, while the OLD application containers are
            # still handling requests.
            #
            # REQUIREMENT: every migration must be backward-compatible with
            # the currently deployed application version. No renames or drops
            # in the same deploy as an app change.
            docker compose run \
              --rm \
              --no-deps \
              backend \
              alembic upgrade head

            echo "=== [6/7] Restart application services ==="
            # --no-deps: only the services named here (backend, frontend) are
            #   recreated; linked services such as postgres or nginx are not
            #   started or restarted by this command.
            # --pull never: images were already pulled in step 4; skip re-pulling.
            docker compose up -d \
              --no-deps \
              --pull never \
              backend frontend

            echo "=== [7/7] Health check ==="
            MAX_RETRIES=30
            SLEEP_SECS=5
            # Upper bound for a single probe so a hung connection cannot stall
            # the deploy indefinitely; a timed-out probe counts as a failed
            # attempt and the loop moves on.
            CURL_TIMEOUT_SECS=5
            for i in $(seq 1 "${MAX_RETRIES}"); do
              echo "Attempt ${i}/${MAX_RETRIES}..."
              if curl -sf --max-time "${CURL_TIMEOUT_SECS}" http://localhost:8000/health > /dev/null; then
                echo "Health check passed."
                break
              fi
              if [ "${i}" -eq "${MAX_RETRIES}" ]; then
                echo "ERROR: Health check failed after ${MAX_RETRIES} attempts ($(( MAX_RETRIES * SLEEP_SECS ))s)."
                echo "The containers are running but the app is not responding."
                echo "Investigate: docker compose logs --tail=100 backend"
                exit 1
              fi
              sleep "${SLEEP_SECS}"
            done

            echo ""
            echo "=== Deployment complete ==="
            echo "Running image tag: ${IMAGE_TAG}"
            docker compose ps

      - name: Write deployment summary
        # Runs even when the SSH step failed so the outcome is always recorded.
        if: always()
        # Context values are handed over as environment variables instead of
        # being expanded into the script text, so they are treated as data by
        # the shell and cannot inject commands on the runner.
        env:
          IMAGE_TAG: ${{ github.event.inputs.image_tag }}
          TRIGGERED_BY: ${{ github.actor }}
          JOB_STATUS: ${{ job.status }}
        # NOTE(review): the rollback text below assumes an older image can run
        # against the current database. The deploy script always runs
        # `alembic upgrade head` from the image being deployed; if the newer
        # deploy applied revisions the older image does not contain, that
        # migration step may fail — confirm the rollback procedure.
        run: |
          {
            echo "## Deployment Summary"
            echo ""
            echo "| Field | Value |"
            echo "|-------|-------|"
            echo "| Image tag | \`${IMAGE_TAG}\` |"
            echo "| Triggered by | @${TRIGGERED_BY} |"
            echo "| Timestamp | $(date -u '+%Y-%m-%dT%H:%M:%SZ') |"
            echo "| Status | **${JOB_STATUS}** |"
            echo ""
            echo "### Rollback"
            echo "To roll back, trigger this workflow again with a previous git SHA."
            echo "Rolling back = deploying an older tag. No special rollback workflow is needed."
          } >> "${GITHUB_STEP_SUMMARY}"