From 557f3a52e82fbcd8ed9946af450f455dea6424e5 Mon Sep 17 00:00:00 2001
From: Riley Seaburg
Date: Mon, 22 Dec 2025 17:08:25 +0000
Subject: [PATCH] Fix deploy script rolling deployment timeout issue

- Add --no-wait flag to cf push so it succeeds after first instance is healthy
- Add wait_for_deployment_complete function to explicitly wait for all instances
- Improve retry logic to detect and wait for active deployments instead of
  canceling them with a new push
- Increases effective deployment timeout from 180s to 15 minutes

This fixes the infinite loop where rolling deployments kept getting canceled
before all 18 instances could be replaced, leaving old instances running.
---
 .circleci/deploy.sh | 105 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 98 insertions(+), 7 deletions(-)

diff --git a/.circleci/deploy.sh b/.circleci/deploy.sh
index fc3e3629a..a70ce758c 100755
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -87,6 +87,66 @@ wait_for_deployment() {
     return 0
 }
 
+# Wait for the most recent deployment of an app to fully complete
+# (all instances replaced).
+#
+# Arguments: $1 - CF app name
+# Outputs:   progress messages on stdout
+# Returns:   0 if the deployment finalized with reason DEPLOYED;
+#            1 if the app GUID cannot be resolved, the deployment finalized
+#            with any other reason, or max_wait seconds elapsed
+wait_for_deployment_complete() {
+    local app_name="$1"
+    local max_wait=900 # 15 minutes max for full deployment
+    local wait_interval=15
+    local waited=0
+    local app_guid deployment_info status reason
+
+    echo "Waiting for deployment of $app_name to complete..."
+
+    # Declare and assign separately so a failed lookup is not masked by
+    # 'local'; an empty GUID would leave the query below without an app filter.
+    app_guid=$(cf app "$app_name" --guid) || app_guid=""
+    if [ -z "$app_guid" ]; then
+        echo "✗ Could not determine GUID for $app_name"
+        return 1
+    fi
+
+    while [ "$waited" -lt "$max_wait" ]; do
+        # Get the most recent deployment status
+        deployment_info=$(cf curl "/v3/deployments?app_guids=${app_guid}&order_by=-created_at&per_page=1" 2>/dev/null) || deployment_info=""
+        # Allow optional spaces after the colon so both compact and
+        # pretty-printed JSON match; cut field 4 is the value either way.
+        status=$(echo "$deployment_info" | grep -o '"value": *"[^"]*"' | head -1 | cut -d'"' -f4 || echo "")
+        reason=$(echo "$deployment_info" | grep -o '"reason": *"[^"]*"' | head -1 | cut -d'"' -f4 || echo "")
+
+        if [ "$status" == "FINALIZED" ]; then
+            if [ "$reason" == "DEPLOYED" ]; then
+                echo "✓ Deployment completed successfully"
+                return 0
+            elif [ "$reason" == "CANCELED" ]; then
+                echo "✗ Deployment was canceled"
+                return 1
+            else
+                echo "✗ Deployment finalized with reason: $reason"
+                return 1
+            fi
+        fi
+
+        if [ "$status" == "ACTIVE" ]; then
+            echo "Deployment in progress (status: $status), waiting ${wait_interval}s... (waited ${waited}s of ${max_wait}s)"
+        else
+            echo "Deployment status: $status, reason: $reason"
+        fi
+
+        sleep "$wait_interval"
+        waited=$((waited + wait_interval))
+    done
+
+    echo "Warning: Timed out waiting for deployment to complete after ${max_wait}s"
+    return 1
+}
+
 # Run migrations as a CF task and wait for completion
 run_migrations() {
     local app_name="$1"
@@ -184,22 +244,53 @@ cf_push_with_retry() {
         set +e
         if [ -n "$manifest_path" ]; then
             echo "Using manifest: $manifest_path"
-            cf push "$app_name" -f "$manifest_path" --strategy rolling -t 180
+            cf push "$app_name" -f "$manifest_path" --strategy rolling -t 180 --no-wait
         else
-            cf push "$app_name" --strategy rolling -t 180
+            cf push "$app_name" --strategy rolling -t 180 --no-wait
         fi
         exit_code=$?
         set -e
 
         if [ $exit_code -eq 0 ]; then
-            echo "Successfully pushed $app_name"
-            release_deploy_lock "$app_name"
-            trap - EXIT # Clear the trap
-            return 0
+            echo "Push initiated successfully, waiting for full deployment to complete..."
+            if wait_for_deployment_complete "$app_name"; then
+                echo "Successfully deployed $app_name"
+                release_deploy_lock "$app_name"
+                trap - EXIT # Clear the trap
+                return 0
+            else
+                echo "Deployment did not complete successfully"
+                # Continue to retry logic below
+            fi
         fi
 
         if [ $i -lt $max_retries ]; then
-            echo "Push failed (exit code: $exit_code), waiting ${retry_delay}s before retry..."
+            echo "Push failed or deployment incomplete (exit code: $exit_code), checking for active deployments..."
+
+            # Check if there's an active deployment that we should wait for instead of retrying
+            local app_guid active_deployments
+            app_guid=$(cf app "$app_name" --guid 2>/dev/null) || app_guid=""
+            if [ -n "$app_guid" ]; then
+                active_deployments=$(cf curl "/v3/deployments?app_guids=${app_guid}&status_values=ACTIVE" 2>/dev/null) || active_deployments=""
+
+                # Match on the captured response instead of 'grep -c ... || echo 0':
+                # grep -c prints "0" AND exits non-zero when nothing matches, so that
+                # fallback yields "0<newline>0" and breaks a numeric -gt test.
+                case "$active_deployments" in
+                *'"ACTIVE"'*)
+                    echo "Active deployment detected, waiting for it to complete instead of retrying..."
+                    if wait_for_deployment_complete "$app_name"; then
+                        echo "Existing deployment completed successfully"
+                        release_deploy_lock "$app_name"
+                        trap - EXIT
+                        return 0
+                    fi
+                    echo "Existing deployment did not complete successfully, will retry..."
+                    ;;
+                esac
+            fi
+
+            echo "Waiting ${retry_delay}s before retry..."
             sleep $retry_delay
             # Re-check for in-progress deployments before retrying
             wait_for_deployment "$app_name"