forked from arayabrain/barebone-studio
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcloud-startup.sh
More file actions
249 lines (217 loc) · 9.41 KB
/
cloud-startup.sh
File metadata and controls
249 lines (217 loc) · 9.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#!/bin/bash
# Cloud startup entrypoint: maps env vars, waits for the database, runs
# migrations and seeding, then launches the backend application.
set -e # Abort on the first failing command

# Bridge infrastructure naming (DB_*) onto the MYSQL_* variables that the
# application's config.py reads, keeping infra naming conventions intact.
export MYSQL_SERVER="${DB_HOST}"
export MYSQL_USER="${DB_USER}"
export MYSQL_PASSWORD="${DB_PASSWORD}"
export MYSQL_DATABASE="${DB_NAME}"
# Fetch Firebase config from Secrets Manager (overrides files baked into Docker image)

# fetch_secret SECRET_ID
# Prints the SecretString of SECRET_ID to stdout; prints nothing when the
# lookup fails (missing secret, no credentials, no aws CLI).
fetch_secret() {
  aws secretsmanager get-secret-value \
    --secret-id "$1" \
    --query "SecretString" --output text \
    --region "${AWS_DEFAULT_REGION:-ap-northeast-1}" 2>/dev/null || echo ""
}

# write_secret_file SECRET_ID DEST_FILE LABEL
# Fetches SECRET_ID and writes it to DEST_FILE; on failure logs a warning
# and leaves any default file baked into the image untouched.
write_secret_file() {
  local value
  value=$(fetch_secret "$1")
  if [ -n "$value" ]; then
    echo "$value" > "$2"
    echo "$3 written from Secrets Manager"
  else
    echo "WARNING: Could not fetch $3 from Secrets Manager. Using defaults."
  fi
}

if [ -n "$ENV_PREFIX" ]; then
  echo "Fetching Firebase config from Secrets Manager for environment: ${ENV_PREFIX}"
  FIREBASE_CONFIG_DIR="/app/studio/config/auth"
  mkdir -p "$FIREBASE_CONFIG_DIR"
  write_secret_file "${ENV_PREFIX}-optinist/firebase/config" \
    "$FIREBASE_CONFIG_DIR/firebase_config.json" "Firebase config"
  write_secret_file "${ENV_PREFIX}-optinist/firebase/private-key" \
    "$FIREBASE_CONFIG_DIR/firebase_private.json" "Firebase private key"
fi
echo 'Starting container'
echo 'Attempting to connect to RDS'

# Log connection parameters (never the password) for debugging
echo "DB_HOST: ${MYSQL_SERVER}"
echo "DB_USER: ${MYSQL_USER}"
echo "DB_NAME: ${MYSQL_DATABASE}"

# Wait for RDS to be available
# This is necessary because RDS might still be initializing when container starts
# (e.g., after dev scheduler starts the environment — RDS takes 5-10 min)
# Tries 60 times with 10 second intervals (total 10 minutes timeout)
max_tries=60
counter=0

# Build SSL options based on MYSQL_SSL_MODE; an array keeps the options
# word-split-safe when passed to mysql (empty array expands to nothing).
ssl_opts=()
if [ -n "${MYSQL_SSL_MODE}" ]; then
  case "${MYSQL_SSL_MODE}" in
    DISABLED) ssl_opts=(--skip-ssl) ;;
    *)        ssl_opts=(--ssl) ;;
  esac
fi

until mysql "${ssl_opts[@]}" -h "${MYSQL_SERVER}" -u "${MYSQL_USER}" -p"${MYSQL_PASSWORD}" "${MYSQL_DATABASE}" -e 'SELECT 1;'
do
  sleep 10
  [[ $counter -eq $max_tries ]] && echo "Failed to connect to Database" && exit 1
  echo "Attempt $counter: Waiting for Database..."
  # BUGFIX: counter=$((counter+1)) instead of ((counter++)) — the latter
  # exits with status 1 when the pre-increment value is 0, which killed
  # the script under 'set -e' on the very first retry.
  counter=$((counter + 1))
done
echo 'Database connection successful'
# Run database migrations using alembic
# This ensures all database tables and schemas are up to date
cd /app
# Verify database SSL connection before running migrations.
# Diagnostic only: prints the negotiated cipher (or "disabled"); a failure
# is downgraded to a warning and does not block startup.
# NOTE(review): depends on project modules (studio.app.common.db.config)
# and sqlalchemy baked into the image — assumed present; verify in image build.
echo "Verifying database SSL connection..."
python3 -c "
from studio.app.common.db.config import DATABASE_CONFIG, get_ssl_creator
from sqlalchemy import create_engine, text
creator = get_ssl_creator()
kwargs = {'creator': creator} if creator else {}
engine = create_engine(DATABASE_CONFIG.DATABASE_URL, **kwargs)
with engine.connect() as c:
    r = c.execute(text('SHOW STATUS LIKE \"Ssl_cipher\"'))
    cipher = r.fetchone()
    print(f'SSL: {cipher[1] if cipher and cipher[1] else \"disabled\"}')
engine.dispose()
" 2>&1 || echo "WARNING: SSL verification failed (see above)"
# Run Alembic upgrade - if migrations fail, the container will exit
# This causes ECS to mark the deployment as failed and revert to the previous version
echo "Running Alembic upgrade..."
if ! alembic upgrade head 2>&1; then
  echo "ERROR: Database migration failed!"
  echo "Container will exit to prevent data loss and trigger deployment rollback."
  echo "Please investigate the migration error before redeploying."
  exit 1
fi
echo "Database migrations completed successfully"
# Seed subscription plans when SUBSCRIPTION_PLANS_CONFIG is provided;
# a seeding failure is logged but never blocks startup.
if [ -z "$SUBSCRIPTION_PLANS_CONFIG" ]; then
  echo "SUBSCRIPTION_PLANS_CONFIG not set, skipping subscription plan seeding"
else
  echo "Seeding subscription plans..."
  python3 /app/scripts/seed_subscription_plans.py || echo "WARNING: Subscription plan seeding failed (non-fatal)"
fi

# Create test users when TEST_USERS_CONFIG is provided; also non-fatal.
if [ -z "$TEST_USERS_CONFIG" ]; then
  echo "TEST_USERS_CONFIG not set, skipping test user creation"
else
  echo "Creating test users..."
  cd /app/scripts && python3 create_test_users.py || echo "WARNING: Test user creation failed (non-fatal)"
  cd /app
fi
# Verify backend configuration: refuse to start unless the listener
# address is fully specified by the environment.
echo "Host: $BACKEND_HOST"
echo "Port: $BACKEND_PORT"
if [[ -z "$BACKEND_HOST" || -z "$BACKEND_PORT" ]]; then
  echo "Please provide 'BACKEND_HOST' and 'BACKEND_PORT' environment variables"
  exit 1
fi

# Configure Uvicorn worker processes.
# Workers handle concurrent requests; recommended formula: (2 x CPU cores) + 1.
# t3.large has 2 vCPUs, so the optimal range is 2-5 workers.
# Overridable via UVICORN_WORKERS; defaults to 5 for production use.
UVICORN_WORKERS="${UVICORN_WORKERS:-5}"
echo "Uvicorn workers: $UVICORN_WORKERS"
# Get EC2 instance ID for free tier user tracking
# This is required for the FreeUserActivityMiddleware to track users across instances
echo "Retrieving EC2 instance ID from ECS container metadata..."
INSTANCE_ID=""

# discover_instance_id
# Resolves the EC2 instance ID backing this ECS task via the task metadata
# endpoint plus the ECS API, storing the result in the global INSTANCE_ID.
# Every step is best-effort: on any failure it logs and leaves INSTANCE_ID empty.
discover_instance_id() {
  local task_json task_arn cluster_name ci_arn

  if [ -z "$ECS_CONTAINER_METADATA_URI_V4" ]; then
    echo "ECS_CONTAINER_METADATA_URI_V4 not available"
    return 0
  fi

  task_json=$(curl -s "${ECS_CONTAINER_METADATA_URI_V4}/task" 2>/dev/null)
  task_arn=$(echo "$task_json" | python3 -c "import sys, json; print(json.load(sys.stdin)['TaskARN'])" 2>/dev/null)
  if [ -z "$task_arn" ]; then
    echo "Could not retrieve task ARN from ECS metadata"
    return 0
  fi
  echo "Found ECS task: $task_arn"

  # Extract cluster name from task ARN (format: arn:aws:ecs:region:account:task/cluster-name/task-id)
  cluster_name=$(echo "$task_arn" | cut -d'/' -f2)
  echo "Cluster: $cluster_name"

  # Map the task to its container instance
  ci_arn=$(aws ecs describe-tasks \
    --cluster "$cluster_name" \
    --tasks "$task_arn" \
    --query 'tasks[0].containerInstanceArn' \
    --output text 2>/dev/null)
  if [ -z "$ci_arn" ] || [ "$ci_arn" = "None" ]; then
    echo "Could not retrieve container instance ARN"
    return 0
  fi
  echo "Container instance ARN: $ci_arn"

  # Map the container instance to its EC2 instance ID
  INSTANCE_ID=$(aws ecs describe-container-instances \
    --cluster "$cluster_name" \
    --container-instances "$ci_arn" \
    --query 'containerInstances[0].ec2InstanceId' \
    --output text 2>/dev/null)
  if [ -n "$INSTANCE_ID" ] && [ "$INSTANCE_ID" != "None" ]; then
    echo "Got instance ID from ECS container metadata: $INSTANCE_ID"
  fi
}

# Metadata lookups must not kill startup — run them with exit-on-error off.
set +e
discover_instance_id
set -e

# Export for the application to use
if [ -n "$INSTANCE_ID" ] && [ "$INSTANCE_ID" != "None" ]; then
  export INSTANCE_ID
  echo "INSTANCE_ID set to: $INSTANCE_ID"
else
  echo "WARNING: Could not retrieve EC2 instance ID. Free tier user tracking will not work."
  echo "This is expected in local development, but should not happen in production."
fi
# Launch the backend in the background; the PID is reaped at the bottom of
# the script so the container lives exactly as long as the application.
echo "Starting application..."
poetry run python main.py --host="$BACKEND_HOST" --port="$BACKEND_PORT" --workers="$UVICORN_WORKERS" &
APP_PID=$!

# Give the app boot time matching the ECS health check startPeriod.
echo "Waiting for initial startup..."
sleep 30

# One local health probe before the load balancer starts checking.
echo "Verifying initial health..."
curl -v "http://${BACKEND_HOST}:${BACKEND_PORT}/health" || {
  # Don't exit - let ECS handle it
  echo "Initial health check failed"
}
# Load balancer health check function
# Verifies that the application is accessible through the load balancer.
# Returns 0 once $AWS_SERVICE_URL responds (or immediately when it is not
# configured), 1 if it is still unreachable after ~5 minutes.
check_load_balancer() {
  if [ -z "$AWS_SERVICE_URL" ]; then
    echo "AWS_SERVICE_URL not provided, skipping load balancer check"
    return 0
  fi

  echo "Checking load balancer status..."
  # BUGFIX: 'local -r' instead of bare 'readonly' — readonly inside a
  # function creates a GLOBAL constant and would error if the function
  # ever ran a second time.
  local -r max_tries=30
  local -r wait_seconds=10
  local counter=0

  # Try for 5 minutes (30 attempts * 10 seconds)
  until curl -s -o /dev/null --max-time "${wait_seconds}" "$AWS_SERVICE_URL"
  do
    sleep "${wait_seconds}"
    if [ "$counter" -ge "$max_tries" ]; then
      echo "Load balancer not ready after 5 minutes"
      return 1
    fi
    echo "Attempt $counter: Waiting for load balancer..."
    # BUGFIX: counter=$((counter+1)) instead of ((counter++)) — the latter
    # returns status 1 when the old value is 0, which killed the background
    # subshell (it inherits 'set -e') on the very first retry.
    counter=$((counter + 1))
  done
  echo "Load balancer is ready"
  return 0
}
# Kick off the load balancer probe in the background so it runs in
# parallel while the application finishes starting.
check_load_balancer &
LB_CHECK_PID=$!

# Block on the application process: the container stays up for as long as
# the app runs, then the load balancer checker is reaped as well.
wait "$APP_PID"
wait "$LB_CHECK_PID"