Health Monitor #1564

Workflow file for this run

.github/workflows/health-monitor.yml at 042cbfc

	# Protocol Guide Health Check Monitoring
	# Periodically checks the health of production services
	# (staging can be checked on demand through the manual trigger).

	name: Health Monitor

	on:
	  # Run every 30 minutes (reduced from 15 to prevent runner queue buildup).
	  # A schedule needs all five cron fields:
	  # minute, hour, day-of-month, month, day-of-week.
	  schedule:
	    - cron: '*/30 * * * *'
	  # Allow manual trigger
	  workflow_dispatch:
	    inputs:
	      environment:
	        description: 'Environment to check'
	        required: true
	        default: 'production'
	        type: choice
	        options:
	          - production
	          - staging

	# Cancel in-progress runs when a new run is triggered.
	# Every run shares one group, whichever environment it checks.
	concurrency:
	  group: health-monitor
	  cancel-in-progress: true

	jobs:
	  # Probes the health and liveness endpoints, writes a run summary and
	  # fails the job when the health endpoint does not answer with a 2xx.
	  health-check:
	    name: Health Check
	    runs-on: ubuntu-latest
	    timeout-minutes: 5

	    steps:
	      - name: Check production health endpoint
	        id: health
	        run: |
	          # URL is supplied through HEALTH_CHECK_URL (see env below);
	          # the production URL is the fallback.
	          HEALTH_URL="${HEALTH_CHECK_URL:-https://protocol-guide.netlify.app/api/health}"

	          echo "Checking health endpoint: ${HEALTH_URL}"

	          # Make the request and capture the body plus a trailing status line.
	          # curl still prints the -w status (as 000) when the transfer fails,
	          # so only its exit code is neutralised here; echoing a second
	          # fallback would duplicate the status line.
	          HTTP_RESPONSE=$(curl -s -w "\n%{http_code}" --connect-timeout 10 --max-time 30 "${HEALTH_URL}" || true)

	          # Split response body and status code
	          HTTP_BODY=$(echo "${HTTP_RESPONSE}" | head -n -1)
	          HTTP_STATUS=$(echo "${HTTP_RESPONSE}" | tail -n 1)

	          # Anything that is not a three-digit code counts as "no response",
	          # which keeps the numeric comparison below well-defined.
	          case "${HTTP_STATUS}" in
	            [0-9][0-9][0-9]) ;;
	            *) HTTP_STATUS="000" ;;
	          esac

	          echo "HTTP Status: ${HTTP_STATUS}"
	          echo "Response: ${HTTP_BODY}"

	          # Set outputs for other steps. The multiline body uses a random
	          # delimiter so a response containing the delimiter text cannot
	          # terminate the value early and inject extra outputs.
	          DELIMITER="$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
	          {
	            echo "status=${HTTP_STATUS}"
	            echo "body<<${DELIMITER}"
	            echo "${HTTP_BODY}"
	            echo "${DELIMITER}"
	          } >> "$GITHUB_OUTPUT"

	          # Check if healthy (any 2xx status)
	          if [ "${HTTP_STATUS}" -ge 200 ] && [ "${HTTP_STATUS}" -lt 300 ]; then
	            echo "Health check passed"
	            echo "healthy=true" >> "$GITHUB_OUTPUT"
	          else
	            echo "Health check failed"
	            echo "healthy=false" >> "$GITHUB_OUTPUT"
	          fi
	        env:
	          # inputs.environment is empty on scheduled runs, which selects production.
	          HEALTH_CHECK_URL: ${{ inputs.environment == 'staging' && 'https://staging-protocol-guide.netlify.app/api/health' || 'https://protocol-guide.netlify.app/api/health' }}

	      - name: Check liveness endpoint
	        id: liveness
	        run: |
	          # URL is supplied through LIVENESS_URL (see env below);
	          # the production URL is the fallback.
	          LIVE_URL="${LIVENESS_URL:-https://protocol-guide.netlify.app/api/live}"

	          echo "Checking liveness endpoint: ${LIVE_URL}"

	          # curl prints 000 itself when the request fails, so only its exit
	          # code is neutralised; echoing a fallback would yield "000000".
	          HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 --max-time 10 "${LIVE_URL}" || true)
	          HTTP_STATUS="${HTTP_STATUS:-000}"

	          echo "HTTP Status: ${HTTP_STATUS}"

	          # Only an exact 200 counts as alive
	          if [ "${HTTP_STATUS}" = "200" ]; then
	            echo "Liveness check passed"
	            echo "alive=true" >> "$GITHUB_OUTPUT"
	          else
	            echo "Liveness check failed"
	            echo "alive=false" >> "$GITHUB_OUTPUT"
	          fi
	        env:
	          LIVENESS_URL: ${{ inputs.environment == 'staging' && 'https://staging-protocol-guide.netlify.app/api/live' || 'https://protocol-guide.netlify.app/api/live' }}

	      - name: Create health check summary
	        # Step outputs are passed through env rather than interpolated into
	        # the script, so response text is never evaluated as shell code.
	        env:
	          HEALTH_STATUS: ${{ steps.health.outputs.status }}
	          HEALTH_BODY: ${{ steps.health.outputs.body }}
	          HEALTH_HEALTHY: ${{ steps.health.outputs.healthy }}
	          LIVENESS_ALIVE: ${{ steps.liveness.outputs.alive }}
	        run: |
	          {
	            echo "## Health Check Results"
	            echo ""
	            echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
	            echo ""
	          } >> "$GITHUB_STEP_SUMMARY"

	          if [ "${HEALTH_HEALTHY}" = "true" ]; then
	            # Healthy: render a results table including the liveness outcome
	            {
	              echo "| Check | Status |"
	              echo "|-------|--------|"
	              echo "| Health Endpoint | Passed |"
	              if [ "${LIVENESS_ALIVE}" = "true" ]; then
	                echo "| Liveness Endpoint | Passed |"
	              else
	                echo "| Liveness Endpoint | Failed |"
	              fi
	            } >> "$GITHUB_STEP_SUMMARY"
	          else
	            # Unhealthy: show the status code and the raw response body
	            {
	              echo "### Health Check Failed"
	              echo ""
	              echo "HTTP Status: ${HEALTH_STATUS}"
	              echo ""
	              echo '```json'
	              echo "${HEALTH_BODY}"
	              echo '```'
	            } >> "$GITHUB_STEP_SUMMARY"
	          fi

	      # Runs last so the summary above is written before the job fails.
	      - name: Fail if unhealthy
	        if: steps.health.outputs.healthy != 'true'
	        env:
	          HEALTH_STATUS: ${{ steps.health.outputs.status }}
	        run: |
	          echo "Health check failed - service may be down or degraded"
	          echo "Status: ${HEALTH_STATUS}"
	          exit 1

	  # Opens a tracking issue when the health-check job fails, or comments on
	  # the already-open one.
	  notify-on-failure:
	    name: Notify on Failure
	    runs-on: ubuntu-latest
	    needs: [health-check]
	    if: failure()
	    # The script lists, creates and comments on issues, so the token needs
	    # issue write access even when the repository default is read-only.
	    permissions:
	      issues: write

	    steps:
	      - name: Create failure issue
	        uses: actions/github-script@v7
	        env:
	          # Empty on scheduled runs, which always check production.
	          TARGET_ENVIRONMENT: ${{ inputs.environment || 'production' }}
	        with:
	          script: |
	            const targetEnv = process.env.TARGET_ENVIRONMENT || 'production';
	            const envLabel = targetEnv.charAt(0).toUpperCase() + targetEnv.slice(1);
	            const repoUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}`;
	            // Relative links in an issue body do not resolve to repository
	            // files, so the runbook links are built as absolute blob URLs.
	            const docsUrl = `${repoUrl}/blob/${context.sha}/docs/runbook`;

	            const title = `Health Check Failed - ${new Date().toISOString()}`;
	            const body = `
	            ## ${envLabel} Health Check Failed

	            Time: ${new Date().toISOString()}
	            Workflow Run: [View Details](${repoUrl}/actions/runs/${process.env.GITHUB_RUN_ID})

	            ### Action Required

	            Please investigate the ${targetEnv} environment health:

	            1. Check the health endpoint manually
	            2. Review recent deployments
	            3. Check server logs for errors
	            4. Verify database connectivity
	            5. Check external service dependencies

	            ### Runbook

	            - [Health Check Documentation](${docsUrl}/health-checks.md)
	            - [Incident Response](${docsUrl}/incident-response.md)

	            ---
	            This issue was automatically created by the health monitoring workflow.
	            `;

	            // Check for existing open health check issues
	            const existingIssues = await github.rest.issues.listForRepo({
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              state: 'open',
	              labels: 'health-check-failure'
	            });

	            if (existingIssues.data.length === 0) {
	              await github.rest.issues.create({
	                owner: context.repo.owner,
	                repo: context.repo.repo,
	                title: title,
	                body: body,
	                labels: ['health-check-failure', 'priority:high', 'automated']
	              });
	              console.log('Created new health check failure issue');
	            } else {
	              // Add comment to existing issue instead of opening a duplicate
	              const issue = existingIssues.data[0];
	              await github.rest.issues.createComment({
	                owner: context.repo.owner,
	                repo: context.repo.repo,
	                issue_number: issue.number,
	                body: `### Health check failed again\n\nTime: ${new Date().toISOString()}\nRun: [View Details](${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID})`
	              });
	              console.log(`Added comment to existing issue #${issue.number}`);
	            }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Health Monitor #1564

Workflow file

Health Monitor #1564

Uh oh!

Workflow file for this run