Skip to content

Merge remote-tracking branch 'origin/develop' into develop #26

Merge remote-tracking branch 'origin/develop' into develop

Merge remote-tracking branch 'origin/develop' into develop #26

# Display name of this workflow.
name: 'Incident Response'

Check failure on line 1 in .github/workflows/incident-response.yml

View workflow run for this annotation

GitHub Actions / .github/workflows/incident-response.yml

Invalid workflow file

(Line: 4, Col: 3): Unexpected value 'branches', (Line: 4, Col: 13): A sequence was not expected
# Trigger: manual dispatch only.
#
# A bare `branches:` key directly under `on:` is not valid workflow syntax
# (it is only a filter beneath events such as `push` / `pull_request`) and is
# what produced "Unexpected value 'branches'". Every job in this workflow reads
# the `workflow_dispatch` inputs below, so no push trigger is configured.
on:
  workflow_dispatch:
    inputs:
      # Selects which playbook the workflow prints and generates.
      incident_type:
        description: 'Type of incident'
        required: true
        type: choice
        options:
          - production_down
          - performance_degradation
          - security_breach
          - data_corruption
          - deployment_failure
          - high_error_rate
      # Used in the issue title and in the `severity-<value>` label.
      severity:
        description: 'Incident severity'
        required: true
        type: choice
        options:
          - critical
          - high
          - medium
          - low
      # Free text; copied into the incident issue body and the notification.
      description:
        description: 'Incident description'
        required: true
        type: string
      # Free text; defaults to 'all' when the operator leaves it blank.
      affected_services:
        description: 'Affected services (comma-separated)'
        required: false
        type: string
        default: 'all'
# Job pipeline:
#   create-incident-issue -> gather-diagnostics -> execute-incident-playbook
#   -> notify-team -> summary
#
# Operator-supplied inputs are passed to scripts through `env:` instead of
# being interpolated with `${{ }}` into shell / JavaScript source, so a quote,
# backtick or `$(...)` in the free-text fields cannot break or hijack a step.
jobs:
  # Opens the tracking issue and exposes its number to the downstream jobs.
  create-incident-issue:
    name: Create Incident Issue
    runs-on: ubuntu-latest
    # Explicit token scopes: issue creation fails when the repository's
    # default GITHUB_TOKEN permission is read-only.
    permissions:
      contents: read
      issues: write
    outputs:
      # `result` is the script's return value (the new issue number).
      issue_number: ${{ steps.create_issue.outputs.result }}
    steps:
      - uses: actions/checkout@v4
      - name: Create incident issue
        id: create_issue
        uses: actions/github-script@v7
        env:
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
          SEVERITY: ${{ github.event.inputs.severity }}
          DESCRIPTION: ${{ github.event.inputs.description }}
          AFFECTED_SERVICES: ${{ github.event.inputs.affected_services }}
        with:
          script: |
            // Inputs arrive via the environment, never as inlined source text.
            const incidentType = process.env.INCIDENT_TYPE;
            const severity = process.env.SEVERITY;
            const description = process.env.DESCRIPTION;
            const affectedServices = process.env.AFFECTED_SERVICES;
            const timestamp = new Date().toISOString();
            // Template lines are kept flush with the script's base indent so
            // the Markdown is not rendered as an indented code block.
            const issueBody = `
            # Incident Report
            **Type**: ${incidentType}
            **Severity**: ${severity.toUpperCase()}
            **Reported**: ${timestamp}
            **Affected Services**: ${affectedServices}
            ## Description
            ${description}
            ## Status
            - [x] Incident detected
            - [ ] Team notified
            - [ ] Investigation started
            - [ ] Root cause identified
            - [ ] Mitigation deployed
            - [ ] Incident resolved
            - [ ] Post-mortem scheduled
            ## Timeline
            - ${timestamp}: Incident detected and reported
            ## Investigation Notes
            (Add investigation notes here)
            ## Mitigation Steps
            (Document mitigation steps here)
            ## Post-incident Actions
            - [ ] Conduct post-mortem
            - [ ] Update runbooks
            - [ ] Implement preventive measures
            - [ ] Update monitoring/alerts
            ---
            *Auto-generated by Incident Response workflow*
            `;
            const issue = await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: `[INCIDENT] ${incidentType} - ${severity.toUpperCase()}`,
              body: issueBody,
              labels: ['incident', `severity-${severity}`, `type-${incidentType}`]
            });
            console.log(`Created issue #${issue.data.number}`);
            return issue.data.number;

  # Writes diagnostics.md, uploads it as an artifact and posts it on the issue.
  gather-diagnostics:
    name: Gather Diagnostic Information
    runs-on: ubuntu-latest
    needs: create-incident-issue
    permissions:
      contents: read
      issues: write
    steps:
      - uses: actions/checkout@v4
        with:
          # The default depth of 1 would make `git log -10` print one commit.
          fetch-depth: 10
      - name: Gather system information
        env:
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        run: |
          # Unquoted delimiter: $(date ...) and the variables must expand here.
          # (With a quoted 'EOF' the timestamp was written out literally.)
          cat << EOF > diagnostics.md
          # Diagnostic Information
          **Timestamp**: $(date -u +"%Y-%m-%d %H:%M:%S UTC")
          **Incident**: #$ISSUE_NUMBER
          ## Git Information
          - Branch: $GITHUB_REF
          - Commit: $GITHUB_SHA
          - Last deployment: (Check deployment logs)
          ## Recent Commits
          EOF
          git log --oneline -10 >> diagnostics.md
          # Quoted delimiter: the remainder is static text, nothing to expand.
          cat << 'EOF' >> diagnostics.md
          ## Recent Workflow Runs
          (Check Actions tab for recent deployments and CI runs)
          ## Configuration Status
          - Django settings: (Check for DEBUG, ALLOWED_HOSTS)
          - Database: MySQL primary, PostgreSQL secondary
          - Session backend: MySQL (django.contrib.sessions.backends.db)
          - Redis usage: NONE (RNF-002)
          ## Monitoring Links
          - Application logs: /var/log/iact/
          - Database monitoring: (MySQL slow query log)
          - Health endpoint: /api/health
          ---
          *Generated by Incident Response workflow*
          EOF
          cat diagnostics.md
      - name: Upload diagnostics
        uses: actions/upload-artifact@v4
        with:
          name: incident-diagnostics
          path: diagnostics.md
          retention-days: 90
      - name: Comment diagnostics on incident issue
        uses: actions/github-script@v7
        env:
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        with:
          script: |
            const fs = require('fs');
            const diagnostics = fs.readFileSync('diagnostics.md', 'utf8');
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: Number(process.env.ISSUE_NUMBER),
              body: diagnostics
            });

  # Prints the checklist for the chosen incident type and attaches a generated
  # playbook file to the issue. Nothing is executed against production; the
  # steps are documentation for the responder to follow manually.
  execute-incident-playbook:
    name: Execute Incident Playbook
    runs-on: ubuntu-latest
    needs: [create-incident-issue, gather-diagnostics]
    permissions:
      contents: read
      issues: write
    steps:
      - uses: actions/checkout@v4
      - name: Execute playbook based on incident type
        env:
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
        run: |
          echo "Executing playbook for: $INCIDENT_TYPE"
          case "$INCIDENT_TYPE" in
            production_down)
              echo "=== Production Down Playbook ==="
              echo "1. Check health endpoint"
              echo "2. Check application server status"
              echo "3. Check database connectivity"
              echo "4. Check disk space"
              echo "5. Review recent deployments"
              echo "6. Consider rollback if recent deployment"
              ;;
            performance_degradation)
              echo "=== Performance Degradation Playbook ==="
              echo "1. Check database query performance"
              echo "2. Check MySQL session table size"
              echo "3. Review slow query log"
              echo "4. Check memory and CPU usage"
              echo "5. Review recent code changes"
              echo "6. Consider scaling resources"
              ;;
            security_breach)
              echo "=== Security Breach Playbook ==="
              echo "1. CRITICAL: Isolate affected systems"
              echo "2. Preserve logs for forensics"
              echo "3. Reset credentials"
              echo "4. Scan for malware"
              echo "5. Review access logs"
              echo "6. Notify security team"
              ;;
            data_corruption)
              echo "=== Data Corruption Playbook ==="
              echo "1. Stop writes to affected data"
              echo "2. Identify scope of corruption"
              echo "3. Restore from backup (if necessary)"
              echo "4. Identify root cause"
              echo "5. Verify data integrity"
              echo "6. Resume operations"
              ;;
            deployment_failure)
              echo "=== Deployment Failure Playbook ==="
              echo "1. Execute rollback plan"
              echo "2. Restore database from backup (if needed)"
              echo "3. Verify rollback successful"
              echo "4. Review deployment logs"
              echo "5. Identify failure cause"
              echo "6. Fix and re-deploy"
              ;;
            high_error_rate)
              echo "=== High Error Rate Playbook ==="
              echo "1. Check error logs"
              echo "2. Identify error patterns"
              echo "3. Check database connectivity"
              echo "4. Check external dependencies"
              echo "5. Review recent changes"
              echo "6. Apply hotfix if needed"
              ;;
            *)
              echo "Unknown incident type: $INCIDENT_TYPE"
              ;;
          esac
          echo ""
          echo "Playbook steps documented. Execute manually as needed."
      - name: Generate incident playbook
        env:
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
          SEVERITY: ${{ github.event.inputs.severity }}
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        run: |
          # Emits the "Immediate Actions" lines for the selected incident type.
          # Kept as a function rather than a `case` nested inside $(...) inside
          # the heredoc, which is hard to read and easy to break when editing.
          # data_corruption and high_error_rate have no dedicated entry and
          # fall through to the generic pointer.
          immediate_actions() {
            case "$INCIDENT_TYPE" in
              production_down)
                echo "1. Verify production is down via health endpoint"
                echo "2. Check application server: sudo systemctl status gunicorn-iact-production"
                echo "3. Check database: mysql -h \$DB_HOST -u root -p -e 'SELECT 1;'"
                echo "4. Check logs: sudo tail -f /var/log/iact/error.log"
                ;;
              performance_degradation)
                echo "1. Check session table: SELECT COUNT(*) FROM django_session;"
                echo "2. Run cleanup: python manage.py clearsessions"
                echo "3. Check slow queries: SELECT * FROM mysql.slow_log LIMIT 10;"
                echo "4. Monitor CPU/memory: top, htop"
                ;;
              security_breach)
                echo "1. ISOLATE affected systems immediately"
                echo "2. Preserve logs: cp /var/log/iact/* /backup/forensics/"
                echo "3. Reset all passwords"
                echo "4. Scan: sudo clamscan -r /var/www/iact"
                ;;
              deployment_failure)
                echo "1. Execute rollback: bash scripts/rollback.sh"
                echo "2. Verify rollback: curl -f https://iact.example.com/api/health"
                echo "3. Check logs: sudo journalctl -u gunicorn-iact-production -n 100"
                ;;
              *)
                echo "Refer to incident type specific runbook"
                ;;
            esac
          }
          cat << EOF > "playbook-$INCIDENT_TYPE.md"
          # Incident Playbook: $INCIDENT_TYPE
          **Severity**: $SEVERITY
          **Incident**: #$ISSUE_NUMBER
          ## Immediate Actions
          $(immediate_actions)
          ## Communication
          - Notify team via Slack/Teams
          - Update status page
          - Notify affected users (if customer-facing)
          ## Investigation Checklist
          - [ ] Review logs
          - [ ] Check recent deployments
          - [ ] Check recent configuration changes
          - [ ] Identify root cause
          - [ ] Document timeline
          ## Mitigation Checklist
          - [ ] Apply immediate fix
          - [ ] Verify fix deployed
          - [ ] Monitor for recurrence
          - [ ] Update incident issue
          ## Resolution Checklist
          - [ ] Incident resolved
          - [ ] Service fully restored
          - [ ] Schedule post-mortem
          - [ ] Update runbooks
          - [ ] Implement preventive measures
          ---
          *Generated by Incident Response workflow*
          EOF
          cat "playbook-$INCIDENT_TYPE.md"
      - name: Upload playbook
        uses: actions/upload-artifact@v4
        with:
          name: incident-playbook
          path: playbook-*.md
          retention-days: 90
      - name: Comment playbook on incident issue
        uses: actions/github-script@v7
        env:
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        with:
          script: |
            const fs = require('fs');
            // The previous step writes exactly one playbook-<type>.md file.
            const files = fs.readdirSync('.').filter(f => f.startsWith('playbook-'));
            if (files.length > 0) {
              const playbook = fs.readFileSync(files[0], 'utf8');
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: Number(process.env.ISSUE_NUMBER),
                body: playbook
              });
            }

  # Prepares the notification text and records on the issue that the team was
  # notified. Runs even when earlier jobs failed (`always()`).
  notify-team:
    name: Notify Team
    runs-on: ubuntu-latest
    needs: [create-incident-issue, execute-incident-playbook]
    if: always()
    permissions:
      issues: write
    steps:
      - name: Prepare notification
        env:
          SEVERITY: ${{ github.event.inputs.severity }}
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
          DESCRIPTION: ${{ github.event.inputs.description }}
          AFFECTED_SERVICES: ${{ github.event.inputs.affected_services }}
        run: |
          # Variable values are inserted verbatim by the shell and are not
          # re-evaluated, so free text in DESCRIPTION cannot run commands.
          cat << EOF > notification.txt
          INCIDENT ALERT
          Severity: $SEVERITY
          Type: $INCIDENT_TYPE
          Issue: #$ISSUE_NUMBER
          Description: $DESCRIPTION
          Affected Services: $AFFECTED_SERVICES
          Actions Required:
          1. Review incident issue #$ISSUE_NUMBER
          2. Follow incident playbook
          3. Update incident issue with progress
          Incident Issue: $GITHUB_SERVER_URL/$GITHUB_REPOSITORY/issues/$ISSUE_NUMBER
          EOF
          cat notification.txt
      - name: Notify via InternalMessage (IACT compliant)
        # NOTE(review): this step only prints instructions; no message is
        # actually sent from the workflow.
        run: |
          echo "Notification prepared (send via InternalMessage system)"
          echo "Note: NO EMAIL per IACT restrictions"
          echo "Use InternalMessage.objects.create() to notify users"
      - name: Comment notification sent
        # Because the job runs under `always()`, the issue may not exist;
        # skip the comment instead of failing on an empty issue number.
        if: needs.create-incident-issue.outputs.issue_number != ''
        uses: actions/github-script@v7
        env:
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        with:
          script: |
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: Number(process.env.ISSUE_NUMBER),
              body: '[PASS] Team notified via InternalMessage system (IACT RNF-002 compliant - NO EMAIL)'
            });

  # Prints the outcome of every upstream job, regardless of their results.
  summary:
    name: Incident Response Summary
    runs-on: ubuntu-latest
    needs: [create-incident-issue, gather-diagnostics, execute-incident-playbook, notify-team]
    if: always()
    steps:
      - name: Generate summary
        env:
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
          SEVERITY: ${{ github.event.inputs.severity }}
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        run: |
          echo "========================================="
          echo "INCIDENT RESPONSE SUMMARY"
          echo "========================================="
          echo "Incident Type: $INCIDENT_TYPE"
          echo "Severity: $SEVERITY"
          echo "Incident Issue: #$ISSUE_NUMBER"
          echo ""
          echo "Status:"
          echo " Issue Created: ${{ needs.create-incident-issue.result }}"
          echo " Diagnostics Gathered: ${{ needs.gather-diagnostics.result }}"
          echo " Playbook Executed: ${{ needs.execute-incident-playbook.result }}"
          echo " Team Notified: ${{ needs.notify-team.result }}"
          echo ""
          echo "Next Steps:"
          echo "1. Follow playbook steps manually"
          echo "2. Update incident issue with progress"
          echo "3. Resolve incident"
          echo "4. Schedule post-mortem"
          echo "========================================="