# Merge remote-tracking branch 'origin/develop' into develop #26
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Incident Response workflow.
#
# Started by hand (workflow_dispatch). The four inputs declared below are read
# by the jobs through `github.event.inputs.*`.
name: Incident Response

on:
  # NOTE(review): the original listed `branches: [ main ]` directly under
  # `on:`. `branches` is a filter that belongs under an event such as `push`
  # or `pull_request`; it is not an event itself. It is kept only as a comment
  # here -- confirm which event (if any) it was meant to restrict.
  # branches: [ main ]
  workflow_dispatch:
    inputs:
      incident_type:
        description: 'Type of incident'
        required: true
        type: choice
        options:
          - production_down
          - performance_degradation
          - security_breach
          - data_corruption
          - deployment_failure
          - high_error_rate
      severity:
        description: 'Incident severity'
        required: true
        type: choice
        options:
          - critical
          - high
          - medium
          - low
      description:
        description: 'Incident description'
        required: true
        type: string
      affected_services:
        description: 'Affected services (comma-separated)'
        required: false
        type: string
        default: 'all'

# The jobs check out the repository and create/comment on issues; grant the
# workflow token exactly those scopes.
permissions:
  contents: read
  issues: write
jobs:
  # Opens the GitHub issue that tracks the incident and exposes its number to
  # the downstream jobs as the `issue_number` output.
  create-incident-issue:
    name: Create Incident Issue
    runs-on: ubuntu-latest
    outputs:
      # github-script publishes the script's return value as `result`.
      issue_number: ${{ steps.create_issue.outputs.result }}
    steps:
      - uses: actions/checkout@v4

      - name: Create incident issue
        id: create_issue
        uses: actions/github-script@v7
        env:
          # Inputs are handed over as environment variables instead of being
          # expanded into the script text, so quotes, backslashes or newlines
          # in the free-form fields cannot break or alter the JavaScript.
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
          SEVERITY: ${{ github.event.inputs.severity }}
          DESCRIPTION: ${{ github.event.inputs.description }}
          AFFECTED_SERVICES: ${{ github.event.inputs.affected_services }}
        with:
          script: |
            const {
              INCIDENT_TYPE: incidentType,
              SEVERITY: severity,
              DESCRIPTION: description,
              AFFECTED_SERVICES: affectedServices,
            } = process.env;
            const timestamp = new Date().toISOString();

            // Markdown body of the issue. The lines sit at the script's base
            // indentation so they reach GitHub without leading spaces.
            const issueBody = `
            # Incident Report
            **Type**: ${incidentType}
            **Severity**: ${severity.toUpperCase()}
            **Reported**: ${timestamp}
            **Affected Services**: ${affectedServices}
            ## Description
            ${description}
            ## Status
            - [x] Incident detected
            - [ ] Team notified
            - [ ] Investigation started
            - [ ] Root cause identified
            - [ ] Mitigation deployed
            - [ ] Incident resolved
            - [ ] Post-mortem scheduled
            ## Timeline
            - ${timestamp}: Incident detected and reported
            ## Investigation Notes
            (Add investigation notes here)
            ## Mitigation Steps
            (Document mitigation steps here)
            ## Post-incident Actions
            - [ ] Conduct post-mortem
            - [ ] Update runbooks
            - [ ] Implement preventive measures
            - [ ] Update monitoring/alerts
            ---
            *Auto-generated by Incident Response workflow*
            `;

            const issue = await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: `[INCIDENT] ${incidentType} - ${severity.toUpperCase()}`,
              body: issueBody,
              labels: ['incident', `severity-${severity}`, `type-${incidentType}`],
            });
            console.log(`Created issue #${issue.data.number}`);

            // Returned value becomes this step's `result` output.
            return issue.data.number;
# Job: writes diagnostics.md, uploads it as an artifact and posts it as a
# comment on the incident issue.
  gather-diagnostics:
    name: Gather Diagnostic Information
    runs-on: ubuntu-latest
    needs: create-incident-issue
    steps:
      - uses: actions/checkout@v4
        with:
          # checkout fetches a single commit by default; the report lists the
          # last 10 commits, so fetch that much history.
          fetch-depth: 10

      - name: Gather system information
        env:
          # Expression values are passed through the environment so the shell
          # never parses them as script text.
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
          GIT_REF: ${{ github.ref }}
          GIT_SHA: ${{ github.sha }}
        run: |
          # Unquoted delimiter on purpose: $(date ...) and the variables must
          # be expanded. (A quoted 'EOF' would emit them as literal text.)
          cat << EOF > diagnostics.md
          # Diagnostic Information
          **Timestamp**: $(date -u +"%Y-%m-%d %H:%M:%S UTC")
          **Incident**: #${ISSUE_NUMBER}
          ## Git Information
          - Branch: ${GIT_REF}
          - Commit: ${GIT_SHA}
          - Last deployment: (Check deployment logs)
          ## Recent Commits
          EOF
          git log --oneline -10 >> diagnostics.md
          # Static text only: a quoted delimiter keeps it verbatim.
          cat << 'EOF' >> diagnostics.md
          ## Recent Workflow Runs
          (Check Actions tab for recent deployments and CI runs)
          ## Configuration Status
          - Django settings: (Check for DEBUG, ALLOWED_HOSTS)
          - Database: MySQL primary, PostgreSQL secondary
          - Session backend: MySQL (django.contrib.sessions.backends.db)
          - Redis usage: NONE (RNF-002)
          ## Monitoring Links
          - Application logs: /var/log/iact/
          - Database monitoring: (MySQL slow query log)
          - Health endpoint: /api/health
          ---
          *Generated by Incident Response workflow*
          EOF
          cat diagnostics.md

      - name: Upload diagnostics
        uses: actions/upload-artifact@v4
        with:
          name: incident-diagnostics
          path: diagnostics.md
          retention-days: 90

      - name: Comment diagnostics on incident issue
        uses: actions/github-script@v7
        env:
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        with:
          script: |
            const fs = require('fs');
            const diagnostics = fs.readFileSync('diagnostics.md', 'utf8');
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: Number(process.env.ISSUE_NUMBER),
              body: diagnostics,
            });
# Job: prints the checklist for the selected incident type, writes a
# playbook-<type>.md file, uploads it and posts it on the incident issue.
# Nothing is executed against production; the steps only produce text.
  execute-incident-playbook:
    name: Execute Incident Playbook
    runs-on: ubuntu-latest
    needs: [create-incident-issue, gather-diagnostics]
    steps:
      - uses: actions/checkout@v4

      - name: Execute playbook based on incident type
        env:
          # Passed through the environment so the input is data, not script text.
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
        run: |
          echo "Executing playbook for: $INCIDENT_TYPE"
          case "$INCIDENT_TYPE" in
            production_down)
              echo "=== Production Down Playbook ==="
              echo "1. Check health endpoint"
              echo "2. Check application server status"
              echo "3. Check database connectivity"
              echo "4. Check disk space"
              echo "5. Review recent deployments"
              echo "6. Consider rollback if recent deployment"
              ;;
            performance_degradation)
              echo "=== Performance Degradation Playbook ==="
              echo "1. Check database query performance"
              echo "2. Check MySQL session table size"
              echo "3. Review slow query log"
              echo "4. Check memory and CPU usage"
              echo "5. Review recent code changes"
              echo "6. Consider scaling resources"
              ;;
            security_breach)
              echo "=== Security Breach Playbook ==="
              echo "1. CRITICAL: Isolate affected systems"
              echo "2. Preserve logs for forensics"
              echo "3. Reset credentials"
              echo "4. Scan for malware"
              echo "5. Review access logs"
              echo "6. Notify security team"
              ;;
            data_corruption)
              echo "=== Data Corruption Playbook ==="
              echo "1. Stop writes to affected data"
              echo "2. Identify scope of corruption"
              echo "3. Restore from backup (if necessary)"
              echo "4. Identify root cause"
              echo "5. Verify data integrity"
              echo "6. Resume operations"
              ;;
            deployment_failure)
              echo "=== Deployment Failure Playbook ==="
              echo "1. Execute rollback plan"
              echo "2. Restore database from backup (if needed)"
              echo "3. Verify rollback successful"
              echo "4. Review deployment logs"
              echo "5. Identify failure cause"
              echo "6. Fix and re-deploy"
              ;;
            high_error_rate)
              echo "=== High Error Rate Playbook ==="
              echo "1. Check error logs"
              echo "2. Identify error patterns"
              echo "3. Check database connectivity"
              echo "4. Check external dependencies"
              echo "5. Review recent changes"
              echo "6. Apply hotfix if needed"
              ;;
            *)
              echo "Unknown incident type: $INCIDENT_TYPE"
              ;;
          esac
          echo ""
          echo "Playbook steps documented. Execute manually as needed."

      - name: Generate incident playbook
        env:
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
          SEVERITY: ${{ github.event.inputs.severity }}
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        run: |
          # Prints the type-specific "Immediate Actions" list. Types without a
          # dedicated branch (data_corruption, high_error_rate) get the
          # generic runbook pointer.
          immediate_actions() {
            case "$INCIDENT_TYPE" in
              production_down)
                echo "1. Verify production is down via health endpoint"
                echo "2. Check application server: sudo systemctl status gunicorn-iact-production"
                echo "3. Check database: mysql -h \$DB_HOST -u root -p -e 'SELECT 1;'"
                echo "4. Check logs: sudo tail -f /var/log/iact/error.log"
                ;;
              performance_degradation)
                echo "1. Check session table: SELECT COUNT(*) FROM django_session;"
                echo "2. Run cleanup: python manage.py clearsessions"
                echo "3. Check slow queries: SELECT * FROM mysql.slow_log LIMIT 10;"
                echo "4. Monitor CPU/memory: top, htop"
                ;;
              security_breach)
                echo "1. ISOLATE affected systems immediately"
                echo "2. Preserve logs: cp /var/log/iact/* /backup/forensics/"
                echo "3. Reset all passwords"
                echo "4. Scan: sudo clamscan -r /var/www/iact"
                ;;
              deployment_failure)
                echo "1. Execute rollback: bash scripts/rollback.sh"
                echo "2. Verify rollback: curl -f https://iact.example.com/api/health"
                echo "3. Check logs: sudo journalctl -u gunicorn-iact-production -n 100"
                ;;
              *)
                echo "Refer to incident type specific runbook"
                ;;
            esac
          }

          PLAYBOOK_FILE="playbook-$INCIDENT_TYPE.md"

          # Unquoted delimiter: the variables and $(immediate_actions) expand.
          cat << EOF > "$PLAYBOOK_FILE"
          # Incident Playbook: $INCIDENT_TYPE
          **Severity**: $SEVERITY
          **Incident**: #${ISSUE_NUMBER}
          ## Immediate Actions
          $(immediate_actions)
          ## Communication
          - Notify team via Slack/Teams
          - Update status page
          - Notify affected users (if customer-facing)
          ## Investigation Checklist
          - [ ] Review logs
          - [ ] Check recent deployments
          - [ ] Check recent configuration changes
          - [ ] Identify root cause
          - [ ] Document timeline
          ## Mitigation Checklist
          - [ ] Apply immediate fix
          - [ ] Verify fix deployed
          - [ ] Monitor for recurrence
          - [ ] Update incident issue
          ## Resolution Checklist
          - [ ] Incident resolved
          - [ ] Service fully restored
          - [ ] Schedule post-mortem
          - [ ] Update runbooks
          - [ ] Implement preventive measures
          ---
          *Generated by Incident Response workflow*
          EOF
          cat "$PLAYBOOK_FILE"

      - name: Upload playbook
        uses: actions/upload-artifact@v4
        with:
          name: incident-playbook
          path: playbook-*.md
          retention-days: 90

      - name: Comment playbook on incident issue
        uses: actions/github-script@v7
        env:
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        with:
          script: |
            const fs = require('fs');
            // Read the exact file the previous step wrote, not whichever
            // "playbook-*" entry the checked-out tree happens to list first.
            const playbookPath = `playbook-${process.env.INCIDENT_TYPE}.md`;
            if (fs.existsSync(playbookPath)) {
              const playbook = fs.readFileSync(playbookPath, 'utf8');
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: Number(process.env.ISSUE_NUMBER),
                body: playbook,
              });
            }
# Job: prepares the notification text and records on the incident issue that
# the team was notified.
  notify-team:
    name: Notify Team
    runs-on: ubuntu-latest
    needs: [create-incident-issue, execute-incident-playbook]
    # always(): run even when the playbook job failed.
    if: always()
    steps:
      - name: Prepare notification
        env:
          # `description` and `affected_services` are free-form text. Passing
          # them through the environment keeps the shell from executing any
          # $(...) or backticks they contain, which expanding them straight
          # into the heredoc would allow.
          SEVERITY: ${{ github.event.inputs.severity }}
          INCIDENT_TYPE: ${{ github.event.inputs.incident_type }}
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
          DESCRIPTION: ${{ github.event.inputs.description }}
          AFFECTED_SERVICES: ${{ github.event.inputs.affected_services }}
          ISSUE_BASE_URL: ${{ github.server_url }}/${{ github.repository }}/issues
        run: |
          cat << EOF > notification.txt
          INCIDENT ALERT
          Severity: $SEVERITY
          Type: $INCIDENT_TYPE
          Issue: #$ISSUE_NUMBER
          Description: $DESCRIPTION
          Affected Services: $AFFECTED_SERVICES
          Actions Required:
          1. Review incident issue #$ISSUE_NUMBER
          2. Follow incident playbook
          3. Update incident issue with progress
          Incident Issue: $ISSUE_BASE_URL/$ISSUE_NUMBER
          EOF
          cat notification.txt

      # NOTE(review): this step only prints guidance; nothing is sent from the
      # workflow itself.
      - name: Notify via InternalMessage (IACT compliant)
        run: |
          echo "Notification prepared (send via InternalMessage system)"
          echo "Note: NO EMAIL per IACT restrictions"
          echo "Use InternalMessage.objects.create() to notify users"

      - name: Comment notification sent
        # The job runs under always(), so the issue number is empty when issue
        # creation failed; skip the comment instead of running a broken script.
        if: needs.create-incident-issue.outputs.issue_number != ''
        uses: actions/github-script@v7
        env:
          ISSUE_NUMBER: ${{ needs.create-incident-issue.outputs.issue_number }}
        with:
          script: |
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: Number(process.env.ISSUE_NUMBER),
              body: '[PASS] Team notified via InternalMessage system (IACT RNF-002 compliant - NO EMAIL)',
            });
# Job: prints a recap of the run (inputs, issue number, result of each
# upstream job, next steps) to the job log.
  summary:
    name: Incident Response Summary
    runs-on: ubuntu-latest
    needs:
      - create-incident-issue
      - gather-diagnostics
      - execute-incident-playbook
      - notify-team
    # always(): still report when an upstream job failed.
    if: always()
    steps:
      - name: Generate summary
        run: |
          # One printf call; each argument becomes one output line.
          printf '%s\n' \
            "=========================================" \
            "INCIDENT RESPONSE SUMMARY" \
            "=========================================" \
            "Incident Type: ${{ github.event.inputs.incident_type }}" \
            "Severity: ${{ github.event.inputs.severity }}" \
            "Incident Issue: #${{ needs.create-incident-issue.outputs.issue_number }}" \
            "" \
            "Status:" \
            " Issue Created: ${{ needs.create-incident-issue.result }}" \
            " Diagnostics Gathered: ${{ needs.gather-diagnostics.result }}" \
            " Playbook Executed: ${{ needs.execute-incident-playbook.result }}" \
            " Team Notified: ${{ needs.notify-team.result }}" \
            "" \
            "Next Steps:" \
            "1. Follow playbook steps manually" \
            "2. Update incident issue with progress" \
            "3. Resolve incident" \
            "4. Schedule post-mortem" \
            "========================================="