From 24beea040a2dfa938f189dfeef8d57ff5e7a315c Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Mon, 26 Jan 2026 19:21:39 +0100 Subject: [PATCH 1/2] gitignore was included --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 037c860..92b2ce9 100644 --- a/.gitignore +++ b/.gitignore @@ -229,3 +229,4 @@ $RECYCLE.BIN/ *.key credentials.json secrets.json +.circleci \ No newline at end of file From 407250d5e950ae6af9c69a25eeec708a31fcbe5b Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Mon, 26 Jan 2026 19:54:03 +0100 Subject: [PATCH 2/2] update --- .github/workflows/dedo-duro-analysis.yml | 206 ++++++ README.md | 312 ++++++++- analyzers/cost_explorer_analyzer.py | 482 ++++++++++++++ analyzers/eks_deployment_lifecycle.py | 480 ++++++++++++++ analyzers/eks_session_analyzer.py | 396 ++++++++++++ analyzers/rto_analyzer.py | 513 +++++++++++++++ ci/Jenkinsfile | 239 +++++++ complete_simulated_report.html | 391 ++++++++++++ config.py | 89 ++- core/multi_account.py | 374 +++++++++++ core/reporter.py | 139 +++- docs/kubernetes_permissions.md | 245 ++++++++ main.py | 90 ++- pytest.ini | 3 + remediation/base.py | 350 +++++++++++ remediation/ec2_remediation.py | 434 +++++++++++++ requirements.txt | 16 +- simulate_complete_report.py | 727 +++++++++++++++++++++ tests/conftest.py | 27 + tests/test_comprehensive.py | 768 +++++++++++++++++++++++ tests/test_new_features.py | 283 +++++++++ web/app.py | 454 ++++++++++++++ 22 files changed, 6988 insertions(+), 30 deletions(-) create mode 100644 .github/workflows/dedo-duro-analysis.yml create mode 100644 analyzers/cost_explorer_analyzer.py create mode 100644 analyzers/eks_deployment_lifecycle.py create mode 100644 analyzers/eks_session_analyzer.py create mode 100644 analyzers/rto_analyzer.py create mode 100644 ci/Jenkinsfile create mode 100644 complete_simulated_report.html create mode 100644 core/multi_account.py create mode 100644 docs/kubernetes_permissions.md create mode 100644 pytest.ini 
create mode 100644 remediation/base.py create mode 100644 remediation/ec2_remediation.py create mode 100644 simulate_complete_report.py create mode 100644 tests/conftest.py create mode 100644 tests/test_comprehensive.py create mode 100644 tests/test_new_features.py create mode 100644 web/app.py diff --git a/.github/workflows/dedo-duro-analysis.yml b/.github/workflows/dedo-duro-analysis.yml new file mode 100644 index 0000000..8d8fa93 --- /dev/null +++ b/.github/workflows/dedo-duro-analysis.yml @@ -0,0 +1,206 @@ +name: Dedo-Duro AWS Analysis + +on: + # Run weekly on Monday at 6 AM UTC + schedule: + - cron: '0 6 * * 1' + + # Allow manual trigger + workflow_dispatch: + inputs: + region: + description: 'AWS Region to analyze (leave empty for default)' + required: false + type: string + resource_types: + description: 'Comma-separated resource types (leave empty for all)' + required: false + type: string + output_format: + description: 'Output format' + required: false + default: 'html' + type: choice + options: + - html + - json + - csv + multi_region: + description: 'Analyze all regions' + required: false + default: false + type: boolean + environment_filter: + description: 'Environment filter (prod, test, dev)' + required: false + type: string + +env: + PYTHON_VERSION: '3.11' + +jobs: + analyze: + name: Run AWS Resource Analysis + runs-on: ubuntu-latest + permissions: + id-token: write # Required for OIDC authentication + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + env: + INPUT_REGION: ${{ inputs.region }} + DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} + with: + role-to-assume: ${{ secrets.AWS_ROLE_ARN 
}} + aws-region: ${{ inputs.region || secrets.AWS_DEFAULT_REGION || 'us-east-1' }} + + - name: Run Dedo-Duro Analysis + id: analysis + env: + INPUT_REGION: ${{ inputs.region }} + INPUT_RESOURCE_TYPES: ${{ inputs.resource_types }} + INPUT_OUTPUT_FORMAT: ${{ inputs.output_format }} + INPUT_MULTI_REGION: ${{ inputs.multi_region }} + INPUT_ENVIRONMENT: ${{ inputs.environment_filter }} + run: | + # Build command with optional parameters using environment variables + CMD="python main.py" + + # Add region if specified (validate alphanumeric and hyphens only) + if [ -n "$INPUT_REGION" ]; then + SAFE_REGION=$(echo "$INPUT_REGION" | grep -E '^[a-z0-9-]+$' || echo "") + if [ -n "$SAFE_REGION" ]; then + CMD="$CMD --region $SAFE_REGION" + fi + fi + + # Add resource types if specified (validate alphanumeric, commas, underscores) + if [ -n "$INPUT_RESOURCE_TYPES" ]; then + SAFE_TYPES=$(echo "$INPUT_RESOURCE_TYPES" | grep -E '^[a-zA-Z0-9_,]+$' || echo "") + if [ -n "$SAFE_TYPES" ]; then + CMD="$CMD --resource-types $SAFE_TYPES" + fi + fi + + # Add output format (choice type, already validated) + if [ -n "$INPUT_OUTPUT_FORMAT" ]; then + CMD="$CMD --output-format $INPUT_OUTPUT_FORMAT" + else + CMD="$CMD --output-format html" + fi + + # Add multi-region flag if enabled + if [ "$INPUT_MULTI_REGION" = "true" ]; then + CMD="$CMD --multi-region" + fi + + # Add environment filter if specified (validate alphanumeric only) + if [ -n "$INPUT_ENVIRONMENT" ]; then + SAFE_ENV=$(echo "$INPUT_ENVIRONMENT" | grep -E '^[a-zA-Z]+$' || echo "") + if [ -n "$SAFE_ENV" ]; then + CMD="$CMD --environment $SAFE_ENV" + fi + fi + + # Run analysis + echo "Running: $CMD" + eval "$CMD" + + # Set output file path + REPORT=$(ls aws-optimization-report.* 2>/dev/null | head -1) + echo "report_file=$REPORT" >> "$GITHUB_OUTPUT" + + - name: Upload Report Artifact + uses: actions/upload-artifact@v4 + with: + name: dedo-duro-report-${{ github.run_number }} + path: | + aws-optimization-report.* + retention-days: 30 + + - 
name: Upload to S3 (optional) + if: ${{ secrets.REPORT_S3_BUCKET != '' }} + env: + REPORT_FILE: ${{ steps.analysis.outputs.report_file }} + S3_BUCKET: ${{ secrets.REPORT_S3_BUCKET }} + run: | + if [ -n "$REPORT_FILE" ] && [ -f "$REPORT_FILE" ]; then + TIMESTAMP=$(date +%Y-%m-%d) + aws s3 cp "$REPORT_FILE" "s3://${S3_BUCKET}/reports/${TIMESTAMP}/${REPORT_FILE}" + echo "Report uploaded to s3://${S3_BUCKET}/reports/${TIMESTAMP}/${REPORT_FILE}" + fi + + - name: Create Summary + env: + REPORT_FILE: ${{ steps.analysis.outputs.report_file }} + INPUT_REGION: ${{ inputs.region }} + INPUT_OUTPUT_FORMAT: ${{ inputs.output_format }} + run: | + { + echo "## Dedo-Duro Analysis Complete" + echo "" + echo "**Report:** \`${REPORT_FILE:-no report}\`" + echo "**Region:** ${INPUT_REGION:-default}" + echo "**Format:** ${INPUT_OUTPUT_FORMAT:-html}" + echo "" + echo "Download the report from the Artifacts section above." + } >> "$GITHUB_STEP_SUMMARY" + + notify: + name: Send Notifications + needs: analyze + runs-on: ubuntu-latest + if: always() + + steps: + - name: Send Slack Notification (optional) + if: ${{ secrets.SLACK_WEBHOOK_URL != '' }} + uses: slackapi/slack-github-action@v1.25.0 + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK + ANALYZE_RESULT: ${{ needs.analyze.result }} + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + with: + payload: | + { + "text": "Dedo-Duro AWS Analysis Complete", + "blocks": [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": "Dedo-Duro AWS Analysis Report" + } + }, + { + "type": "section", + "fields": [ + { + "type": "mrkdwn", + "text": "*Status:*\n${{ needs.analyze.result }}" + }, + { + "type": "mrkdwn", + "text": "*Run:*\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Details>" + } + ] + } + ] + } diff --git a/README.md b/README.md index 9770b80..233814b 100755 --- a/README.md +++ b/README.md 
@@ -6,7 +6,7 @@ Python Version AWS boto3 License - Version + Version

--- @@ -58,6 +58,7 @@ mindmap Spot Instances Savings Plans Schedule Optimization + Cost Explorer Integration Security & Privacy GDPR Compliance ISO 27701 @@ -66,11 +67,20 @@ mindmap 30+ AWS Services AI/ML Services Multi-Region + Multi-Account + Kubernetes + EKS Sessions + Deployment Lifecycle + RTO/RPO Analysis Reporting HTML Interactive JSON/CSV Export S3 Upload Visual Charts + CI/CD Integration + GitHub Actions + Jenkins + CircleCI ``` --- @@ -140,17 +150,25 @@ graph TB SavingsPlans["Savings Plans"] CUR["Cost & Usage Report"] ScheduleOpt["Schedule Optimizer"] + CostExplorer["Cost Explorer"] end subgraph Governance["Governance"] Security["Security Analysis"] Privacy["Privacy Compliance"] Orphan["Orphaned Resources"] + RTO["RTO/RPO Analysis"] + end + + subgraph Kubernetes["Kubernetes (EKS)"] + EKSSessions["Session Monitoring"] + EKSDeployment["Deployment Lifecycle"] end style AIML fill:#9b59b6,color:#fff style Financial fill:#27ae60,color:#fff style Governance fill:#e74c3c,color:#fff + style Kubernetes fill:#326ce5,color:#fff ``` ### AI/ML Service Analysis @@ -165,6 +183,43 @@ Comprehensive cost optimization for AWS AI/ML services: - **Amazon Transcribe:** Analyzes custom vocabularies, language models, call analytics categories, and job patterns (failed/stuck detection). - **Amazon Kendra:** Analyzes indexes (edition-based costs), data sources, experiences, and query patterns. 
+### v12.0 New Features + +#### Multi-Account Analysis +- **Consolidated Analysis**: Analyze multiple AWS accounts simultaneously with the `--accounts-file` option +- **Cross-Account Reports**: Generate both individual and consolidated reports across accounts +- **AWS Organizations Support**: Leverage AWS Organizations for automatic account discovery +- **Partition Support**: Full support for AWS Commercial, GovCloud, and China partitions + +#### Cost Explorer Integration +- **Real Cost Data**: Integrates with AWS Cost Explorer API for actual spend data +- **Anomaly Detection**: Identifies cost spikes and unusual spending patterns +- **Budget Tracking**: Compares actual costs against estimated costs +- **Service-Level Analysis**: Breaks down costs by AWS service + +#### RTO/RPO Analysis +- **Backup Assessment**: Analyzes backup configurations for RDS, S3, and other services +- **Cross-Region Replication**: Checks for disaster recovery readiness +- **Recovery Metrics**: Calculates estimated RTO (Recovery Time Objective) and RPO (Recovery Point Objective) +- **Compliance Checks**: Identifies resources not meeting recovery requirements + +#### EKS Monitoring (Kubernetes) +- **Session Monitoring**: Tracks active kubectl and SSM sessions to EKS clusters +- **Deployment Lifecycle**: Monitors deployment health, age, and update frequency +- **Restart Analysis**: Identifies pods with excessive restart counts +- **Stale Deployment Detection**: Flags deployments not updated in 90+ days + +#### Environment Filtering +- **Environment Tags**: Filter analysis by environment (production, staging, development, test) +- **Tag-Based Grouping**: Group resources by custom tags (Team, Project, CostCenter) +- **Targeted Reports**: Generate environment-specific reports + +#### CI/CD Integration +- **GitHub Actions**: Pre-built workflow for automated weekly analysis +- **Jenkins Pipeline**: Jenkinsfile for Jenkins CI/CD integration +- **CircleCI Config**: Configuration for CircleCI 
pipelines +- **Artifact Upload**: Automatic report upload to S3 or CI artifacts + ### Advanced Capabilities & Reporting - **Multi-Region & China Region Support:** Analyzes resources across multiple specified AWS regions simultaneously, including AWS China regions (`cn-north-1`, `cn-northwest-1`). @@ -382,12 +437,17 @@ flowchart LR │ ├── __init__.py │ ├── ... (various analyzer files) ... │ ├── cur.py # Cost and Usage Report (CUR) analysis +│ ├── cost_explorer_analyzer.py # Cost Explorer integration (v12.0) +│ ├── rto_analyzer.py # RTO/RPO analysis (v12.0) +│ ├── eks_session_analyzer.py # EKS session monitoring (v12.0) +│ └── eks_deployment_lifecycle.py # EKS deployment lifecycle (v12.0) ├── core/ # Core Dedo-Duro functionality │ ├── __init__.py │ ├── analyzer.py # Main analyzer orchestration with lazy initialization │ ├── metrics.py # CloudWatch metrics handling │ ├── reporter.py # Report generation coordination -│ └── types.py # Type definitions (AnalysisResult, TypedDicts, utc_now) +│ ├── types.py # Type definitions (AnalysisResult, TypedDicts, utc_now) +│ └── multi_account.py # Multi-account orchestration (v12.0) ├── reporters/ # Dedo-Duro report generators │ ├── __init__.py │ ├── json_reporter.py # JSON report generation @@ -401,6 +461,13 @@ flowchart LR │ └── templates/ # Templates for Dockerfile/Helm ├── schedule/ # Schedule optimization module │ └── __init__.py +├── ci/ # CI/CD integration templates (v12.0) +│ ├── Jenkinsfile # Jenkins pipeline configuration +│ └── github_reporter.py # GitHub-specific output format +├── .github/workflows/ # GitHub Actions workflows (v12.0) +│ └── dedo-duro-analysis.yml # Automated analysis workflow +├── .circleci/ # CircleCI configuration (v12.0) +│ └── config.yml # CircleCI pipeline config └── utils/ # Utility functions (shared) ├── __init__.py ├── aws_utils.py # AWS-specific utilities @@ -615,11 +682,66 @@ flowchart TD | `--cur-s3-uri` | S3 URI for CUR data | None | | `--cur-days-ago` | Days of CUR data to analyze | None | 
| `--multi-region` | Analyze all regions | False | +| `--accounts-file` | JSON file with account configs | None | +| `--all-accounts` | Analyze all Organization accounts | False | +| `--environment` | Filter by environment (prod/test/dev) | None | +| `--grouping-tags` | Tags for resource grouping | Team,Project | | `--verbose` | Detailed output | False | | `--single-thread` | Disable parallel processing | False | | `--max-workers` | Parallel workers | 10 | | `--retry-attempts` | Max retry attempts | 5 | +### Multi-Account Analysis (v12.0) + +Analyze multiple AWS accounts simultaneously: + +```bash +# Using accounts file +python main.py --accounts-file accounts.json --output-format html + +# Analyze all accounts in AWS Organizations +python main.py --all-accounts --output-format html +``` + +**accounts.json** format: + +```json +{ + "accounts": [ + { + "account_id": "111111111111", + "role_arn": "arn:aws:iam::111111111111:role/DedoDuroRole", + "alias": "production", + "regions": ["us-east-1", "us-west-2"] + }, + { + "account_id": "222222222222", + "role_arn": "arn:aws:iam::222222222222:role/DedoDuroRole", + "alias": "staging", + "regions": ["us-east-1"] + } + ], + "partition": "aws" +} +``` + +**Partition values:** +- `aws` - AWS Commercial (default) +- `aws-us-gov` - AWS GovCloud +- `aws-cn` - AWS China + +### Environment Filtering (v12.0) + +Filter analysis by environment: + +```bash +# Analyze only production resources +python main.py --environment prod --region us-east-1 + +# Analyze with custom grouping tags +python main.py --grouping-tags Team,Project,CostCenter --region us-east-1 +``` + ### Supported Resource Types (Analyzer Keys) ```mermaid @@ -681,6 +803,7 @@ graph LR **Full list of analyzer keys:** - `ec2`, `ec2-eff`, `s3`, `rds`, `ebs`, `ebs_snapshot`, `lambda`, `elasticache`, `elb`, `dynamodb`, `api_gateway`, `nat`, `eip`, `vpc_endpoints`, `spot`, `security_privacy`, `orphan`, `ecs`, `sagemaker`, `bedrock`, `comprehend`, `rekognition`, `textract`, 
`transcribe`, `kendra`, `opensearch`, `compute_optimizer`, `savings_plans`, `cur`, `cloudfront`, `efs`, `route53`, `schedule_optimizer`, `terraform_recommendations`, `cloudformation_recommendations` +- **v12.0 New:** `cost_explorer`, `rto_analysis`, `eks_sessions`, `eks_deployments` --- @@ -753,6 +876,122 @@ flowchart LR --- +## CI/CD Integration (v12.0) + +Dedo-Duro includes ready-to-use CI/CD configurations for automated analysis. + +### GitHub Actions + +```yaml +# .github/workflows/dedo-duro-analysis.yml +name: Dedo-Duro AWS Analysis + +on: + schedule: + - cron: '0 6 * * 1' # Weekly on Mondays + workflow_dispatch: + +jobs: + analyze: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Run Dedo-Duro Analysis + run: | + pip install -r requirements.txt + python main.py --region us-east-1 --output-format html + + - name: Upload Report + uses: actions/upload-artifact@v4 + with: + name: dedo-duro-report + path: '*.html' +``` + +### Jenkins Pipeline + +```groovy +// ci/Jenkinsfile +pipeline { + agent any + + triggers { + cron('0 6 * * 1') // Weekly + } + + environment { + AWS_DEFAULT_REGION = 'us-east-1' + } + + stages { + stage('Setup') { + steps { + sh 'pip install -r requirements.txt' + } + } + + stage('Analyze') { + steps { + withCredentials([[$class: 'AmazonWebServicesCredentialsBinding', + credentialsId: 'aws-credentials']]) { + sh 'python main.py --region us-east-1 --output-format html' + } + } + } + + stage('Archive') { + steps { + archiveArtifacts artifacts: '*.html', fingerprint: true + } + } + } +} +``` + +### CircleCI + +```yaml +# .circleci/config.yml +version: 2.1 + +jobs: + analyze: + docker: + - image: cimg/python:3.11 + steps: + - checkout + - run: + name: Install dependencies + command: pip 
install -r requirements.txt + - run: + name: Run analysis + command: python main.py --region us-east-1 --output-format html + - store_artifacts: + path: . + destination: reports + +workflows: + weekly-analysis: + triggers: + - schedule: + cron: "0 6 * * 1" + filters: + branches: + only: main + jobs: + - analyze +``` + +--- + ## Automated Multi-Account/Region Execution (AWS Batch) ```mermaid @@ -1010,16 +1249,21 @@ timeline section Enhancement v7.0-v8.0 : EC2 efficiency, RI awareness v9.0 : Containerization, S3 reports - - section Current v10.0 : Type safety, lazy init v11.0 : AI/ML services + section Current + v12.0 : Multi-Account support + : Cost Explorer integration + : RTO/RPO Analysis + : EKS Monitoring + : CI/CD Integration + : Environment filtering + section Future - v12.0+ : CI/CD integration - : Cost Explorer integration - : Web interface - : Auto-remediation + v13.0+ : Web interface + : Auto-remediation + : Real-time dashboards ``` --- @@ -1102,15 +1346,26 @@ The Dedo-Duro architecture supports team collaboration: ## Conclusion -Dedo-Duro 11.0 introduces comprehensive **AI/ML service cost optimization** capabilities: +Dedo-Duro 12.0 introduces **enterprise-scale analysis capabilities**: -- **Amazon SageMaker (Enhanced)**: Complete analysis of notebook instances, endpoints, training jobs, models, Feature Store, and Studio domains -- **Amazon Bedrock (NEW)**: Provisioned throughput, custom models, guardrails, knowledge bases -- **Amazon Comprehend (NEW)**: Endpoints, classifiers, entity recognizers, flywheels -- **Amazon Rekognition (NEW)**: Custom Labels projects, stream processors ($4,320/month if 24/7), face collections -- **Amazon Textract (NEW)**: Custom adapters, per-operation cost estimation -- **Amazon Transcribe (NEW)**: Vocabularies, language models, job patterns -- **Amazon Kendra (NEW)**: Indexes (Developer $810/mo, Enterprise $2,700/mo), data sources, experiences +### v12.0 Highlights + +- **Multi-Account Analysis**: Analyze entire AWS 
Organizations with consolidated reporting +- **Cost Explorer Integration**: Real cost data with anomaly detection +- **RTO/RPO Analysis**: Disaster recovery readiness assessment +- **EKS Monitoring**: Kubernetes session tracking and deployment lifecycle analysis +- **Environment Filtering**: Target specific environments (prod/test/dev) +- **CI/CD Integration**: GitHub Actions, Jenkins, and CircleCI support out-of-the-box + +### v11.0 AI/ML Capabilities (Retained) + +- **Amazon SageMaker**: Notebook instances, endpoints, training jobs, Feature Store, Studio +- **Amazon Bedrock**: Provisioned throughput, custom models, guardrails, knowledge bases +- **Amazon Comprehend**: Endpoints, classifiers, entity recognizers, flywheels +- **Amazon Rekognition**: Custom Labels projects, stream processors, face collections +- **Amazon Textract**: Custom adapters, per-operation cost estimation +- **Amazon Transcribe**: Vocabularies, language models, job patterns +- **Amazon Kendra**: Indexes, data sources, experiences This object-oriented approach results in a maintainable and adaptable codebase, capable of evolving with new AWS services and user requirements. @@ -1126,19 +1381,28 @@ MIT License Developed and maintained by **Gustavo Lima**. -Key milestones: v2.0 (architecture), v3.0 (security), v4.0 (Spot), v5.0 (orphan), v7-8.0 (efficiency), v9.0 (automation), v10.0 (type safety), v11.0 (AI/ML services). +Key milestones: v2.0 (architecture), v3.0 (security), v4.0 (Spot), v5.0 (orphan), v7-8.0 (efficiency), v9.0 (automation), v10.0 (type safety), v11.0 (AI/ML services), v12.0 (multi-account, Cost Explorer, EKS, CI/CD). 
--- ## To Do's -- Division of execution by environment, production and test -- List new permissions required for new functions, such as Kubernetes -- Monitoring of open sessions by environments versus Kubernetes -- Monitoring the deployment lifecycle with Kubernetes -- RTO Analysis Process for Production -- Reading files with tags and metadata to facilitate the resource grouping process -- Create the all-in option - Run for a set of accounts at the same time, with logic to not overload the process, delivering segregated and consolidated reports +### Completed in v12.0 ✅ + +- ~~Division of execution by environment, production and test~~ → **Environment filtering** (`--environment` flag) +- ~~Monitoring of open sessions by environments versus Kubernetes~~ → **EKS Session Analyzer** (`eks_sessions`) +- ~~Monitoring the deployment lifecycle with Kubernetes~~ → **EKS Deployment Lifecycle** (`eks_deployments`) +- ~~RTO Analysis Process for Production~~ → **RTO/RPO Analyzer** (`rto_analysis`) +- ~~Reading files with tags and metadata to facilitate the resource grouping process~~ → **Tag-based grouping** (`--grouping-tags`) +- ~~Create the all-in option - Run for a set of accounts at the same time~~ → **Multi-Account Analysis** (`--accounts-file`, `--all-accounts`) + +### Pending + +- List new permissions required for new functions, such as Kubernetes (partial - see `docs/kubernetes_permissions.md`) +- Web interface for real-time monitoring +- Auto-remediation capabilities (experimental) +- Integration with Slack/Teams for notifications +- Custom alerting thresholds --- diff --git a/analyzers/cost_explorer_analyzer.py b/analyzers/cost_explorer_analyzer.py new file mode 100644 index 0000000..10a9d71 --- /dev/null +++ b/analyzers/cost_explorer_analyzer.py @@ -0,0 +1,482 @@ +""" +Cost Explorer Analyzer module. + +Analyzes AWS Cost Explorer data to identify actual spending patterns, +cost anomalies, and compare with estimated costs from other analyzers. 
+""" + +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +from dateutil.tz import tzutc + +from core.analyzer import ResourceAnalyzer +from config import AWSConfig, AnalysisConfig +from core.metrics import CloudWatchMetrics +from utils.console import print_error, print_info, print_warning + +log = logging.getLogger(__name__) + + +class CostExplorerAnalyzer(ResourceAnalyzer): + """ + Analyzer for AWS Cost Explorer data. + + Provides: + - Actual cost breakdown by service + - Cost trends over time + - Cost anomaly detection + - Comparison with optimization estimates + """ + + def __init__( + self, + aws_config: AWSConfig, + analysis_config: AnalysisConfig, + metrics: CloudWatchMetrics, + account_id: Optional[str], + partition: Optional[str] + ): + super().__init__(aws_config, analysis_config, metrics, account_id, partition) + self.ce_client = aws_config.create_client('ce') + + def get_service_name(self) -> str: + return 'cost_explorer' + + def get_estimated_time(self) -> str: + return "Medium (30-60s)" + + def get_description(self) -> str: + return "Analyzes AWS Cost Explorer data for spending patterns and anomalies" + + def analyze(self, **kwargs) -> List[Dict[str, Any]]: + """ + Analyze AWS Cost Explorer data. 
+ + Returns: + List of cost findings and recommendations + """ + results = [] + + try: + print_info("Analyzing Cost Explorer data...") + + # Get cost data for different time periods + end_date = datetime.now(tzutc()).date() + + # Last 30 days + start_date_30d = end_date - timedelta(days=30) + costs_30d = self._get_costs_by_service(start_date_30d, end_date) + + # Previous 30 days for comparison + start_date_60d = end_date - timedelta(days=60) + costs_previous = self._get_costs_by_service(start_date_60d, start_date_30d) + + # Get daily cost trend + daily_costs = self._get_daily_costs(start_date_30d, end_date) + + # Get cost by linked account (for organizations) + costs_by_account = self._get_costs_by_account(start_date_30d, end_date) + + # Analyze cost trends + trend_analysis = self._analyze_cost_trends(costs_30d, costs_previous) + results.append(trend_analysis) + + # Get top services by cost + top_services = self._analyze_top_services(costs_30d) + results.extend(top_services) + + # Detect cost anomalies + anomalies = self._detect_anomalies(daily_costs) + if anomalies: + results.extend(anomalies) + + # Multi-account cost breakdown (if applicable) + if costs_by_account and len(costs_by_account) > 1: + account_analysis = self._analyze_account_costs(costs_by_account) + results.append(account_analysis) + + # Generate overall summary + results.append(self._generate_cost_summary(costs_30d, daily_costs)) + + except Exception as e: + log.error(f"Error in Cost Explorer analysis: {e}") + print_error(f"Cost Explorer analysis failed: {e}") + return [{'error': str(e)}] + + return results + + def _get_costs_by_service( + self, + start_date: datetime.date, + end_date: datetime.date + ) -> Dict[str, float]: + """Get cost breakdown by AWS service.""" + try: + response = self.ce_client.get_cost_and_usage( + TimePeriod={ + 'Start': start_date.isoformat(), + 'End': end_date.isoformat() + }, + Granularity='MONTHLY', + Metrics=['UnblendedCost'], + GroupBy=[ + {'Type': 'DIMENSION', 'Key': 
'SERVICE'} + ] + ) + + costs = {} + for result in response.get('ResultsByTime', []): + for group in result.get('Groups', []): + service = group['Keys'][0] + amount = float(group['Metrics']['UnblendedCost']['Amount']) + if service in costs: + costs[service] += amount + else: + costs[service] = amount + + return costs + + except Exception as e: + log.warning(f"Error getting costs by service: {e}") + return {} + + def _get_daily_costs( + self, + start_date: datetime.date, + end_date: datetime.date + ) -> List[Dict[str, Any]]: + """Get daily cost data for trend analysis.""" + try: + response = self.ce_client.get_cost_and_usage( + TimePeriod={ + 'Start': start_date.isoformat(), + 'End': end_date.isoformat() + }, + Granularity='DAILY', + Metrics=['UnblendedCost'] + ) + + daily_costs = [] + for result in response.get('ResultsByTime', []): + daily_costs.append({ + 'date': result['TimePeriod']['Start'], + 'cost': float(result['Total']['UnblendedCost']['Amount']) + }) + + return daily_costs + + except Exception as e: + log.warning(f"Error getting daily costs: {e}") + return [] + + def _get_costs_by_account( + self, + start_date: datetime.date, + end_date: datetime.date + ) -> Dict[str, float]: + """Get cost breakdown by linked account (for AWS Organizations).""" + try: + response = self.ce_client.get_cost_and_usage( + TimePeriod={ + 'Start': start_date.isoformat(), + 'End': end_date.isoformat() + }, + Granularity='MONTHLY', + Metrics=['UnblendedCost'], + GroupBy=[ + {'Type': 'DIMENSION', 'Key': 'LINKED_ACCOUNT'} + ] + ) + + costs = {} + for result in response.get('ResultsByTime', []): + for group in result.get('Groups', []): + account_id = group['Keys'][0] + amount = float(group['Metrics']['UnblendedCost']['Amount']) + if account_id in costs: + costs[account_id] += amount + else: + costs[account_id] = amount + + return costs + + except Exception as e: + log.warning(f"Error getting costs by account: {e}") + return {} + + def _analyze_cost_trends( + self, + current_costs: 
Dict[str, float], + previous_costs: Dict[str, float] + ) -> Dict[str, Any]: + """Analyze cost trends comparing current and previous periods.""" + current_total = sum(current_costs.values()) + previous_total = sum(previous_costs.values()) + + # Calculate change percentage + if previous_total > 0: + change_pct = ((current_total - previous_total) / previous_total) * 100 + else: + change_pct = 0 + + # Identify services with significant cost increases + cost_increases = [] + for service, cost in current_costs.items(): + prev_cost = previous_costs.get(service, 0) + if prev_cost > 0: + service_change = ((cost - prev_cost) / prev_cost) * 100 + if service_change > 20 and cost > 10: # >20% increase and >$10 + cost_increases.append({ + 'service': service, + 'current_cost': round(cost, 2), + 'previous_cost': round(prev_cost, 2), + 'change_percent': round(service_change, 1) + }) + + # Sort by absolute change + cost_increases.sort(key=lambda x: x['current_cost'] - x['previous_cost'], reverse=True) + + findings = [] + recommendation = None + + if change_pct > 20: + findings.append({ + 'type': 'cost_increase', + 'severity': 'high' if change_pct > 50 else 'medium', + 'message': f"Monthly costs increased by {change_pct:.1f}% compared to previous period", + 'details': cost_increases[:5] # Top 5 increases + }) + recommendation = ( + f"Costs have increased {change_pct:.1f}% from ${previous_total:.2f} to " + f"${current_total:.2f}. Review the services with highest increases: " + f"{', '.join([c['service'] for c in cost_increases[:3]])}." + ) + elif change_pct < -10: + findings.append({ + 'type': 'cost_decrease', + 'severity': 'low', + 'message': f"Monthly costs decreased by {abs(change_pct):.1f}%" + }) + recommendation = f"Good progress! Costs decreased by ${previous_total - current_total:.2f}." + else: + recommendation = "Costs are relatively stable compared to the previous period." 
+ + return { + 'resource_type': 'cost_trend_analysis', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'current_period_total': round(current_total, 2), + 'previous_period_total': round(previous_total, 2), + 'change_percent': round(change_pct, 1), + 'significant_increases': cost_increases[:10], + 'findings': findings, + 'recommendation': recommendation, + 'estimated_monthly_savings': 0.0 + } + + def _analyze_top_services( + self, + costs: Dict[str, float] + ) -> List[Dict[str, Any]]: + """Analyze top cost services and identify optimization opportunities.""" + results = [] + + # Sort services by cost + sorted_services = sorted(costs.items(), key=lambda x: x[1], reverse=True) + total_cost = sum(costs.values()) + + for i, (service, cost) in enumerate(sorted_services[:10]): + if cost < 1: # Skip services with negligible cost + continue + + percentage = (cost / total_cost * 100) if total_cost > 0 else 0 + findings = [] + recommendation = None + estimated_savings = 0.0 + + # Service-specific recommendations + if service == 'Amazon Elastic Compute Cloud - Compute': + findings.append({ + 'type': 'ec2_optimization', + 'severity': 'medium', + 'message': 'EC2 compute costs may have optimization opportunities' + }) + recommendation = ( + "Review EC2 instances for rightsizing opportunities, Reserved Instance " + "or Savings Plan coverage, and consider Spot instances for flexible workloads." + ) + estimated_savings = cost * 0.3 # Estimate 30% potential savings + + elif service == 'Amazon Relational Database Service': + findings.append({ + 'type': 'rds_optimization', + 'severity': 'medium', + 'message': 'RDS costs may have optimization opportunities' + }) + recommendation = ( + "Review RDS instances for rightsizing, consider Reserved Instances, " + "and evaluate Aurora Serverless for variable workloads." 
+ ) + estimated_savings = cost * 0.25 + + elif 'S3' in service: + findings.append({ + 'type': 's3_optimization', + 'severity': 'low', + 'message': 'S3 storage may benefit from lifecycle policies' + }) + recommendation = ( + "Review S3 lifecycle policies, enable Intelligent-Tiering, " + "and check for unused buckets or objects." + ) + estimated_savings = cost * 0.15 + + elif 'NAT Gateway' in service or 'VPC' in service: + findings.append({ + 'type': 'network_optimization', + 'severity': 'medium', + 'message': 'VPC/NAT Gateway costs may be reducible' + }) + recommendation = ( + "Review NAT Gateway data processing costs, consider VPC endpoints " + "for AWS service traffic, and optimize data transfer patterns." + ) + estimated_savings = cost * 0.4 + + else: + recommendation = f"Review {service} usage for optimization opportunities." + + results.append({ + 'resource_type': 'cost_by_service', + 'service': service, + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'rank': i + 1, + 'monthly_cost': round(cost, 2), + 'percentage_of_total': round(percentage, 1), + 'findings': findings, + 'recommendation': recommendation, + 'estimated_monthly_savings': round(estimated_savings, 2) + }) + + return results + + def _detect_anomalies( + self, + daily_costs: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + """Detect cost anomalies using simple statistical analysis.""" + if len(daily_costs) < 7: + return [] + + results = [] + costs = [d['cost'] for d in daily_costs] + + # Calculate mean and standard deviation + mean_cost = sum(costs) / len(costs) + variance = sum((x - mean_cost) ** 2 for x in costs) / len(costs) + std_dev = variance ** 0.5 + + # Find days with costs > 2 standard deviations from mean + anomalies = [] + for d in daily_costs: + if std_dev > 0: + z_score = (d['cost'] - mean_cost) / std_dev + if abs(z_score) > 2: + anomalies.append({ + 'date': d['date'], + 'cost': round(d['cost'], 2), + 'expected': round(mean_cost, 2), + 'deviation': round(z_score, 
2) + }) + + if anomalies: + results.append({ + 'resource_type': 'cost_anomaly', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'mean_daily_cost': round(mean_cost, 2), + 'std_deviation': round(std_dev, 2), + 'anomalous_days': anomalies, + 'findings': [{ + 'type': 'cost_spike', + 'severity': 'medium', + 'message': f"{len(anomalies)} day(s) with unusual cost patterns detected" + }], + 'recommendation': ( + f"Detected {len(anomalies)} day(s) with costs significantly different from " + f"the ${mean_cost:.2f}/day average. Investigate these dates for unexpected " + "usage or billing events." + ), + 'estimated_monthly_savings': 0.0 + }) + + return results + + def _analyze_account_costs( + self, + costs_by_account: Dict[str, float] + ) -> Dict[str, Any]: + """Analyze costs across linked accounts.""" + total = sum(costs_by_account.values()) + sorted_accounts = sorted(costs_by_account.items(), key=lambda x: x[1], reverse=True) + + accounts_summary = [] + for account_id, cost in sorted_accounts: + percentage = (cost / total * 100) if total > 0 else 0 + accounts_summary.append({ + 'account_id': account_id, + 'monthly_cost': round(cost, 2), + 'percentage': round(percentage, 1) + }) + + return { + 'resource_type': 'cost_by_account', + 'region': self.aws_config.region, + 'total_monthly_cost': round(total, 2), + 'num_accounts': len(costs_by_account), + 'accounts': accounts_summary[:20], # Top 20 accounts + 'recommendation': ( + f"Total spending of ${total:.2f} across {len(costs_by_account)} linked accounts. " + f"Top account ({sorted_accounts[0][0]}) represents " + f"{sorted_accounts[0][1]/total*100:.1f}% of total costs." 
+ ), + 'estimated_monthly_savings': 0.0 + } + + def _generate_cost_summary( + self, + costs_by_service: Dict[str, float], + daily_costs: List[Dict[str, Any]] + ) -> Dict[str, Any]: + """Generate overall cost summary.""" + total_monthly = sum(costs_by_service.values()) + + # Calculate average daily cost + if daily_costs: + avg_daily = sum(d['cost'] for d in daily_costs) / len(daily_costs) + projected_monthly = avg_daily * 30 + else: + avg_daily = total_monthly / 30 + projected_monthly = total_monthly + + # Count services with significant spending + significant_services = sum(1 for cost in costs_by_service.values() if cost > 10) + + return { + 'resource_type': 'cost_summary', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'total_monthly_cost': round(total_monthly, 2), + 'average_daily_cost': round(avg_daily, 2), + 'projected_monthly_cost': round(projected_monthly, 2), + 'total_services': len(costs_by_service), + 'significant_services': significant_services, + 'top_service': max(costs_by_service.items(), key=lambda x: x[1])[0] if costs_by_service else 'N/A', + 'recommendation': ( + f"Total AWS spend: ${total_monthly:.2f}/month across {len(costs_by_service)} services. " + f"Average daily spend: ${avg_daily:.2f}. Review top services for optimization opportunities." + ), + 'estimated_monthly_savings': 0.0 + } diff --git a/analyzers/eks_deployment_lifecycle.py b/analyzers/eks_deployment_lifecycle.py new file mode 100644 index 0000000..2729690 --- /dev/null +++ b/analyzers/eks_deployment_lifecycle.py @@ -0,0 +1,480 @@ +""" +EKS Deployment Lifecycle Analyzer module. + +Analyzes Kubernetes deployment health, rollout status, pod restart patterns, +and deployment age vs update frequency for EKS clusters. 
+""" + +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +from dateutil.tz import tzutc +import base64 +import tempfile +import os + +from core.analyzer import ResourceAnalyzer +from config import AWSConfig, AnalysisConfig +from core.metrics import CloudWatchMetrics +from utils.console import print_error, print_info, print_warning + +log = logging.getLogger(__name__) + + +class EKSDeploymentLifecycleAnalyzer(ResourceAnalyzer): + """ + Analyzer for EKS deployment health and lifecycle patterns. + + Analyzes: + - Deployment rollout status + - Pod restart patterns + - Deployment age vs update frequency + - Resource utilization trends + """ + + def __init__( + self, + aws_config: AWSConfig, + analysis_config: AnalysisConfig, + metrics: CloudWatchMetrics, + account_id: Optional[str], + partition: Optional[str] + ): + super().__init__(aws_config, analysis_config, metrics, account_id, partition) + self.eks_client = aws_config.create_client('eks') + self.k8s_client = None # Will be initialized per cluster + + def get_service_name(self) -> str: + return 'eks_deployments' + + def get_estimated_time(self) -> str: + return "Medium-Long (60-120s)" + + def get_description(self) -> str: + return "Analyzes EKS deployment health, rollout status, and pod restart patterns" + + def analyze(self, **kwargs) -> List[Dict[str, Any]]: + """ + Analyze EKS deployment lifecycle and health. 
+ + Returns: + List of deployment findings and recommendations + """ + results = [] + + try: + # Get all EKS clusters + clusters = self._get_eks_clusters() + if not clusters: + print_info("No EKS clusters found in the current region") + return [] + + print_info(f"Analyzing deployments for {len(clusters)} EKS cluster(s)...") + + for cluster_name in clusters: + try: + cluster_results = self._analyze_cluster(cluster_name) + results.extend(cluster_results) + except Exception as e: + log.warning(f"Error analyzing cluster {cluster_name}: {e}") + results.append({ + 'resource_type': 'eks_cluster', + 'cluster_name': cluster_name, + 'status': 'error', + 'error': str(e), + 'recommendation': f"Unable to analyze cluster: {e}" + }) + + except Exception as e: + log.error(f"Error in EKS deployment analysis: {e}") + print_error(f"EKS deployment analysis failed: {e}") + return [{'error': str(e)}] + + return results + + def _get_eks_clusters(self) -> List[str]: + """Get list of EKS cluster names.""" + try: + response = self.eks_client.list_clusters() + return response.get('clusters', []) + except Exception as e: + log.error(f"Error listing EKS clusters: {e}") + return [] + + def _analyze_cluster(self, cluster_name: str) -> List[Dict[str, Any]]: + """ + Analyze a single EKS cluster's deployments. 
+ + Args: + cluster_name: Name of the EKS cluster + + Returns: + List of findings for the cluster + """ + results = [] + + # Get cluster details + try: + cluster_info = self.eks_client.describe_cluster(name=cluster_name) + cluster = cluster_info.get('cluster', {}) + except Exception as e: + log.error(f"Error describing cluster {cluster_name}: {e}") + return [{ + 'resource_type': 'eks_cluster', + 'cluster_name': cluster_name, + 'error': f"Cannot describe cluster: {e}" + }] + + # Check if we can access the Kubernetes API + # This requires proper RBAC configuration + try: + k8s_available = self._check_k8s_access(cluster) + except Exception: + k8s_available = False + + # Generate cluster-level metrics analysis using CloudWatch + cluster_metrics = self._get_cluster_metrics(cluster_name) + + # Add cluster health summary + results.append(self._generate_cluster_summary( + cluster_name, + cluster, + cluster_metrics, + k8s_available + )) + + # If K8s API is not available, provide recommendations based on CloudWatch only + if not k8s_available: + results.append({ + 'resource_type': 'eks_access_warning', + 'cluster_name': cluster_name, + 'region': self.aws_config.region, + 'recommendation': ( + "Kubernetes API access not available. To enable detailed deployment " + "analysis, configure RBAC permissions as documented in " + "docs/kubernetes_permissions.md. CloudWatch metrics analysis available." + ), + 'estimated_monthly_savings': 0.0 + }) + + # Analyze node groups + nodegroup_results = self._analyze_nodegroups(cluster_name) + results.extend(nodegroup_results) + + return results + + def _check_k8s_access(self, cluster: Dict[str, Any]) -> bool: + """ + Check if we have access to the Kubernetes API. + + Note: Full K8s API access requires additional setup (kubectl credentials). + This is a placeholder for when K8s client is configured. 
+ """ + # For now, return False - K8s API access requires additional configuration + # In a full implementation, this would attempt to connect to the K8s API + return False + + def _get_cluster_metrics(self, cluster_name: str) -> Dict[str, Any]: + """ + Get CloudWatch metrics for the cluster. + + Args: + cluster_name: Name of the EKS cluster + + Returns: + Dictionary of cluster metrics + """ + metrics = { + 'cpu_utilization': None, + 'memory_utilization': None, + 'pod_count': None, + 'node_count': None + } + + try: + # Get Container Insights metrics if available + namespace = 'ContainerInsights' + + # CPU utilization + cpu_metrics = self.metrics.get_custom_metric( + namespace=namespace, + metric_name='pod_cpu_utilization', + dimensions=[ + {'Name': 'ClusterName', 'Value': cluster_name} + ], + period_days=7 + ) + if cpu_metrics: + metrics['cpu_utilization'] = cpu_metrics + + # Memory utilization + memory_metrics = self.metrics.get_custom_metric( + namespace=namespace, + metric_name='pod_memory_utilization', + dimensions=[ + {'Name': 'ClusterName', 'Value': cluster_name} + ], + period_days=7 + ) + if memory_metrics: + metrics['memory_utilization'] = memory_metrics + + # Pod count + pod_metrics = self.metrics.get_custom_metric( + namespace=namespace, + metric_name='pod_number_of_running_pods', + dimensions=[ + {'Name': 'ClusterName', 'Value': cluster_name} + ], + period_days=7 + ) + if pod_metrics: + metrics['pod_count'] = pod_metrics + + except Exception as e: + log.warning(f"Error getting cluster metrics for {cluster_name}: {e}") + + return metrics + + def _generate_cluster_summary( + self, + cluster_name: str, + cluster: Dict[str, Any], + metrics: Dict[str, Any], + k8s_available: bool + ) -> Dict[str, Any]: + """Generate a cluster health summary.""" + now = datetime.now(tzutc()) + + # Get cluster age + created_at = cluster.get('createdAt') + cluster_age_days = None + if created_at: + if isinstance(created_at, datetime): + cluster_age_days = (now - created_at).days 
+ + # Determine cluster status + status = cluster.get('status', 'UNKNOWN') + version = cluster.get('version', 'unknown') + + # Check version for updates + findings = [] + recommendation = None + + # Kubernetes version check (versions older than 1.27 as of 2024) + try: + major, minor = version.split('.')[:2] + minor_int = int(minor) + if minor_int < 27: + findings.append({ + 'type': 'outdated_version', + 'severity': 'medium', + 'message': f"Cluster running Kubernetes {version}. Consider upgrading to a newer version.", + 'current_version': version + }) + recommendation = ( + f"Cluster is running Kubernetes {version}. AWS recommends keeping " + "clusters updated to receive security patches and new features. " + "Plan an upgrade to a supported version." + ) + except (ValueError, IndexError): + pass + + # Check if Container Insights is enabled (based on metrics availability) + if not metrics.get('cpu_utilization') and not metrics.get('memory_utilization'): + findings.append({ + 'type': 'no_container_insights', + 'severity': 'low', + 'message': "Container Insights metrics not detected. Enable for better monitoring.", + 'recommendation': "Enable Container Insights for detailed pod and container metrics." + }) + + if not recommendation: + if findings: + recommendation = "Review findings for potential improvements." + else: + recommendation = "Cluster appears healthy. Continue monitoring." 
+ + return { + 'resource_type': 'eks_cluster_summary', + 'cluster_name': cluster_name, + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'status': status, + 'version': version, + 'cluster_age_days': cluster_age_days, + 'endpoint': cluster.get('endpoint', ''), + 'k8s_api_accessible': k8s_available, + 'metrics': { + 'cpu_avg': metrics.get('cpu_utilization', {}).get('average'), + 'memory_avg': metrics.get('memory_utilization', {}).get('average'), + 'pod_count_avg': metrics.get('pod_count', {}).get('average') + }, + 'findings': findings, + 'recommendation': recommendation, + 'estimated_monthly_savings': 0.0 + } + + def _analyze_nodegroups(self, cluster_name: str) -> List[Dict[str, Any]]: + """ + Analyze node groups for the cluster. + + Args: + cluster_name: Name of the EKS cluster + + Returns: + List of nodegroup findings + """ + results = [] + + try: + nodegroups_response = self.eks_client.list_nodegroups( + clusterName=cluster_name + ) + nodegroup_names = nodegroups_response.get('nodegroups', []) + + for nodegroup_name in nodegroup_names: + try: + nodegroup_result = self._analyze_single_nodegroup( + cluster_name, + nodegroup_name + ) + results.append(nodegroup_result) + except Exception as e: + log.warning(f"Error analyzing nodegroup {nodegroup_name}: {e}") + + except Exception as e: + log.warning(f"Error listing nodegroups for {cluster_name}: {e}") + + return results + + def _analyze_single_nodegroup( + self, + cluster_name: str, + nodegroup_name: str + ) -> Dict[str, Any]: + """ + Analyze a single node group. 
+ + Args: + cluster_name: Name of the EKS cluster + nodegroup_name: Name of the node group + + Returns: + Analysis result for the node group + """ + nodegroup_info = self.eks_client.describe_nodegroup( + clusterName=cluster_name, + nodegroupName=nodegroup_name + ) + nodegroup = nodegroup_info.get('nodegroup', {}) + + now = datetime.now(tzutc()) + findings = [] + recommendation = None + estimated_savings = 0.0 + + # Basic nodegroup info + status = nodegroup.get('status', 'UNKNOWN') + instance_types = nodegroup.get('instanceTypes', []) + scaling_config = nodegroup.get('scalingConfig', {}) + min_size = scaling_config.get('minSize', 0) + max_size = scaling_config.get('maxSize', 0) + desired_size = scaling_config.get('desiredSize', 0) + capacity_type = nodegroup.get('capacityType', 'ON_DEMAND') + + # Check for potential optimizations + # 1. Check if nodegroup could use Spot instances + if capacity_type == 'ON_DEMAND' and min_size > 1: + findings.append({ + 'type': 'spot_opportunity', + 'severity': 'low', + 'message': "Node group uses On-Demand instances. Consider Spot for non-critical workloads.", + 'instance_types': instance_types + }) + # Rough estimate: 60% savings on Spot + if instance_types: + from utils.cost_estimator import CostEstimator + on_demand_cost = CostEstimator.get_instance_cost(instance_types[0]) * desired_size + estimated_savings = on_demand_cost * 0.6 + + # 2. Check scaling configuration + if min_size == max_size and desired_size == min_size: + findings.append({ + 'type': 'no_autoscaling', + 'severity': 'low', + 'message': f"Node group has fixed size ({desired_size} nodes). Consider enabling autoscaling.", + 'current_size': desired_size + }) + + # 3. Check for old AMI release version + release_version = nodegroup.get('releaseVersion', '') + ami_type = nodegroup.get('amiType', 'AL2_x86_64') + + # 4. 
Check node group age + created_at = nodegroup.get('createdAt') + nodegroup_age_days = None + if created_at: + if isinstance(created_at, datetime): + nodegroup_age_days = (now - created_at).days + if nodegroup_age_days > 180: + findings.append({ + 'type': 'old_nodegroup', + 'severity': 'low', + 'message': f"Node group is {nodegroup_age_days} days old. Review for potential refresh.", + 'age_days': nodegroup_age_days + }) + + # 5. Check for Graviton opportunity + if ami_type in ['AL2_x86_64', 'BOTTLEROCKET_x86_64']: + findings.append({ + 'type': 'graviton_opportunity', + 'severity': 'low', + 'message': "Node group uses x86_64 architecture. Consider Graviton (ARM64) for cost savings.", + 'current_ami_type': ami_type + }) + + # Generate recommendation + if findings: + primary_finding = findings[0] + if primary_finding['type'] == 'spot_opportunity': + recommendation = ( + f"Consider using Spot capacity for cost savings. Current configuration " + f"uses {desired_size} On-Demand {', '.join(instance_types)} nodes. " + f"Estimated savings: ${estimated_savings:.2f}/month with Spot." + ) + elif primary_finding['type'] == 'graviton_opportunity': + recommendation = ( + "Consider migrating to Graviton (ARM64) instances for improved " + "price-performance. Verify application compatibility first." + ) + elif primary_finding['type'] == 'no_autoscaling': + recommendation = ( + "Enable Cluster Autoscaler and configure scaling policies to " + "optimize costs during low-demand periods." + ) + else: + recommendation = "Review findings for potential optimizations." + else: + recommendation = "Node group configuration appears optimal." 
+ + return { + 'resource_type': 'eks_nodegroup', + 'cluster_name': cluster_name, + 'nodegroup_name': nodegroup_name, + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'status': status, + 'instance_types': instance_types, + 'capacity_type': capacity_type, + 'ami_type': ami_type, + 'scaling': { + 'min_size': min_size, + 'max_size': max_size, + 'desired_size': desired_size + }, + 'age_days': nodegroup_age_days, + 'findings': findings, + 'recommendation': recommendation, + 'estimated_monthly_savings': round(estimated_savings, 2) + } diff --git a/analyzers/eks_session_analyzer.py b/analyzers/eks_session_analyzer.py new file mode 100644 index 0000000..053a704 --- /dev/null +++ b/analyzers/eks_session_analyzer.py @@ -0,0 +1,396 @@ +""" +EKS Session Analyzer module. + +Monitors active kubectl sessions and SSM sessions to EKS nodes +for security and operational insights. +""" + +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +from dateutil.tz import tzutc + +from core.analyzer import ResourceAnalyzer +from config import AWSConfig, AnalysisConfig +from core.metrics import CloudWatchMetrics +from utils.aws_utils import safe_api_call, safe_get_tag_value +from utils.console import print_error, print_info, print_warning + +log = logging.getLogger(__name__) + + +class EKSSessionAnalyzer(ResourceAnalyzer): + """ + Analyzer for EKS cluster sessions and access patterns. 
+ + Monitors: + - Active SSM sessions to EKS nodes + - kubectl session patterns (via CloudTrail if available) + - Unusual access patterns + """ + + def __init__( + self, + aws_config: AWSConfig, + analysis_config: AnalysisConfig, + metrics: CloudWatchMetrics, + account_id: Optional[str], + partition: Optional[str] + ): + super().__init__(aws_config, analysis_config, metrics, account_id, partition) + self.ssm_client = aws_config.create_client('ssm') + self.eks_client = aws_config.create_client('eks') + self.ec2_client = aws_config.create_client('ec2') + + def get_service_name(self) -> str: + return 'eks_sessions' + + def get_estimated_time(self) -> str: + return "Medium (30-60s)" + + def get_description(self) -> str: + return "Monitors active EKS/kubectl sessions and detects unusual access patterns" + + def analyze(self, **kwargs) -> List[Dict[str, Any]]: + """ + Analyze EKS sessions and access patterns. + + Returns: + List of session findings and recommendations + """ + results = [] + + try: + # Get all EKS clusters + clusters = self._get_eks_clusters() + if not clusters: + print_info("No EKS clusters found in the current region") + return [] + + print_info(f"Analyzing sessions for {len(clusters)} EKS cluster(s)...") + + # Get EKS node instance IDs + eks_node_ids = self._get_eks_node_instance_ids(clusters) + + # Get active SSM sessions + active_sessions = self._get_active_ssm_sessions() + + # Analyze sessions on EKS nodes + eks_sessions = self._filter_eks_sessions(active_sessions, eks_node_ids) + + # Generate findings + for cluster_name in clusters: + cluster_sessions = [ + s for s in eks_sessions + if s.get('cluster_name') == cluster_name + ] + + cluster_result = self._analyze_cluster_sessions( + cluster_name, + cluster_sessions, + eks_node_ids.get(cluster_name, []) + ) + results.append(cluster_result) + + # Add overall session summary + if eks_sessions: + results.append(self._generate_session_summary(eks_sessions)) + + except Exception as e: + log.error(f"Error 
in EKS session analysis: {e}") + print_error(f"EKS session analysis failed: {e}") + return [{'error': str(e)}] + + return results + + def _get_eks_clusters(self) -> List[str]: + """Get list of EKS cluster names.""" + try: + response = self.eks_client.list_clusters() + return response.get('clusters', []) + except Exception as e: + log.error(f"Error listing EKS clusters: {e}") + return [] + + def _get_eks_node_instance_ids(self, clusters: List[str]) -> Dict[str, List[str]]: + """ + Get EC2 instance IDs for EKS node groups. + + Args: + clusters: List of EKS cluster names + + Returns: + Dictionary mapping cluster names to lists of instance IDs + """ + cluster_nodes: Dict[str, List[str]] = {} + + for cluster_name in clusters: + instance_ids = [] + + try: + # Get managed node groups + nodegroups_response = self.eks_client.list_nodegroups( + clusterName=cluster_name + ) + + for nodegroup_name in nodegroups_response.get('nodegroups', []): + try: + nodegroup = self.eks_client.describe_nodegroup( + clusterName=cluster_name, + nodegroupName=nodegroup_name + ) + # Get instances from the Auto Scaling group + asg_name = nodegroup.get('nodegroup', {}).get('resources', {}).get( + 'autoScalingGroups', [{}] + )[0].get('name') + + if asg_name: + asg_instances = self._get_asg_instances(asg_name) + instance_ids.extend(asg_instances) + except Exception as e: + log.warning(f"Error getting nodegroup {nodegroup_name}: {e}") + + # Also look for self-managed nodes by tag + tagged_instances = self._get_tagged_eks_nodes(cluster_name) + instance_ids.extend(tagged_instances) + + cluster_nodes[cluster_name] = list(set(instance_ids)) + + except Exception as e: + log.warning(f"Error getting nodes for cluster {cluster_name}: {e}") + cluster_nodes[cluster_name] = [] + + return cluster_nodes + + def _get_asg_instances(self, asg_name: str) -> List[str]: + """Get instance IDs from an Auto Scaling group.""" + try: + asg_client = self.aws_config.create_client('autoscaling') + response = 
asg_client.describe_auto_scaling_groups( + AutoScalingGroupNames=[asg_name] + ) + instances = [] + for asg in response.get('AutoScalingGroups', []): + for instance in asg.get('Instances', []): + instance_id = instance.get('InstanceId') + if instance_id: + instances.append(instance_id) + return instances + except Exception as e: + log.warning(f"Error getting ASG instances for {asg_name}: {e}") + return [] + + def _get_tagged_eks_nodes(self, cluster_name: str) -> List[str]: + """Get EC2 instances tagged as EKS nodes for a cluster.""" + try: + response = self.ec2_client.describe_instances( + Filters=[ + { + 'Name': f'tag:kubernetes.io/cluster/{cluster_name}', + 'Values': ['owned', 'shared'] + }, + { + 'Name': 'instance-state-name', + 'Values': ['running'] + } + ] + ) + instances = [] + for reservation in response.get('Reservations', []): + for instance in reservation.get('Instances', []): + instances.append(instance['InstanceId']) + return instances + except Exception as e: + log.warning(f"Error getting tagged EKS nodes for {cluster_name}: {e}") + return [] + + def _get_active_ssm_sessions(self) -> List[Dict[str, Any]]: + """Get all active SSM sessions.""" + try: + sessions = [] + paginator = self.ssm_client.get_paginator('describe_sessions') + + for page in paginator.paginate(State='Active'): + for session in page.get('Sessions', []): + sessions.append({ + 'session_id': session.get('SessionId'), + 'target': session.get('Target'), + 'status': session.get('Status'), + 'start_time': session.get('StartDate'), + 'owner': session.get('Owner'), + 'document_name': session.get('DocumentName'), + 'reason': session.get('Reason', '') + }) + + return sessions + except Exception as e: + log.warning(f"Error getting SSM sessions: {e}") + return [] + + def _filter_eks_sessions( + self, + sessions: List[Dict[str, Any]], + cluster_nodes: Dict[str, List[str]] + ) -> List[Dict[str, Any]]: + """Filter sessions to only those on EKS nodes.""" + eks_sessions = [] + + # Create reverse 
mapping: instance_id -> cluster_name + instance_to_cluster = {} + for cluster_name, instance_ids in cluster_nodes.items(): + for instance_id in instance_ids: + instance_to_cluster[instance_id] = cluster_name + + for session in sessions: + target = session.get('target', '') + # SSM target format is typically i-xxxxxxxxx or mi-xxxxxxxxx + instance_id = target.replace('mi-', 'i-') if target.startswith('mi-') else target + + if instance_id in instance_to_cluster: + session['cluster_name'] = instance_to_cluster[instance_id] + session['instance_id'] = instance_id + eks_sessions.append(session) + + return eks_sessions + + def _analyze_cluster_sessions( + self, + cluster_name: str, + sessions: List[Dict[str, Any]], + node_ids: List[str] + ) -> Dict[str, Any]: + """ + Analyze sessions for a specific cluster. + + Args: + cluster_name: Name of the EKS cluster + sessions: List of sessions on this cluster's nodes + node_ids: List of node instance IDs + + Returns: + Analysis result for the cluster + """ + now = datetime.now(tzutc()) + findings = [] + risk_level = 'low' + + # Check for active sessions + if sessions: + # Flag long-running sessions (> 1 hour) + long_sessions = [] + for session in sessions: + start_time = session.get('start_time') + if start_time: + if isinstance(start_time, datetime): + duration = now - start_time + if duration > timedelta(hours=1): + long_sessions.append({ + 'session_id': session.get('session_id'), + 'owner': session.get('owner'), + 'duration_hours': round(duration.total_seconds() / 3600, 1), + 'instance_id': session.get('instance_id') + }) + + if long_sessions: + findings.append({ + 'type': 'long_running_sessions', + 'severity': 'medium', + 'message': f"{len(long_sessions)} session(s) running for more than 1 hour", + 'details': long_sessions + }) + risk_level = 'medium' + + # Check for multiple simultaneous sessions + if len(sessions) > 3: + findings.append({ + 'type': 'multiple_sessions', + 'severity': 'medium', + 'message': f"{len(sessions)} 
simultaneous sessions detected", + 'details': [ + {'session_id': s.get('session_id'), 'owner': s.get('owner')} + for s in sessions + ] + }) + risk_level = 'medium' + + # Check for sessions from unusual document names + unusual_docs = [ + s for s in sessions + if s.get('document_name') and 'SSM-SessionManagerRunShell' not in s.get('document_name', '') + ] + if unusual_docs: + findings.append({ + 'type': 'unusual_session_type', + 'severity': 'low', + 'message': f"{len(unusual_docs)} session(s) using non-standard document", + 'details': [ + {'session_id': s.get('session_id'), 'document': s.get('document_name')} + for s in unusual_docs + ] + }) + + recommendation = None + if risk_level == 'medium': + recommendation = ( + "Review active sessions on EKS nodes. Long-running or multiple simultaneous " + "sessions may indicate operational issues or security concerns. Consider " + "implementing session time limits and audit logging." + ) + elif sessions: + recommendation = ( + "Active sessions detected on EKS nodes. Ensure all sessions are authorized " + "and follow security best practices." + ) + else: + recommendation = "No active sessions on EKS nodes. Session access appears secure." 
+ + return { + 'resource_type': 'eks_cluster_sessions', + 'cluster_name': cluster_name, + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'total_nodes': len(node_ids), + 'active_sessions': len(sessions), + 'risk_level': risk_level, + 'findings': findings, + 'sessions': [ + { + 'session_id': s.get('session_id'), + 'owner': s.get('owner'), + 'instance_id': s.get('instance_id'), + 'start_time': s.get('start_time').isoformat() if isinstance(s.get('start_time'), datetime) else str(s.get('start_time')), + 'status': s.get('status') + } + for s in sessions + ], + 'recommendation': recommendation, + 'estimated_monthly_savings': 0.0 # Session analysis is security-focused + } + + def _generate_session_summary( + self, + all_sessions: List[Dict[str, Any]] + ) -> Dict[str, Any]: + """Generate an overall session summary.""" + # Group sessions by owner + owners = {} + for session in all_sessions: + owner = session.get('owner', 'unknown') + if owner not in owners: + owners[owner] = 0 + owners[owner] += 1 + + return { + 'resource_type': 'eks_session_summary', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'total_active_sessions': len(all_sessions), + 'unique_owners': len(owners), + 'sessions_by_owner': owners, + 'recommendation': ( + f"Total of {len(all_sessions)} active sessions across EKS nodes. " + f"Sessions from {len(owners)} unique user(s). Review for compliance " + "with access policies." + ), + 'estimated_monthly_savings': 0.0 + } diff --git a/analyzers/rto_analyzer.py b/analyzers/rto_analyzer.py new file mode 100644 index 0000000..13afb9f --- /dev/null +++ b/analyzers/rto_analyzer.py @@ -0,0 +1,513 @@ +""" +RTO (Recovery Time Objective) Analyzer module. + +Analyzes backup configurations, cross-region replication, and disaster +recovery readiness to evaluate Recovery Time Objectives (RTO) and +Recovery Point Objectives (RPO). 
def analyze(self, **kwargs) -> List[Dict[str, Any]]:
    """
    Run every disaster-recovery check and collect the findings.

    The RDS, EBS, S3, DynamoDB and AWS Backup checks run in that order;
    an overall DR summary computed from their combined findings is
    appended last. If any step raises, the error is logged and printed
    and a single ``{'error': ...}`` entry is returned instead.

    Returns:
        List of DR findings and recommendations
    """
    # Method names are resolved one at a time inside the loop so each
    # check is looked up only when its turn comes.
    check_names = (
        '_analyze_rds_backups',       # RDS backup configurations
        '_analyze_ebs_snapshots',     # EBS snapshot policies
        '_analyze_s3_replication',    # S3 replication
        '_analyze_dynamodb_backups',  # DynamoDB backups
        '_analyze_aws_backup',        # AWS Backup coverage
    )
    findings = []

    try:
        print_info("Analyzing disaster recovery configurations...")

        for check_name in check_names:
            findings.extend(getattr(self, check_name)())

        # Generate overall DR summary
        findings.append(self._generate_dr_summary(findings))
    except Exception as e:
        log.error(f"Error in RTO analysis: {e}")
        print_error(f"RTO analysis failed: {e}")
        return [{'error': str(e)}]

    return findings
'severity': 'medium', + 'message': f'Backup retention is only {backup_retention} days' + }) + rpo_estimate = f'{backup_retention} days max' + else: + rpo_estimate = f'{backup_retention} days max' + + # Evaluate Multi-AZ + if not multi_az: + findings.append({ + 'type': 'single_az', + 'severity': 'medium', + 'message': 'Database is not Multi-AZ (higher RTO on failure)' + }) + rto_estimate = '10-30 minutes (point-in-time recovery)' + else: + rto_estimate = '1-2 minutes (automatic failover)' + + # Check encryption + if not storage_encrypted: + findings.append({ + 'type': 'unencrypted', + 'severity': 'low', + 'message': 'Storage is not encrypted' + }) + + # Generate recommendation + if findings: + if any(f['severity'] == 'critical' for f in findings): + recommendation = ( + f"CRITICAL: Enable automated backups for {db_id} immediately. " + "Current configuration provides no disaster recovery capability." + ) + elif not multi_az: + recommendation = ( + f"Enable Multi-AZ for {db_id} to reduce RTO from ~30 minutes " + "to ~2 minutes. Consider increasing backup retention to 14+ days." + ) + else: + recommendation = ( + f"Review {db_id} backup retention ({backup_retention} days) " + "and consider cross-region read replicas for regional DR." + ) + else: + recommendation = f"{db_id} has good DR configuration." 
+ + results.append({ + 'resource_type': 'rds_dr_analysis', + 'resource_id': db_id, + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'engine': engine, + 'backup_retention_days': backup_retention, + 'multi_az': multi_az, + 'storage_encrypted': storage_encrypted, + 'read_replicas': len(read_replicas), + 'estimated_rto': rto_estimate, + 'estimated_rpo': rpo_estimate, + 'findings': findings, + 'recommendation': recommendation, + 'estimated_monthly_savings': 0.0 + }) + + except Exception as e: + log.warning(f"Error analyzing RDS backups: {e}") + + return results + + def _analyze_ebs_snapshots(self) -> List[Dict[str, Any]]: + """Analyze EBS volume snapshot coverage.""" + results = [] + + try: + # Get all volumes + volumes_paginator = self.ec2_client.get_paginator('describe_volumes') + volumes = [] + for page in volumes_paginator.paginate(): + volumes.extend(page.get('Volumes', [])) + + if not volumes: + return results + + # Get recent snapshots + snapshots_paginator = self.ec2_client.get_paginator('describe_snapshots') + recent_snapshots = {} + cutoff_date = datetime.now(tzutc()) - timedelta(days=7) + + for page in snapshots_paginator.paginate(OwnerIds=['self']): + for snapshot in page.get('Snapshots', []): + vol_id = snapshot.get('VolumeId') + start_time = snapshot.get('StartTime') + if vol_id and start_time and start_time > cutoff_date: + if vol_id not in recent_snapshots: + recent_snapshots[vol_id] = start_time + elif start_time > recent_snapshots[vol_id]: + recent_snapshots[vol_id] = start_time + + # Analyze each volume + volumes_without_snapshots = [] + volumes_with_old_snapshots = [] + + for volume in volumes: + vol_id = volume['VolumeId'] + vol_size = volume.get('Size', 0) + vol_state = volume.get('State', 'unknown') + + if vol_id not in recent_snapshots: + # Check for any snapshot + volumes_without_snapshots.append({ + 'volume_id': vol_id, + 'size_gb': vol_size, + 'state': vol_state + }) + + # Create summary finding + if 
volumes_without_snapshots: + results.append({ + 'resource_type': 'ebs_snapshot_coverage', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'total_volumes': len(volumes), + 'volumes_without_recent_snapshot': len(volumes_without_snapshots), + 'coverage_percent': round((len(volumes) - len(volumes_without_snapshots)) / len(volumes) * 100, 1), + 'unprotected_volumes': volumes_without_snapshots[:20], # Top 20 + 'estimated_rpo': 'Unknown (no recent snapshots for some volumes)', + 'findings': [{ + 'type': 'missing_snapshots', + 'severity': 'high' if len(volumes_without_snapshots) > len(volumes) * 0.5 else 'medium', + 'message': f'{len(volumes_without_snapshots)} volumes have no snapshots in the last 7 days' + }], + 'recommendation': ( + f"Enable automated snapshots for {len(volumes_without_snapshots)} unprotected volumes. " + "Consider using AWS Backup or Data Lifecycle Manager for consistent protection." + ), + 'estimated_monthly_savings': 0.0 + }) + else: + results.append({ + 'resource_type': 'ebs_snapshot_coverage', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'total_volumes': len(volumes), + 'volumes_without_recent_snapshot': 0, + 'coverage_percent': 100.0, + 'estimated_rpo': '7 days or better', + 'findings': [], + 'recommendation': 'All EBS volumes have recent snapshots. 
Good DR posture.', + 'estimated_monthly_savings': 0.0 + }) + + except Exception as e: + log.warning(f"Error analyzing EBS snapshots: {e}") + + return results + + def _analyze_s3_replication(self) -> List[Dict[str, Any]]: + """Analyze S3 bucket replication configurations.""" + results = [] + + try: + response = self.s3_client.list_buckets() + buckets = response.get('Buckets', []) + + buckets_without_replication = [] + buckets_with_replication = [] + + for bucket in buckets: + bucket_name = bucket['Name'] + + try: + # Check for replication configuration + replication = self.s3_client.get_bucket_replication(Bucket=bucket_name) + rules = replication.get('ReplicationConfiguration', {}).get('Rules', []) + if rules: + buckets_with_replication.append({ + 'bucket': bucket_name, + 'rules': len(rules) + }) + except self.s3_client.exceptions.ClientError as e: + if 'ReplicationConfigurationNotFoundError' in str(e): + buckets_without_replication.append(bucket_name) + + total_buckets = len(buckets) + replicated = len(buckets_with_replication) + + findings = [] + if buckets_without_replication: + severity = 'medium' if replicated > 0 else 'high' + findings.append({ + 'type': 'no_replication', + 'severity': severity, + 'message': f'{len(buckets_without_replication)} buckets have no cross-region replication' + }) + + results.append({ + 'resource_type': 's3_replication_analysis', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'total_buckets': total_buckets, + 'buckets_with_replication': replicated, + 'replication_coverage_percent': round(replicated / total_buckets * 100, 1) if total_buckets > 0 else 0, + 'buckets_without_replication': buckets_without_replication[:20], + 'estimated_rpo': 'Minutes (for replicated)' if replicated > 0 else 'Unknown (no replication)', + 'findings': findings, + 'recommendation': ( + f"{replicated}/{total_buckets} buckets have replication enabled. " + "Enable cross-region replication for critical data buckets." 
+ ) if buckets_without_replication else 'S3 replication is well configured.', + 'estimated_monthly_savings': 0.0 + }) + + except Exception as e: + log.warning(f"Error analyzing S3 replication: {e}") + + return results + + def _analyze_dynamodb_backups(self) -> List[Dict[str, Any]]: + """Analyze DynamoDB table backup configurations.""" + results = [] + + try: + paginator = self.dynamodb_client.get_paginator('list_tables') + tables = [] + for page in paginator.paginate(): + tables.extend(page.get('TableNames', [])) + + if not tables: + return results + + tables_without_pitr = [] + tables_with_pitr = [] + + for table_name in tables: + try: + response = self.dynamodb_client.describe_continuous_backups( + TableName=table_name + ) + pitr = response.get('ContinuousBackupsDescription', {}).get( + 'PointInTimeRecoveryDescription', {} + ).get('PointInTimeRecoveryStatus', 'DISABLED') + + if pitr == 'ENABLED': + tables_with_pitr.append(table_name) + else: + tables_without_pitr.append(table_name) + + except Exception as e: + log.warning(f"Error checking PITR for table {table_name}: {e}") + tables_without_pitr.append(table_name) + + findings = [] + if tables_without_pitr: + findings.append({ + 'type': 'pitr_disabled', + 'severity': 'high' if len(tables_without_pitr) > len(tables) * 0.5 else 'medium', + 'message': f'{len(tables_without_pitr)} DynamoDB tables lack point-in-time recovery' + }) + + results.append({ + 'resource_type': 'dynamodb_backup_analysis', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'total_tables': len(tables), + 'tables_with_pitr': len(tables_with_pitr), + 'pitr_coverage_percent': round(len(tables_with_pitr) / len(tables) * 100, 1) if tables else 0, + 'tables_without_pitr': tables_without_pitr[:20], + 'estimated_rpo': '5 minutes (for PITR-enabled)' if tables_with_pitr else 'Unknown (no PITR)', + 'findings': findings, + 'recommendation': ( + f"Enable Point-in-Time Recovery for {len(tables_without_pitr)} DynamoDB tables " + "to achieve 
5-minute RPO. PITR provides continuous backups with 35-day retention." + ) if tables_without_pitr else 'All DynamoDB tables have PITR enabled.', + 'estimated_monthly_savings': 0.0 + }) + + except Exception as e: + log.warning(f"Error analyzing DynamoDB backups: {e}") + + return results + + def _analyze_aws_backup(self) -> List[Dict[str, Any]]: + """Analyze AWS Backup plan coverage.""" + results = [] + + try: + # List backup plans + plans_response = self.backup_client.list_backup_plans() + plans = plans_response.get('BackupPlansList', []) + + # Get protected resources + protected_resources = [] + try: + selections_response = self.backup_client.list_protected_resources() + protected_resources = selections_response.get('Results', []) + except Exception as e: + log.warning(f"Error listing protected resources: {e}") + + findings = [] + if not plans: + findings.append({ + 'type': 'no_backup_plans', + 'severity': 'high', + 'message': 'No AWS Backup plans configured' + }) + + results.append({ + 'resource_type': 'aws_backup_analysis', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'backup_plans': len(plans), + 'protected_resources': len(protected_resources), + 'plan_names': [p.get('BackupPlanName', 'Unknown') for p in plans], + 'findings': findings, + 'recommendation': ( + "No AWS Backup plans detected. Create a backup plan to centralize " + "protection for EC2, RDS, EFS, DynamoDB, and other resources." + ) if not plans else ( + f"{len(plans)} backup plan(s) protecting {len(protected_resources)} resources. " + "Review coverage to ensure all critical resources are included." 
+ ), + 'estimated_monthly_savings': 0.0 + }) + + except Exception as e: + log.warning(f"Error analyzing AWS Backup: {e}") + + return results + + def _generate_dr_summary( + self, + all_results: List[Dict[str, Any]] + ) -> Dict[str, Any]: + """Generate overall disaster recovery summary.""" + # Count findings by severity + critical_count = 0 + high_count = 0 + medium_count = 0 + + for result in all_results: + for finding in result.get('findings', []): + severity = finding.get('severity', 'low') + if severity == 'critical': + critical_count += 1 + elif severity == 'high': + high_count += 1 + elif severity == 'medium': + medium_count += 1 + + # Determine overall DR readiness + if critical_count > 0: + dr_status = 'Critical' + dr_score = 'Poor' + elif high_count > 2: + dr_status = 'Needs Improvement' + dr_score = 'Fair' + elif high_count > 0 or medium_count > 3: + dr_status = 'Moderate' + dr_score = 'Good' + else: + dr_status = 'Good' + dr_score = 'Excellent' + + return { + 'resource_type': 'dr_summary', + 'region': self.aws_config.region, + 'account_id': self.account_id, + 'dr_status': dr_status, + 'dr_score': dr_score, + 'critical_findings': critical_count, + 'high_findings': high_count, + 'medium_findings': medium_count, + 'total_findings': critical_count + high_count + medium_count, + 'recommendation': ( + f"Overall DR Status: {dr_status}. " + f"Found {critical_count} critical, {high_count} high, and {medium_count} medium findings. " + "Address critical and high findings first to improve disaster recovery posture." + ), + 'estimated_monthly_savings': 0.0 + } diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile new file mode 100644 index 0000000..53d0bda --- /dev/null +++ b/ci/Jenkinsfile @@ -0,0 +1,239 @@ +/** + * Dedo-Duro AWS Resource Analysis Pipeline + * + * This Jenkinsfile runs the Dedo-Duro analyzer to identify AWS cost + * optimization opportunities and generate reports. 
/**
 * Dedo-Duro AWS Resource Analysis Pipeline
 *
 * This Jenkinsfile runs the Dedo-Duro analyzer to identify AWS cost
 * optimization opportunities and generate reports.
 *
 * Prerequisites:
 * - AWS credentials configured (via IAM role or credentials plugin)
 * - Python 3.9+ available on agent
 * - pip installed
 *
 * Parameters:
 * - AWS_REGION: Target AWS region (optional)
 * - RESOURCE_TYPES: Comma-separated analyzer keys (optional)
 * - OUTPUT_FORMAT: Report format (html, json, csv)
 * - MULTI_REGION: Analyze all regions (boolean)
 * - ENVIRONMENT_FILTER: Filter by environment tag (prod, test, dev)
 * - S3_BUCKET: S3 bucket for report upload (optional)
 */

pipeline {
    agent any

    parameters {
        string(
            name: 'AWS_REGION',
            defaultValue: '',
            description: 'AWS region to analyze (leave empty for default)'
        )
        string(
            name: 'RESOURCE_TYPES',
            defaultValue: '',
            description: 'Comma-separated list of analyzer keys (e.g., ec2,s3,rds)'
        )
        choice(
            name: 'OUTPUT_FORMAT',
            choices: ['html', 'json', 'csv'],
            description: 'Report output format'
        )
        booleanParam(
            name: 'MULTI_REGION',
            defaultValue: false,
            description: 'Analyze resources across all AWS regions'
        )
        string(
            name: 'ENVIRONMENT_FILTER',
            defaultValue: '',
            description: 'Filter resources by environment (prod, test, dev)'
        )
        string(
            name: 'S3_BUCKET',
            defaultValue: '',
            description: 'S3 bucket for report upload (optional)'
        )
    }

    environment {
        // NOTE(review): neither variable is referenced anywhere in this file.
        PYTHON_VERSION = '3.11'
        REPORT_DIR = 'reports'
    }

    triggers {
        // Run weekly on Monday at 6 AM
        cron('H 6 * * 1')
    }

    options {
        buildDiscarder(logRotator(numToKeepStr: '30'))
        timestamps()
        timeout(time: 60, unit: 'MINUTES')
        disableConcurrentBuilds()
    }

    stages {
        stage('Checkout') {
            steps {
                checkout scm
            }
        }

        stage('Setup Python Environment') {
            steps {
                sh '''
                    python3 -m venv venv
                    . venv/bin/activate
                    pip install --upgrade pip
                    pip install -r requirements.txt
                '''
            }
        }

        // The free-text parameters are later interpolated into a shell command
        // line ('Run Analysis'), so they are checked against strict patterns first.
        stage('Validate Parameters') {
            steps {
                script {
                    // Validate region format if provided. One or more word
                    // segments are allowed between the two-letter prefix and
                    // the number, so "us-east-1" and "us-gov-west-1" both pass.
                    if (params.AWS_REGION) {
                        if (!(params.AWS_REGION ==~ /^[a-z]{2}(-[a-z]+)+-\d+$/)) {
                            error("Invalid AWS region format: ${params.AWS_REGION}")
                        }
                    }

                    // Validate resource types format if provided
                    if (params.RESOURCE_TYPES) {
                        if (!(params.RESOURCE_TYPES ==~ /^[a-zA-Z0-9_,]+$/)) {
                            error("Invalid resource types format: ${params.RESOURCE_TYPES}")
                        }
                    }

                    // Validate environment filter if provided
                    if (params.ENVIRONMENT_FILTER) {
                        def validEnvs = ['prod', 'production', 'test', 'testing', 'dev', 'development', 'staging', 'qa']
                        if (!(params.ENVIRONMENT_FILTER.toLowerCase() in validEnvs)) {
                            error("Invalid environment filter: ${params.ENVIRONMENT_FILTER}")
                        }
                    }
                }
            }
        }

        stage('Run Analysis') {
            steps {
                withAWS(credentials: 'aws-credentials', region: params.AWS_REGION ?: 'us-east-1') {
                    script {
                        def cmd = '. venv/bin/activate && python main.py'

                        // Add optional parameters
                        if (params.AWS_REGION) {
                            cmd += " --region ${params.AWS_REGION}"
                        }

                        if (params.RESOURCE_TYPES) {
                            cmd += " --resource-types ${params.RESOURCE_TYPES}"
                        }

                        cmd += " --output-format ${params.OUTPUT_FORMAT}"

                        if (params.MULTI_REGION) {
                            cmd += ' --multi-region'
                        }

                        if (params.ENVIRONMENT_FILTER) {
                            cmd += " --environment ${params.ENVIRONMENT_FILTER}"
                        }

                        sh cmd
                    }
                }
            }
        }

        stage('Archive Reports') {
            steps {
                script {
                    // Find and archive the report
                    def reportPattern = "aws-optimization-report.${params.OUTPUT_FORMAT}"

                    archiveArtifacts(
                        artifacts: reportPattern,
                        allowEmptyArchive: false,
                        fingerprint: true
                    )

                    // Publish HTML report if format is HTML
                    if (params.OUTPUT_FORMAT == 'html') {
                        publishHTML(target: [
                            allowMissing: false,
                            alwaysLinkToLastBuild: true,
                            keepAll: true,
                            reportDir: '.',
                            reportFiles: 'aws-optimization-report.html',
                            reportName: 'Dedo-Duro Report'
                        ])
                    }
                }
            }
        }

        // Runs only when an upload bucket was supplied.
        stage('Upload to S3') {
            when {
                expression { params.S3_BUCKET?.trim() }
            }
            steps {
                withAWS(credentials: 'aws-credentials', region: params.AWS_REGION ?: 'us-east-1') {
                    script {
                        def timestamp = new Date().format('yyyy-MM-dd')
                        def reportFile = "aws-optimization-report.${params.OUTPUT_FORMAT}"
                        def s3Key = "dedo-duro-reports/${timestamp}/${reportFile}"

                        s3Upload(
                            bucket: params.S3_BUCKET,
                            file: reportFile,
                            path: s3Key
                        )

                        echo "Report uploaded to s3://${params.S3_BUCKET}/${s3Key}"
                    }
                }
            }
        }
    }

    post {
        always {
            cleanWs()
        }

        success {
            echo 'Dedo-Duro analysis completed successfully!'

            // Send Slack notification if configured; a failing notification
            // is only echoed and never fails the build.
            script {
                try {
                    slackSend(
                        channel: '#aws-cost-alerts',
                        color: 'good',
                        message: "Dedo-Duro Analysis Complete - ${env.BUILD_URL}"
                    )
                } catch (Exception e) {
                    echo "Slack notification skipped: ${e.message}"
                }
            }
        }

        failure {
            echo 'Dedo-Duro analysis failed!'

            // Same best-effort notification as on success.
            script {
                try {
                    slackSend(
                        channel: '#aws-cost-alerts',
                        color: 'danger',
                        message: "Dedo-Duro Analysis Failed - ${env.BUILD_URL}"
                    )
                } catch (Exception e) {
                    echo "Slack notification skipped: ${e.message}"
                }
            }
        }
    }
}
+ +
+

Executive Summary

+

This report identifies potential cost-saving opportunities and security findings across your AWS resources.

+
+
AWS Account ID
123456789012
+
AWS Account Alias
simulated-account
+ +
Analyzed Regions
eu-west-1, us-east-1, us-west-2
+ +
Analysis Date
2026-01-26 18:50:06 UTC
+
Resources Analyzed
84
+
Optimization Opportunities
20 (23.8%)
+
Est. Monthly Savings (Spot Priority)
$534.97
+
Est. Annual Savings (Spot Priority)
$6419.66
+
Est. Monthly Savings (Right-Sizing Priority)
$891.62
+
Est. Annual Savings (Right-Sizing Priority)
$10699.44
+ +
Est. Annual Savings (Schedule)
$89192.84
+ +
Old EBS Snapshots (> 1 Year)
24
+
Est. Monthly Cost (Old Snapshots)
$309.85
+ +
Security Findings
14
+ +
+
+ +
+

Visual Analytics

+

Interactive charts showing cost optimization opportunities and resource distribution.

+
+ +
+

Top Savings by Resource Type (Monthly $)

+ +
+ +
+

Resources Analyzed by Type

+ +
+ +
+

AI/ML Services Resources

+ +
+ +
+
+ + +
+
+

Top 10 Savings Opportunities

+
+

Focus on these items for the highest potential impact based on estimated monthly savings.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
RankResource TypeIdentifier (ID/Name)Current ConfigEst. Monthly SavingsRecommendation
1Cost Exploreranomaly-6552 (Amazon EC2)N/A$13848.16Investigate 140.0% cost increase
2Cost Exploreranomaly-2328 (Amazon EC2)N/A$2541.51Investigate 41.0% cost increase
3Cost Exploreranomaly-4036 (AWS Lambda)N/A$2283.17Investigate 64.0% cost increase
4Cost Exploreranomaly-5958 (Amazon RDS)N/A$1775.38Investigate 58.0% cost increase
5Bedrockpt-claude-instant-prod (Claude Instant Production)N/A$1200.00Consider on-demand pricing for variable workloads
6Cost Exploreranomaly-9783 (Amazon RDS)N/A$1083.85Investigate 85.0% cost increase
7Ec2i-0537d818 (dev-web-01)t3.medium$394.47Consider stopping
8Sagemakernotebook-experimentation-3 (notebook-3)ml.t3.medium$309.13Stop idle notebook instance
9Ec2i-023f4017 (prod-api-02)m5.xlarge$216.67Consider stopping
10S3bucket-3N/A$212.01Add lifecycle policy for cost optimization
+
+
+
+ +
+

Table of Contents

+ +
+

Ec2 (15 findings)

Account IDAccount AliasInstance IDNameRegionNike OwnerCurrent TypeRecommended TypeOSCPU Utilization (%)Network I/O (MB/s)StatusRecommendationEst. Monthly Savings
123456789012simulated-accounti-00d8b28fdev-web-01eu-west-1devops-teamt3.mediumt3.microUbuntu 22.0445.50%93.2IdleConsider stopping$394.47
123456789012simulated-accounti-035609e7prod-api-02eu-west-1platform-teamm5.xlargem6g.mediumUbuntu 22.048.00%14.7IdleConsider stopping$216.67
123456789012simulated-accounti-03358932prod-worker-03us-west-2data-teamm5.xlargem5.xlargeUbuntu 22.0418.50%8.8ActiveConsider no action needed$0.00
123456789012simulated-accounti-02ad82a3dev-worker-04eu-west-1ml-teamc5.xlarget3.microWindows Server 202232.50%7.4OversizedConsider downsizing$112.28
123456789012simulated-accounti-01114819prod-api-05us-west-2platform-teamt3.smallt3.smallAmazon Linux 202358.90%43.5ActiveConsider no action needed$0.00
123456789012simulated-accounti-02d29aa0dev-api-06eu-west-1platform-teamr5.xlarger5.xlargeWindows Server 202236.90%52.1ActiveConsider no action needed$0.00
123456789012simulated-accounti-015221cdstaging-web-07us-east-1devops-teamm5.xlargem5.xlargeRHEL 871.10%41.9ActiveConsider no action needed$0.00
123456789012simulated-accounti-04abd119prod-db-08us-west-2devops-teamm5.largem6g.largeUbuntu 22.048.80%46.9IdleConsider stopping$168.20
123456789012simulated-accounti-05f074beprod-api-09us-east-1backend-teamt3.smallt3.smallAmazon Linux 242.90%74.5ActiveConsider no action needed$0.00
123456789012simulated-accounti-04923c56prod-api-10us-west-2ml-teamt3.smallt3.smallRHEL 822.50%121.2ActiveConsider no action needed$0.00
123456789012simulated-accounti-03103acdstaging-worker-11us-east-1ml-teamr5.xlarger5.xlargeAmazon Linux 202316.60%70.3ActiveConsider no action needed$0.00
123456789012simulated-accounti-0407a0dedev-db-12eu-west-1devops-teamr5.larger5.largeAmazon Linux 202326.30%80.8ActiveConsider no action needed$0.00
123456789012simulated-accounti-00b3ab99dev-worker-13us-east-1devops-teamr5.larger5.largeRHEL 873.60%44.0ActiveConsider no action needed$0.00
123456789012simulated-accounti-01d666b1dev-api-14us-west-2backend-teamt3.microt3.microAmazon Linux 202327.60%17.9ActiveConsider no action needed$0.00
123456789012simulated-accounti-01068c9adev-db-15eu-west-1platform-teamt3.mediumt3.mediumAmazon Linux 202353.80%89.8ActiveConsider no action needed$0.00

Ebs (12 findings)

Account IDAccount AliasVolume IDNameRegionNike OwnerCurrent TypeSize (GB)IOPSThroughput (MB/s)Attached InstanceRecommendationEst. Monthly Savings
123456789012simulated-accountvol-031e9a90volume-temp-1us-west-2devops-teamgp317483102405i-01f825dfOK$0.00
123456789012simulated-accountvol-04d839c3volume-data-2us-east-1devops-teamgp2956790161i-01ffeabbMigrate to gp3$19.12
123456789012simulated-accountvol-05ecdf46volume-temp-3us-west-2devops-teamst11416137565i-04d2a439OK$0.00
123456789012simulated-accountvol-03764972volume-temp-4us-west-2backend-teamst111722510154i-05cd53d3OK$0.00
123456789012simulated-accountvol-040d127avolume-backup-5us-west-2devops-teamio112027267130i-029eb240OK$0.00
123456789012simulated-accountvol-04daf759volume-backup-6us-east-1backend-teamsc112782148158N/ADelete or snapshot unattached volume$102.24
123456789012simulated-accountvol-049a3f47volume-data-7us-west-2platform-teamgp2604184764i-02840dd9Migrate to gp3$12.08
123456789012simulated-accountvol-020a341cvolume-temp-8us-west-2backend-teamsc11911418136i-026f333fOK$0.00
123456789012simulated-accountvol-05229e60volume-data-9us-east-1devops-teamio1150014110189i-04fc40d0OK$0.00
123456789012simulated-accountvol-011417cbvolume-temp-10us-west-2data-teamio214294546208N/ADelete or snapshot unattached volume$114.32
123456789012simulated-accountvol-00de0275volume-temp-11us-west-2data-teamio115991453855i-03d7b010OK$0.00
123456789012simulated-accountvol-0176441avolume-root-12us-west-2platform-teamsc11221386170i-0135cdc5OK$0.00

S3 (10 findings)

Account IDAccount AliasBucket NameRegionNike OwnerStorage (GB)Lifecycle PolicyRecommendationEst. Monthly Savings
123456789012simulated-accountprod-logs-5399us-east-1platform-team43485ConfiguredWell configured$0.00
123456789012simulated-accountprod-static-assets-9432us-east-1backend-team24597ConfiguredWell configured$0.00
123456789012simulated-accountprod-temp-9053us-east-1data-team15363Not ConfiguredAdd lifecycle policy for cost optimization$212.01
123456789012simulated-accountprod-backups-5866us-east-1devops-team8898Not ConfiguredAdd lifecycle policy for cost optimization$122.79
123456789012simulated-accountdev-archives-3333us-east-1platform-team17386ConfiguredWell configured$0.00
123456789012simulated-accountprod-backups-1289us-east-1platform-team43061ConfiguredWell configured$0.00
123456789012simulated-accountprod-temp-8289us-east-1platform-team17821ConfiguredWell configured$0.00
123456789012simulated-accountdev-archives-4799us-east-1platform-team22562ConfiguredWell configured$0.00
123456789012simulated-accountstaging-backups-5279us-east-1devops-team14901ConfiguredWell configured$0.00
123456789012simulated-accountprod-data-lake-9344us-east-1platform-team12395Not ConfiguredAdd lifecycle policy for cost optimization$171.05

Rds (8 findings)

Account IDAccount AliasDB InstanceNameRegionNike OwnerCurrent TypeStorage (GB)CPU Utilization (%)ConnectionsRecommendationEst. Monthly Savings
123456789012simulated-accountdb-prod-replica-01database-1us-west-2data-teamdb.t3.medium34547.00%301Well utilized$0.00
123456789012simulated-accountdb-staging-main-02database-2us-west-2data-teamdb.t3.small4971.20%385Well utilized$0.00
123456789012simulated-accountdb-prod-replica-03database-3us-east-1analytics-teamdb.r5.xlarge60035.50%169Well utilized$0.00
123456789012simulated-accountdb-dev-analytics-04database-4us-east-1backend-teamdb.r5.large25749.60%52Well utilized$0.00
123456789012simulated-accountdb-dev-analytics-05database-5us-west-2analytics-teamdb.m5.xlarge20749.40%260Well utilized$0.00
123456789012simulated-accountdb-dev-analytics-06database-6us-west-2data-teamdb.t3.micro55256.90%178Well utilized$0.00
123456789012simulated-accountdb-prod-main-07database-7us-west-2platform-teamdb.t3.micro66317.60%223Consider downsizing instance class$42.03
123456789012simulated-accountdb-dev-analytics-08database-8us-east-1backend-teamdb.t3.medium44433.50%336Well utilized$0.00

SageMaker Analysis (5 findings)

Analysis of SageMaker endpoint and notebook instance usage.

Account IDAccount AliasResource ARNNameRegionNike OwnerTypeStatusRecommendationEst. Monthly Savings
123456789012simulated-accountarn:aws:sagemaker:us-east-1:123456789012:notebook-instance/notebook-data-science-1notebook-1us-east-1ml-teamNotebook InstanceIdleStop idle notebook instance$0.00
123456789012simulated-accountarn:aws:sagemaker:us-east-1:123456789012:notebook-instance/notebook-data-science-2notebook-2us-east-1ml-teamNotebook InstanceIdleStop idle notebook instance$0.00
123456789012simulated-accountarn:aws:sagemaker:us-east-1:123456789012:notebook-instance/notebook-experimentation-3notebook-3us-east-1ml-teamNotebook InstanceIdleStop idle notebook instance$309.13
123456789012simulated-accountarn:aws:sagemaker:us-east-1:123456789012:endpoint/endpoint-prediction-1endpoint-1us-east-1ml-teamInference EndpointActiveWell utilized endpoint$0.00
123456789012simulated-accountarn:aws:sagemaker:us-east-1:123456789012:endpoint/endpoint-inference-2endpoint-2us-east-1ml-teamInference EndpointActiveWell utilized endpoint$0.00

Lambda Analysis (10 findings)

Analysis of Lambda function configurations and usage.

Account IDAccount AliasFunction NameRegionNike OwnerRuntimeMemory Size (MB)RecommendationEst. Monthly Savings
123456789012simulated-accountprod-worker-1us-east-1backend-teamjava171024OK$0.00
123456789012simulated-accountdev-handler-2us-east-1data-teamgo1.x512OK$0.00
123456789012simulated-accountdev-processor-3us-east-1data-teamgo1.x3072OK$0.00
123456789012simulated-accountprod-api-4us-east-1backend-teamnodejs18.x3072OK$0.00
123456789012simulated-accountprod-handler-5us-east-1ml-teampython3.93072OK$0.00
123456789012simulated-accountdev-handler-6us-east-1data-teampython3.9128OK$0.00
123456789012simulated-accountdev-handler-7us-east-1backend-teamjava173072Reduce memory allocation$47.33
123456789012simulated-accountdev-processor-8us-east-1backend-teamnodejs20.x512OK$0.00
123456789012simulated-accountdev-api-9us-east-1backend-teampython3.9512OK$0.00
123456789012simulated-accountprod-api-10us-east-1ml-teamnodejs18.x512OK$0.00

Bedrock (1 finding)

Account IDAccount AliasResource IDNameRegionNike OwnerDetailsRecommendationEst. Monthly Savings
123456789012simulated-accountpt-claude-instant-prodClaude Instant Productionus-east-1ml-teamN/AConsider on-demand pricing for variable workloads$1,200.00

Cost Explorer (5 findings)

Account IDAccount AliasResource IDNameRegionNike OwnerDetailsRecommendationEst. Monthly Savings
123456789012simulated-accountanomaly-4036AWS Lambdaus-east-1finops-teamN/AInvestigate 64.0% cost increase$2,283.17
123456789012simulated-accountanomaly-5958Amazon RDSus-east-1finops-teamN/AInvestigate 58.0% cost increase$1,775.38
123456789012simulated-accountanomaly-6552Amazon EC2us-east-1finops-teamN/AInvestigate 140.0% cost increase$13,848.16
123456789012simulated-accountanomaly-9783Amazon RDSus-east-1finops-teamN/AInvestigate 85.0% cost increase$1,083.85
123456789012simulated-accountanomaly-2328Amazon EC2us-east-1finops-teamN/AInvestigate 41.0% cost increase$2,541.51

Eks Deployments (5 findings)

Account IDAccount AliasResource IDNameRegionNike OwnerDetailsRecommendationEst. Monthly Savings
123456789012simulated-accountprod-frontendfrontendus-east-1frontend-teamN/ADeployment healthy$0.00
123456789012simulated-accountprod-backend-apibackend-apius-east-1platform-teamN/AInvestigate restart issues$0.00
123456789012simulated-accountprod-workerworkerus-east-1devops-teamN/ADeployment healthy$0.00
123456789012simulated-accountstaging-schedulerschedulerus-east-1frontend-teamN/AReview and update stale deployment$0.00
123456789012simulated-accountstaging-gatewaygatewayus-east-1platform-teamN/AInvestigate restart issues$0.00

Eks Sessions (6 findings)

| Account ID | Account Alias | Resource ID | Name | Region | Nike Owner | Details | Recommendation | Est. Monthly Savings |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 123456789012 | simulated-account | session-313248 | staging-cluster-session-1 | us-east-1 | devops-team | N/A | Review long-running session for security | $0.00 |
| 123456789012 | simulated-account | session-800338 | dev-cluster-session-2 | us-east-1 | devops-team | N/A | Review long-running session for security | $0.00 |
| 123456789012 | simulated-account | session-228980 | staging-cluster-session-3 | us-east-1 | devops-team | N/A | Review long-running session for security | $0.00 |
| 123456789012 | simulated-account | session-256279 | prod-cluster-session-4 | us-east-1 | devops-team | N/A | Normal session activity | $0.00 |
| 123456789012 | simulated-account | session-127425 | dev-cluster-session-5 | us-east-1 | devops-team | N/A | Review long-running session for security | $0.00 |
| 123456789012 | simulated-account | session-559543 | prod-cluster-session-6 | us-east-1 | devops-team | N/A | Review long-running session for security | $0.00 |

RTO Analysis (7 findings)

| Account ID | Account Alias | Resource ID | Name | Region | Nike Owner | Details | Recommendation | Est. Monthly Savings |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 123456789012 | simulated-account | db-prod-1 | database-1 | us-east-1 | data-team | N/A | Enable cross-region backup for disaster recovery | $0.00 |
| 123456789012 | simulated-account | db-staging-2 | database-2 | us-east-1 | platform-team | N/A | Enable cross-region backup for disaster recovery | $0.00 |
| 123456789012 | simulated-account | db-staging-3 | database-3 | us-east-1 | devops-team | N/A | Backup configuration adequate | $0.00 |
| 123456789012 | simulated-account | db-staging-4 | database-4 | us-east-1 | platform-team | N/A | Backup configuration adequate | $0.00 |
| 123456789012 | simulated-account | bucket-critical-1 | critical-data-bucket-1 | us-east-1 | data-team | N/A | Enable cross-region replication for critical data | $0.00 |
| 123456789012 | simulated-account | bucket-critical-2 | critical-data-bucket-2 | us-east-1 | platform-team | N/A | Enable cross-region replication for critical data | $0.00 |
| 123456789012 | simulated-account | bucket-critical-3 | critical-data-bucket-3 | us-east-1 | data-team | N/A | Replication configured | $0.00 |
+
+
+

CloudWatch Agent Information

+
+

The CloudWatch Agent allows for more detailed metrics collection, including memory utilization, which can improve the accuracy of EC2 efficiency recommendations from both the EC2 Efficiency analyzer and AWS Compute Optimizer.

+

Benefits:

+
    +
  • Collects system-level metrics (CPU, memory, disk, network).
  • +
  • Collects custom metrics from your applications.
  • +
  • Collects logs from your instances.
  • +
+

Installation:

+

Refer to the official AWS documentation for installing and configuring the CloudWatch Agent:

+ +

Ensure the necessary IAM permissions are granted to your EC2 instances to send metrics and logs to CloudWatch.

+
+
+
+ +
+
+

Security Findings (14 findings)

+
+

Review the security findings identified during the analysis. Detailed findings are listed under the 'Security and Privacy' section if available.

+
+
+
+ + ↑ Top +
+
+

© 2026 Dedo Duro AWS Analyzer. All rights reserved.

+

Report generated on: 2026-01-26 18:50:06 UTC

+
+ + + + \ No newline at end of file diff --git a/config.py b/config.py index 0c6e65f..fb62cde 100755 --- a/config.py +++ b/config.py @@ -67,6 +67,13 @@ class AnalysisConfig: regions: Optional[List[str]] = None # List of regions to analyze in multi-region mode max_workers: int = 5 # Reduced from 8 + # Environment filtering: 'prod', 'test', 'dev', 'staging', or None for all + environment_filter: Optional[str] = None + # Tag keys to check for environment detection + environment_tags: List[str] = field(default_factory=lambda: [ + 'Environment', 'Env', 'Stage', 'environment', 'env', 'stage' + ]) + # API rate limiting delays (seconds) api_throttling: Dict[str, float] = field(default_factory=lambda: { 's3': 0.2, @@ -90,6 +97,27 @@ def get_batch_size_for_service(self, service: str) -> int: """Get the appropriate batch size for a service.""" return self.batch_sizes.get(service, self.batch_sizes['default']) + def matches_environment(self, tags: List[Dict[str, str]]) -> bool: + """ + Check if resource tags match the configured environment filter. + Returns True if no filter is set, or if tags match the filter. 
+ """ + if not self.environment_filter: + return True # No filter, include all resources + + # Convert tags list to dict for easier lookup + tags_dict = {tag.get('Key', ''): tag.get('Value', '') for tag in tags if isinstance(tag, dict)} + + # Check each environment tag key + for tag_key in self.environment_tags: + if tag_key in tags_dict: + tag_value = tags_dict[tag_key].lower() + filter_value = self.environment_filter.lower() + # Support partial matching (e.g., 'production' matches 'prod') + if filter_value in tag_value or tag_value.startswith(filter_value): + return True + return False + @dataclass class ReportConfig: """Configuration for report generation.""" @@ -98,6 +126,11 @@ class ReportConfig: include_charts: bool = True output_s3_bucket: Optional[str] = None output_s3_prefix: str = "" + # Tag keys to use for grouping resources in reports + grouping_tags: List[str] = field(default_factory=lambda: [ + 'Team', 'Project', 'Application', 'Owner', 'CostCenter', + 'team', 'project', 'application', 'owner', 'cost-center' + ]) def get_default_filename(self) -> str: """ @@ -109,4 +142,58 @@ def get_default_filename(self) -> str: timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M") base_name = 'aws-optimization-report' extension = self.output_format - return f"{base_name}_{timestamp}.{extension}" \ No newline at end of file + return f"{base_name}_{timestamp}.{extension}" + + +@dataclass +class MultiAccountConfig: + """Configuration for multi-account analysis.""" + enabled: bool = False + # List of account configurations: each dict has 'account_id', 'role_name', 'alias' (optional) + accounts: List[Dict[str, str]] = field(default_factory=list) + # Path to accounts JSON/YAML file + accounts_file: Optional[str] = None + # External ID for assume role (if required) + external_id: Optional[str] = None + # Session duration in seconds (default: 1 hour) + session_duration: int = 3600 + # Run accounts in parallel + parallel_accounts: bool = True + # Maximum parallel 
account workers + max_account_workers: int = 3 + # AWS partition (aws, aws-cn for China, aws-us-gov for GovCloud) + partition: str = 'aws' + + def load_accounts_from_file(self) -> List[Dict[str, str]]: + """Load account configurations from a JSON or YAML file.""" + import json + if not self.accounts_file: + return self.accounts + + try: + with open(self.accounts_file, 'r') as f: + if self.accounts_file.endswith('.yaml') or self.accounts_file.endswith('.yml'): + try: + import yaml + data = yaml.safe_load(f) + except ImportError: + raise ImportError("PyYAML is required to load YAML account files. Install with: pip install pyyaml") + else: + data = json.load(f) + + if isinstance(data, dict) and 'accounts' in data: + self.accounts = data['accounts'] + elif isinstance(data, list): + self.accounts = data + else: + raise ValueError("Accounts file must contain a list of accounts or a dict with 'accounts' key") + + return self.accounts + except FileNotFoundError: + raise FileNotFoundError(f"Accounts file not found: {self.accounts_file}") + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in accounts file: {e}") + + def get_assume_role_arn(self, account_id: str, role_name: str) -> str: + """Generate the ARN for assuming a role in another account.""" + return f"arn:{self.partition}:iam::{account_id}:role/{role_name}" \ No newline at end of file diff --git a/core/multi_account.py b/core/multi_account.py new file mode 100644 index 0000000..762e096 --- /dev/null +++ b/core/multi_account.py @@ -0,0 +1,374 @@ +""" +Multi-account orchestration for Dedo-Duro AWS Resource Analyzer. + +This module provides functionality to analyze resources across multiple AWS accounts +using AWS STS assume-role for cross-account access. 
+""" + +import logging +from typing import Dict, List, Any, Optional, Callable +from concurrent.futures import ThreadPoolExecutor, as_completed +import boto3 +from botocore.exceptions import ClientError + +from config import AWSConfig, AnalysisConfig, MultiAccountConfig +from utils.console import print_error, print_info, print_success, print_warning + +log = logging.getLogger(__name__) + + +class MultiAccountOrchestrator: + """Orchestrates analysis across multiple AWS accounts.""" + + def __init__( + self, + base_aws_config: AWSConfig, + analysis_config: AnalysisConfig, + multi_account_config: MultiAccountConfig + ): + """ + Initialize the multi-account orchestrator. + + Args: + base_aws_config: Base AWS configuration for the source account + analysis_config: Analysis configuration + multi_account_config: Multi-account configuration with target accounts + """ + self.base_aws_config = base_aws_config + self.analysis_config = analysis_config + self.multi_account_config = multi_account_config + self.sts_client = base_aws_config.create_client('sts') + self.results: Dict[str, Dict[str, Any]] = {} + self.account_credentials: Dict[str, Dict[str, str]] = {} + + def assume_role(self, account_id: str, role_name: str, session_name: str = 'DedoDuroAnalysis') -> Optional[Dict[str, str]]: + """ + Assume a role in the target account and return temporary credentials. 
+ + Args: + account_id: Target AWS account ID + role_name: IAM role name to assume + session_name: Name for the assumed role session + + Returns: + Dictionary with AccessKeyId, SecretAccessKey, SessionToken, or None on failure + """ + role_arn = self.multi_account_config.get_assume_role_arn(account_id, role_name) + log.info(f"Assuming role {role_arn} in account {account_id}") + + try: + assume_role_kwargs = { + 'RoleArn': role_arn, + 'RoleSessionName': session_name, + 'DurationSeconds': self.multi_account_config.session_duration + } + + # Add external ID if provided + if self.multi_account_config.external_id: + assume_role_kwargs['ExternalId'] = self.multi_account_config.external_id + + response = self.sts_client.assume_role(**assume_role_kwargs) + + credentials = response['Credentials'] + return { + 'AccessKeyId': credentials['AccessKeyId'], + 'SecretAccessKey': credentials['SecretAccessKey'], + 'SessionToken': credentials['SessionToken'], + 'Expiration': credentials['Expiration'].isoformat() + } + + except ClientError as e: + error_code = e.response.get('Error', {}).get('Code', 'Unknown') + error_msg = e.response.get('Error', {}).get('Message', str(e)) + log.error(f"Failed to assume role in account {account_id}: {error_code} - {error_msg}") + print_error(f"Cannot access account {account_id}: {error_code}") + return None + except Exception as e: + log.error(f"Unexpected error assuming role in account {account_id}: {e}") + print_error(f"Unexpected error accessing account {account_id}: {e}") + return None + + def create_account_config(self, account_id: str, credentials: Dict[str, str]) -> AWSConfig: + """ + Create an AWSConfig for a target account using assumed role credentials. 
+ + Args: + account_id: Target AWS account ID + credentials: Temporary credentials from assume_role + + Returns: + AWSConfig configured for the target account + """ + # Create a new config with the same settings but using temporary credentials + # We need to create a session with explicit credentials + return AWSConfigWithCredentials( + region=self.base_aws_config.region, + profile=None, # Don't use profile when using temporary credentials + max_attempts=self.base_aws_config.max_attempts, + retry_mode=self.base_aws_config.retry_mode, + max_pool_connections=self.base_aws_config.max_pool_connections, + connect_timeout=self.base_aws_config.connect_timeout, + read_timeout=self.base_aws_config.read_timeout, + credentials=credentials, + account_id=account_id + ) + + def analyze_single_account( + self, + account_config: Dict[str, str], + analysis_func: Callable[[AWSConfig, str], Dict[str, Any]] + ) -> Dict[str, Any]: + """ + Analyze resources in a single target account. + + Args: + account_config: Account configuration dict with 'account_id', 'role_name', 'alias' + analysis_func: Function to run analysis, takes (AWSConfig, account_id) and returns results + + Returns: + Analysis results for the account + """ + account_id = account_config['account_id'] + role_name = account_config.get('role_name', 'OrganizationAccountAccessRole') + account_alias = account_config.get('alias', account_id) + + print_info(f"Starting analysis for account: {account_alias} ({account_id})") + + # Assume role and get credentials + credentials = self.assume_role(account_id, role_name) + if not credentials: + return { + 'account_id': account_id, + 'account_alias': account_alias, + 'status': 'failed', + 'error': f'Failed to assume role {role_name}', + 'results': {} + } + + # Store credentials for potential reuse + self.account_credentials[account_id] = credentials + + # Create account-specific config + account_aws_config = self.create_account_config(account_id, credentials) + + try: + # Run the 
analysis function + results = analysis_func(account_aws_config, account_id) + print_success(f"Completed analysis for account: {account_alias}") + return { + 'account_id': account_id, + 'account_alias': account_alias, + 'status': 'success', + 'results': results + } + except Exception as e: + log.error(f"Error analyzing account {account_id}: {e}") + print_error(f"Analysis failed for account {account_alias}: {e}") + return { + 'account_id': account_id, + 'account_alias': account_alias, + 'status': 'error', + 'error': str(e), + 'results': {} + } + + def analyze_all_accounts( + self, + analysis_func: Callable[[AWSConfig, str], Dict[str, Any]] + ) -> Dict[str, Dict[str, Any]]: + """ + Analyze resources across all configured accounts. + + Args: + analysis_func: Function to run analysis, takes (AWSConfig, account_id) and returns results + + Returns: + Dictionary mapping account IDs to their analysis results + """ + accounts = self.multi_account_config.accounts + if not accounts: + print_warning("No accounts configured for multi-account analysis") + return {} + + print_info(f"Starting multi-account analysis for {len(accounts)} accounts") + + if self.multi_account_config.parallel_accounts and len(accounts) > 1: + # Run accounts in parallel + return self._analyze_accounts_parallel(accounts, analysis_func) + else: + # Run accounts sequentially + return self._analyze_accounts_sequential(accounts, analysis_func) + + def _analyze_accounts_sequential( + self, + accounts: List[Dict[str, str]], + analysis_func: Callable[[AWSConfig, str], Dict[str, Any]] + ) -> Dict[str, Dict[str, Any]]: + """Analyze accounts sequentially.""" + results = {} + for account in accounts: + account_id = account['account_id'] + result = self.analyze_single_account(account, analysis_func) + results[account_id] = result + return results + + def _analyze_accounts_parallel( + self, + accounts: List[Dict[str, str]], + analysis_func: Callable[[AWSConfig, str], Dict[str, Any]] + ) -> Dict[str, Dict[str, Any]]: + 
"""Analyze accounts in parallel.""" + results = {} + max_workers = min( + self.multi_account_config.max_account_workers, + len(accounts) + ) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all account analyses + future_to_account = { + executor.submit(self.analyze_single_account, account, analysis_func): account + for account in accounts + } + + # Collect results as they complete + for future in as_completed(future_to_account): + account = future_to_account[future] + account_id = account['account_id'] + try: + result = future.result() + results[account_id] = result + except Exception as e: + log.error(f"Exception analyzing account {account_id}: {e}") + results[account_id] = { + 'account_id': account_id, + 'account_alias': account.get('alias', account_id), + 'status': 'error', + 'error': str(e), + 'results': {} + } + + return results + + def aggregate_results(self, account_results: Dict[str, Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: + """ + Aggregate results from multiple accounts into a single structure. 
+ + Args: + account_results: Dictionary of results per account + + Returns: + Aggregated results with account information added to each resource + """ + aggregated: Dict[str, List[Dict[str, Any]]] = {} + + for account_id, account_data in account_results.items(): + if account_data.get('status') != 'success': + continue + + account_alias = account_data.get('account_alias', account_id) + results = account_data.get('results', {}) + + for resource_type, resources in results.items(): + if not isinstance(resources, list): + continue + + if resource_type not in aggregated: + aggregated[resource_type] = [] + + # Add account info to each resource + for resource in resources: + if isinstance(resource, dict): + resource['account_id'] = account_id + resource['account_alias'] = account_alias + aggregated[resource_type].append(resource) + + return aggregated + + +class AWSConfigWithCredentials(AWSConfig): + """AWSConfig that uses explicit temporary credentials instead of profiles.""" + + def __init__( + self, + credentials: Dict[str, str], + account_id: str, + **kwargs + ): + """ + Initialize with explicit credentials. 
+ + Args: + credentials: Dict with AccessKeyId, SecretAccessKey, SessionToken + account_id: AWS account ID these credentials are for + **kwargs: Other AWSConfig arguments + """ + super().__init__(**kwargs) + self.credentials = credentials + self.account_id = account_id + self.account_alias = kwargs.get('account_alias', account_id) + + def create_session(self) -> boto3.Session: + """Creates a boto3 session using temporary credentials.""" + return boto3.Session( + aws_access_key_id=self.credentials['AccessKeyId'], + aws_secret_access_key=self.credentials['SecretAccessKey'], + aws_session_token=self.credentials['SessionToken'], + region_name=self.region + ) + + def create_client(self, service_name: str) -> Any: + """Creates a boto3 client using temporary credentials.""" + session = self.create_session() + return session.client( + service_name, + config=self.get_boto3_config() + ) + + +def get_organization_accounts(sts_client, org_client=None) -> List[Dict[str, str]]: + """ + Get all accounts in the AWS Organization. 
+ + Args: + sts_client: STS client for getting current account + org_client: Organizations client (optional, created if not provided) + + Returns: + List of account configurations + """ + if org_client is None: + org_client = boto3.client('organizations') + + accounts = [] + try: + # Get current account to exclude it + current_account = sts_client.get_caller_identity()['Account'] + + paginator = org_client.get_paginator('list_accounts') + for page in paginator.paginate(): + for account in page['Accounts']: + if account['Status'] == 'ACTIVE' and account['Id'] != current_account: + accounts.append({ + 'account_id': account['Id'], + 'alias': account.get('Name', account['Id']), + 'email': account.get('Email', ''), + 'role_name': 'OrganizationAccountAccessRole' + }) + + log.info(f"Found {len(accounts)} active accounts in organization") + return accounts + + except ClientError as e: + error_code = e.response.get('Error', {}).get('Code', 'Unknown') + log.error(f"Failed to list organization accounts: {error_code}") + if error_code == 'AWSOrganizationsNotInUseException': + print_warning("AWS Organizations is not enabled for this account") + elif error_code == 'AccessDeniedException': + print_warning("No permission to list organization accounts") + else: + print_error(f"Failed to list organization accounts: {error_code}") + return [] + except Exception as e: + log.error(f"Unexpected error listing organization accounts: {e}") + return [] diff --git a/core/reporter.py b/core/reporter.py index 92cd408..0205f41 100755 --- a/core/reporter.py +++ b/core/reporter.py @@ -147,4 +147,141 @@ def _group_by_region( if resource_type not in region_results[region]: region_results[region][resource_type] = [] region_results[region][resource_type].append(resource) - return region_results \ No newline at end of file + return region_results + + def _group_by_tags( + self, results: Dict[str, List[Dict[str, Any]]], grouping_tags: Optional[List[str]] = None + ) -> Dict[str, Dict[str, List[Dict[str, 
Any]]]]: + """ + Group results by tag values for tag-based organization in reports. + + Args: + results: Analysis results by resource type + grouping_tags: List of tag keys to check for grouping (uses config if not provided) + + Returns: + Dictionary mapping group names to their resources by type + """ + if grouping_tags is None: + grouping_tags = getattr(self.config, 'grouping_tags', [ + 'Team', 'Project', 'Application', 'Owner', 'CostCenter', + 'team', 'project', 'application', 'owner', 'cost-center' + ]) + + grouped_results: Dict[str, Dict[str, List[Dict[str, Any]]]] = {} + + for resource_type, resources in results.items(): + if not isinstance(resources, list): + continue + + for resource in resources: + if not isinstance(resource, dict): + continue + + # Extract tag value for grouping + group_name = self._get_tag_group(resource, grouping_tags) + + if group_name not in grouped_results: + grouped_results[group_name] = {} + if resource_type not in grouped_results[group_name]: + grouped_results[group_name][resource_type] = [] + + grouped_results[group_name][resource_type].append(resource) + + return grouped_results + + def _get_tag_group( + self, resource: Dict[str, Any], grouping_tags: List[str] + ) -> str: + """ + Get the group name for a resource based on its tags. 
+ + Args: + resource: Resource dictionary + grouping_tags: List of tag keys to check + + Returns: + Group name from tag value, or 'Untagged' if no matching tag found + """ + # First check if tags are directly in the resource + tags = resource.get('tags', []) + + # Convert to dict if it's a list of {Key, Value} dicts + if isinstance(tags, list): + tags_dict = { + tag.get('Key', ''): tag.get('Value', '') + for tag in tags if isinstance(tag, dict) + } + elif isinstance(tags, dict): + tags_dict = tags + else: + tags_dict = {} + + # Check each grouping tag in order of priority + for tag_key in grouping_tags: + if tag_key in tags_dict and tags_dict[tag_key]: + return tags_dict[tag_key] + + return 'Untagged' + + def _group_by_account( + self, results: Dict[str, List[Dict[str, Any]]] + ) -> Dict[str, Dict[str, List[Dict[str, Any]]]]: + """ + Group results by AWS account for multi-account reports. + + Args: + results: Analysis results by resource type + + Returns: + Dictionary mapping account identifiers to their resources by type + """ + account_results: Dict[str, Dict[str, List[Dict[str, Any]]]] = {} + + for resource_type, resources in results.items(): + if not isinstance(resources, list): + continue + + for resource in resources: + if not isinstance(resource, dict): + continue + + # Get account identifier (prefer alias over ID) + account_alias = resource.get('account_alias', '') + account_id = resource.get('account_id', 'unknown') + account_key = account_alias if account_alias else account_id + + if account_key not in account_results: + account_results[account_key] = {} + if resource_type not in account_results[account_key]: + account_results[account_key][resource_type] = [] + + account_results[account_key][resource_type].append(resource) + + return account_results + + def _calculate_group_savings( + self, grouped_results: Dict[str, Dict[str, List[Dict[str, Any]]]] + ) -> Dict[str, float]: + """ + Calculate total savings per group. 
+ + Args: + grouped_results: Results grouped by some criteria + + Returns: + Dictionary mapping group names to total savings + """ + group_savings = {} + + for group_name, resources_by_type in grouped_results.items(): + total_savings = 0.0 + for resource_type, resources in resources_by_type.items(): + for resource in resources: + if isinstance(resource, dict): + savings = resource.get('estimated_monthly_savings', 0) or 0 + potential = resource.get('potential_monthly_savings', 0) or 0 + total_savings += max(savings, potential) + group_savings[group_name] = round(total_savings, 2) + + return group_savings \ No newline at end of file diff --git a/docs/kubernetes_permissions.md b/docs/kubernetes_permissions.md new file mode 100644 index 0000000..2c6443b --- /dev/null +++ b/docs/kubernetes_permissions.md @@ -0,0 +1,245 @@ +# Kubernetes/EKS Permissions Guide + +This document outlines the required IAM and Kubernetes RBAC permissions for Dedo-Duro to analyze Amazon EKS clusters. + +## Overview + +Dedo-Duro's EKS analyzers require both AWS IAM permissions and Kubernetes RBAC permissions to analyze: +- EKS cluster configurations +- Active kubectl sessions +- Deployment lifecycle and health +- Pod restart patterns +- Resource utilization + +## AWS IAM Permissions + +### Minimum Required IAM Policy + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "EKSReadAccess", + "Effect": "Allow", + "Action": [ + "eks:DescribeCluster", + "eks:ListClusters", + "eks:ListNodegroups", + "eks:DescribeNodegroup", + "eks:ListFargateProfiles", + "eks:DescribeFargateProfile", + "eks:ListAddons", + "eks:DescribeAddon", + "eks:ListIdentityProviderConfigs", + "eks:DescribeIdentityProviderConfig" + ], + "Resource": "*" + }, + { + "Sid": "EKSSessionAccess", + "Effect": "Allow", + "Action": [ + "ssm:DescribeSessions", + "ssm:GetConnectionStatus" + ], + "Resource": "*" + }, + { + "Sid": "CloudWatchMetrics", + "Effect": "Allow", + "Action": [ + "cloudwatch:GetMetricStatistics", + 
"cloudwatch:GetMetricData", + "cloudwatch:ListMetrics" + ], + "Resource": "*" + }, + { + "Sid": "EC2ForNodegroups", + "Effect": "Allow", + "Action": [ + "ec2:DescribeInstances", + "autoscaling:DescribeAutoScalingGroups" + ], + "Resource": "*" + } + ] +} +``` + +### Session Monitoring Additional Permissions + +For monitoring active kubectl sessions via SSM: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "SSMSessionMonitoring", + "Effect": "Allow", + "Action": [ + "ssm:DescribeSessions", + "ssm:GetConnectionStatus", + "ssm:DescribeInstanceInformation" + ], + "Resource": "*" + } + ] +} +``` + +## Kubernetes RBAC Permissions + +### ClusterRole for Dedo-Duro + +Create a ClusterRole with read-only access to monitor deployments and pods: + +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: dedo-duro-reader +rules: + # Deployment and ReplicaSet access + - apiGroups: ["apps"] + resources: ["deployments", "replicasets", "daemonsets", "statefulsets"] + verbs: ["get", "list", "watch"] + + # Pod access for restart analysis + - apiGroups: [""] + resources: ["pods", "pods/log"] + verbs: ["get", "list", "watch"] + + # Events for deployment history + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "list", "watch"] + + # Namespaces for scope + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list"] + + # Resource quotas and limits + - apiGroups: [""] + resources: ["resourcequotas", "limitranges"] + verbs: ["get", "list"] + + # Nodes for capacity analysis + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list"] + + # Metrics for utilization + - apiGroups: ["metrics.k8s.io"] + resources: ["pods", "nodes"] + verbs: ["get", "list"] +``` + +### ClusterRoleBinding + +Bind the ClusterRole to a ServiceAccount: + +```yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: dedo-duro-sa + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding 
+metadata: + name: dedo-duro-reader-binding +subjects: + - kind: ServiceAccount + name: dedo-duro-sa + namespace: monitoring +roleRef: + kind: ClusterRole + name: dedo-duro-reader + apiGroup: rbac.authorization.k8s.io +``` + +## Setting Up Access + +### Option 1: IAM Roles for Service Accounts (IRSA) - Recommended + +1. Create an IAM role with the required permissions +2. Associate the role with the Kubernetes ServiceAccount: + +```bash +eksctl create iamserviceaccount \ + --name dedo-duro-sa \ + --namespace monitoring \ + --cluster your-cluster-name \ + --attach-policy-arn arn:aws:iam::YOUR_ACCOUNT:policy/DedoDuroEKSPolicy \ + --approve +``` + +### Option 2: aws-auth ConfigMap + +Add the IAM user/role to the aws-auth ConfigMap: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: aws-auth + namespace: kube-system +data: + mapRoles: | + - rolearn: arn:aws:iam::YOUR_ACCOUNT:role/DedoDuroRole + username: dedo-duro + groups: + - dedo-duro-reader +``` + +## Verification + +### Verify IAM Permissions + +```bash +# Check EKS cluster access +aws eks describe-cluster --name your-cluster-name + +# Check SSM session access +aws ssm describe-sessions --state Active +``` + +### Verify Kubernetes RBAC + +```bash +# Test read access to deployments +kubectl auth can-i list deployments --as=system:serviceaccount:monitoring:dedo-duro-sa + +# Test read access to pods +kubectl auth can-i list pods --as=system:serviceaccount:monitoring:dedo-duro-sa + +# Test access to metrics +kubectl auth can-i list pods.metrics.k8s.io --as=system:serviceaccount:monitoring:dedo-duro-sa +``` + +## Troubleshooting + +### Common Issues + +1. **"Unauthorized" errors**: Check that the IAM role is correctly mapped in aws-auth ConfigMap +2. **"Forbidden" errors**: Verify the ClusterRoleBinding is correctly configured +3. 
**Missing metrics**: Ensure the Kubernetes Metrics Server is installed in the cluster + +### Enabling Metrics Server + +```bash +kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml +``` + +## Security Considerations + +- Always use read-only permissions for analysis +- Use IRSA for credential management when possible +- Implement least-privilege access by restricting to specific namespaces if needed +- Rotate credentials regularly for non-IRSA configurations +- Enable CloudTrail logging for audit purposes diff --git a/main.py b/main.py index 32771e0..e66b705 100755 --- a/main.py +++ b/main.py @@ -12,7 +12,7 @@ import traceback # Import the traceback module from typing import Dict, List, Any, Optional -from config import AWSConfig, AnalysisConfig, ReportConfig +from config import AWSConfig, AnalysisConfig, ReportConfig, MultiAccountConfig from core.analyzer import AWSResourceManager from core.reporter import ReportCoordinator # Import specific analyzers needed for registration mapping @@ -303,6 +303,26 @@ def main(args: Optional[argparse.Namespace] = None) -> None: 'class': 'analyzers.ebs_snapshot.EBSSnapshotAnalyzer', 'time': "Medium (20-40s)", 'desc': "Identifies EBS snapshots older than a year and estimates their cost." 
+ }, + 'eks_sessions': { + 'class': 'analyzers.eks_session_analyzer.EKSSessionAnalyzer', + 'time': "Medium (30-60s)", + 'desc': "Monitors active EKS/kubectl sessions and detects unusual access patterns" + }, + 'eks_deployments': { + 'class': 'analyzers.eks_deployment_lifecycle.EKSDeploymentLifecycleAnalyzer', + 'time': "Medium-Long (60-120s)", + 'desc': "Analyzes EKS deployment health, rollout status, and pod restart patterns" + }, + 'cost_explorer': { + 'class': 'analyzers.cost_explorer_analyzer.CostExplorerAnalyzer', + 'time': "Medium (30-60s)", + 'desc': "Analyzes AWS Cost Explorer data for spending patterns and anomalies" + }, + 'rto_analysis': { + 'class': 'analyzers.rto_analyzer.RTOAnalyzer', + 'time': "Medium-Long (60-120s)", + 'desc': "Analyzes backup configurations and disaster recovery readiness" } } @@ -378,6 +398,33 @@ def main(args: Optional[argparse.Namespace] = None) -> None: '--schedule-timezone', default='UTC', help='Timezone for Terraform schedule (default: UTC)') + # Environment filtering arguments + parser.add_argument( + '--environment', '-e', + choices=['prod', 'production', 'test', 'testing', 'dev', 'development', 'staging', 'qa'], + help='Filter resources by environment tag (e.g., prod, test, dev, staging)') + parser.add_argument( + '--environment-tags', + help='Comma-separated list of tag keys to check for environment (default: Environment,Env,Stage)') + + # Multi-account arguments + parser.add_argument( + '--accounts-file', + help='Path to JSON/YAML file containing account configurations for multi-account analysis') + parser.add_argument( + '--all-accounts', action='store_true', + help='Run analysis across all accounts defined in the accounts file') + parser.add_argument( + '--account-role', + help='IAM role name to assume in target accounts (default: OrganizationAccountAccessRole)') + parser.add_argument( + '--external-id', + help='External ID for assuming roles in other accounts (if required)') + + # Tag-based grouping arguments + 
parser.add_argument( + '--group-by-tags', + help='Comma-separated list of tag keys to group resources in reports (e.g., Team,Project,Application)') if args is None: args = parser.parse_args() # Parse command line args if none provided @@ -406,20 +453,57 @@ def main(args: Optional[argparse.Namespace] = None) -> None: profile=args.profile, max_attempts=args.retry_attempts ) + # Parse environment tags if provided + env_tags = None + if hasattr(args, 'environment_tags') and args.environment_tags: + env_tags = [t.strip() for t in args.environment_tags.split(',')] + analysis_config = AnalysisConfig( verbose=args.verbose, single_thread=args.single_thread, multi_region=args.multi_region, resource_types=args.resource_types, - max_workers=args.max_workers + max_workers=args.max_workers, + environment_filter=getattr(args, 'environment', None), + environment_tags=env_tags if env_tags else ['Environment', 'Env', 'Stage', 'environment', 'env', 'stage'] ) + + # Parse grouping tags if provided + grouping_tags = None + if hasattr(args, 'group_by_tags') and args.group_by_tags: + grouping_tags = [t.strip() for t in args.group_by_tags.split(',')] + report_config = ReportConfig( output_format=args.output_format, output_file=args.output_file, output_s3_bucket=args.output_s3_bucket, - output_s3_prefix=args.output_s3_prefix + output_s3_prefix=args.output_s3_prefix, + grouping_tags=grouping_tags if grouping_tags else [ + 'Team', 'Project', 'Application', 'Owner', 'CostCenter', + 'team', 'project', 'application', 'owner', 'cost-center' + ] ) + # Configure multi-account if requested + multi_account_config = None + if hasattr(args, 'accounts_file') and args.accounts_file: + multi_account_config = MultiAccountConfig( + enabled=getattr(args, 'all_accounts', False), + accounts_file=args.accounts_file, + external_id=getattr(args, 'external_id', None) + ) + if multi_account_config.enabled: + try: + multi_account_config.load_accounts_from_file() + print_info(f"Loaded 
{len(multi_account_config.accounts)} accounts from {args.accounts_file}") + except (FileNotFoundError, ValueError) as e: + print_error(f"Failed to load accounts file: {e}") + sys.exit(1) + + # Log environment filter if set + if analysis_config.environment_filter: + print_info(f"Environment filter enabled: analyzing only '{analysis_config.environment_filter}' resources") + # Create resource manager manager = AWSResourceManager(aws_config, analysis_config, args) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c7b23ec --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +pythonpath = . +testpaths = tests diff --git a/remediation/base.py b/remediation/base.py new file mode 100644 index 0000000..4412c72 --- /dev/null +++ b/remediation/base.py @@ -0,0 +1,350 @@ +""" +Base Remediation Module. + +Provides the foundation for auto-remediation capabilities with safety controls. +All remediation actions require explicit approval by default. +""" + +import logging +from abc import ABC, abstractmethod +from datetime import datetime +from typing import Dict, List, Any, Optional +from dataclasses import dataclass, field +from enum import Enum + +from config import AWSConfig + +log = logging.getLogger(__name__) + + +class RemediationRisk(Enum): + """Risk level for remediation actions.""" + LOW = "low" # Safe operations: tagging, snapshots + MEDIUM = "medium" # Moderate risk: stop instances, modify configs + HIGH = "high" # High risk: terminate, delete + CRITICAL = "critical" # Destructive: delete with no recovery + + +class RemediationStatus(Enum): + """Status of a remediation action.""" + PENDING = "pending" + APPROVED = "approved" + IN_PROGRESS = "in_progress" + COMPLETED = "completed" + FAILED = "failed" + CANCELLED = "cancelled" + DRY_RUN = "dry_run" + + +@dataclass +class RemediationAction: + """Represents a single remediation action.""" + action_id: str + resource_type: str + resource_id: str + action_type: str + description: str + risk_level: 
RemediationRisk + status: RemediationStatus = RemediationStatus.PENDING + dry_run: bool = True + created_at: datetime = field(default_factory=datetime.utcnow) + approved_at: Optional[datetime] = None + completed_at: Optional[datetime] = None + approved_by: Optional[str] = None + result: Optional[Dict[str, Any]] = None + error: Optional[str] = None + rollback_info: Optional[Dict[str, Any]] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary representation.""" + return { + 'action_id': self.action_id, + 'resource_type': self.resource_type, + 'resource_id': self.resource_id, + 'action_type': self.action_type, + 'description': self.description, + 'risk_level': self.risk_level.value, + 'status': self.status.value, + 'dry_run': self.dry_run, + 'created_at': self.created_at.isoformat() if self.created_at else None, + 'approved_at': self.approved_at.isoformat() if self.approved_at else None, + 'completed_at': self.completed_at.isoformat() if self.completed_at else None, + 'approved_by': self.approved_by, + 'result': self.result, + 'error': self.error, + 'rollback_info': self.rollback_info + } + + +class BaseRemediation(ABC): + """ + Base class for resource remediation. + + Provides common functionality for all remediation types with + safety controls and audit logging. + """ + + # Maximum risk level allowed without explicit approval + DEFAULT_MAX_AUTO_APPROVE_RISK = RemediationRisk.LOW + + def __init__( + self, + aws_config: AWSConfig, + dry_run: bool = True, + max_auto_approve_risk: RemediationRisk = None + ): + """ + Initialize the remediation handler. 
+ + Args: + aws_config: AWS configuration + dry_run: If True, actions are simulated but not executed (default: True) + max_auto_approve_risk: Maximum risk level for auto-approval + """ + self.aws_config = aws_config + self.dry_run = dry_run + self.max_auto_approve_risk = max_auto_approve_risk or self.DEFAULT_MAX_AUTO_APPROVE_RISK + self.actions: List[RemediationAction] = [] + self.audit_log: List[Dict[str, Any]] = [] + + @abstractmethod + def get_resource_type(self) -> str: + """Return the AWS resource type this handler remediates.""" + pass + + @abstractmethod + def get_available_actions(self) -> List[Dict[str, Any]]: + """Return list of available remediation actions.""" + pass + + def _log_audit( + self, + action: str, + resource_id: str, + details: Dict[str, Any], + success: bool = True + ) -> None: + """Log an audit entry for tracking.""" + entry = { + 'timestamp': datetime.utcnow().isoformat(), + 'action': action, + 'resource_type': self.get_resource_type(), + 'resource_id': resource_id, + 'details': details, + 'success': success, + 'dry_run': self.dry_run, + 'region': self.aws_config.region + } + self.audit_log.append(entry) + log.info(f"Remediation audit: {action} on {resource_id} - {'DRY_RUN' if self.dry_run else 'EXECUTED'}") + + def create_action( + self, + resource_id: str, + action_type: str, + description: str, + risk_level: RemediationRisk, + rollback_info: Optional[Dict[str, Any]] = None + ) -> RemediationAction: + """ + Create a new remediation action. 
+ + Args: + resource_id: ID of the resource to remediate + action_type: Type of remediation action + description: Human-readable description + risk_level: Risk level of this action + rollback_info: Information needed to rollback this action + + Returns: + The created RemediationAction + """ + from uuid import uuid4 + + action = RemediationAction( + action_id=str(uuid4())[:8], + resource_type=self.get_resource_type(), + resource_id=resource_id, + action_type=action_type, + description=description, + risk_level=risk_level, + dry_run=self.dry_run, + rollback_info=rollback_info + ) + + self.actions.append(action) + self._log_audit('action_created', resource_id, { + 'action_id': action.action_id, + 'action_type': action_type, + 'risk_level': risk_level.value + }) + + return action + + def can_auto_approve(self, action: RemediationAction) -> bool: + """Check if an action can be auto-approved based on risk level.""" + risk_order = [RemediationRisk.LOW, RemediationRisk.MEDIUM, RemediationRisk.HIGH, RemediationRisk.CRITICAL] + action_risk_index = risk_order.index(action.risk_level) + max_risk_index = risk_order.index(self.max_auto_approve_risk) + return action_risk_index <= max_risk_index + + def approve_action( + self, + action_id: str, + approved_by: str = "system" + ) -> bool: + """ + Approve a remediation action for execution. 
+ + Args: + action_id: ID of the action to approve + approved_by: Identifier of the approver + + Returns: + True if approved, False if not found or already processed + """ + for action in self.actions: + if action.action_id == action_id: + if action.status != RemediationStatus.PENDING: + log.warning(f"Action {action_id} is not pending, cannot approve") + return False + + action.status = RemediationStatus.APPROVED + action.approved_at = datetime.utcnow() + action.approved_by = approved_by + + self._log_audit('action_approved', action.resource_id, { + 'action_id': action_id, + 'approved_by': approved_by + }) + return True + + log.warning(f"Action {action_id} not found") + return False + + def execute_action(self, action_id: str) -> Dict[str, Any]: + """ + Execute a remediation action. + + Args: + action_id: ID of the action to execute + + Returns: + Result dictionary with success status and details + """ + for action in self.actions: + if action.action_id == action_id: + if action.status not in [RemediationStatus.APPROVED, RemediationStatus.PENDING]: + return { + 'success': False, + 'error': f'Action is not approved or pending: {action.status.value}' + } + + # Check auto-approval for pending actions + if action.status == RemediationStatus.PENDING: + if not self.can_auto_approve(action): + return { + 'success': False, + 'error': f'Action requires approval (risk: {action.risk_level.value})' + } + self.approve_action(action_id, 'auto-approved') + + action.status = RemediationStatus.IN_PROGRESS + + try: + if self.dry_run: + result = self._simulate_action(action) + action.status = RemediationStatus.DRY_RUN + else: + result = self._execute_action_impl(action) + action.status = RemediationStatus.COMPLETED + + action.completed_at = datetime.utcnow() + action.result = result + + self._log_audit('action_executed', action.resource_id, { + 'action_id': action_id, + 'dry_run': self.dry_run, + 'result': result + }) + + return {'success': True, 'result': result, 'dry_run': 
self.dry_run} + + except Exception as e: + action.status = RemediationStatus.FAILED + action.error = str(e) + + self._log_audit('action_failed', action.resource_id, { + 'action_id': action_id, + 'error': str(e) + }, success=False) + + log.exception(f"Remediation action {action_id} failed") + return {'success': False, 'error': str(e)} + + return {'success': False, 'error': f'Action {action_id} not found'} + + def _simulate_action(self, action: RemediationAction) -> Dict[str, Any]: + """ + Simulate an action without making changes. + + Args: + action: The action to simulate + + Returns: + Simulation result + """ + return { + 'simulated': True, + 'action_type': action.action_type, + 'resource_id': action.resource_id, + 'message': f'Would execute {action.action_type} on {action.resource_id}' + } + + @abstractmethod + def _execute_action_impl(self, action: RemediationAction) -> Dict[str, Any]: + """ + Execute the actual remediation action. + + Must be implemented by subclasses. + + Args: + action: The action to execute + + Returns: + Execution result + """ + pass + + def get_pending_actions(self) -> List[RemediationAction]: + """Get all pending actions.""" + return [a for a in self.actions if a.status == RemediationStatus.PENDING] + + def get_audit_log(self) -> List[Dict[str, Any]]: + """Get the audit log.""" + return self.audit_log.copy() + + def cancel_action(self, action_id: str, reason: str = "") -> bool: + """ + Cancel a pending or approved action. 
+ + Args: + action_id: ID of the action to cancel + reason: Optional reason for cancellation + + Returns: + True if cancelled, False otherwise + """ + for action in self.actions: + if action.action_id == action_id: + if action.status in [RemediationStatus.PENDING, RemediationStatus.APPROVED]: + action.status = RemediationStatus.CANCELLED + action.error = reason if reason else "Cancelled by user" + + self._log_audit('action_cancelled', action.resource_id, { + 'action_id': action_id, + 'reason': reason + }) + return True + return False + return False diff --git a/remediation/ec2_remediation.py b/remediation/ec2_remediation.py new file mode 100644 index 0000000..6d4057f --- /dev/null +++ b/remediation/ec2_remediation.py @@ -0,0 +1,434 @@ +""" +EC2 Remediation Module. + +Provides safe remediation actions for EC2 instances including: +- Adding/updating tags +- Creating snapshots before changes +- Stopping idle instances +- Rightsizing recommendations application +""" + +import logging +from datetime import datetime +from typing import Dict, List, Any, Optional + +from config import AWSConfig +from remediation.base import ( + BaseRemediation, + RemediationAction, + RemediationRisk, + RemediationStatus +) + +log = logging.getLogger(__name__) + + +class EC2Remediation(BaseRemediation): + """ + Remediation handler for EC2 instances. 
+ + Supports: + - Tag management (LOW risk) + - Snapshot creation (LOW risk) + - Instance stop (MEDIUM risk) + - Instance type modification (MEDIUM risk) + - Instance termination (HIGH risk - requires explicit approval) + """ + + def __init__( + self, + aws_config: AWSConfig, + dry_run: bool = True, + max_auto_approve_risk: RemediationRisk = RemediationRisk.LOW + ): + super().__init__(aws_config, dry_run, max_auto_approve_risk) + self.ec2_client = aws_config.create_client('ec2') + + def get_resource_type(self) -> str: + return 'ec2_instance' + + def get_available_actions(self) -> List[Dict[str, Any]]: + """Return list of available EC2 remediation actions.""" + return [ + { + 'action_type': 'add_tags', + 'description': 'Add or update tags on an EC2 instance', + 'risk_level': RemediationRisk.LOW.value, + 'parameters': ['instance_id', 'tags'] + }, + { + 'action_type': 'create_snapshot', + 'description': 'Create EBS snapshots for all volumes attached to instance', + 'risk_level': RemediationRisk.LOW.value, + 'parameters': ['instance_id', 'snapshot_description'] + }, + { + 'action_type': 'stop_instance', + 'description': 'Stop a running EC2 instance', + 'risk_level': RemediationRisk.MEDIUM.value, + 'parameters': ['instance_id'] + }, + { + 'action_type': 'modify_instance_type', + 'description': 'Change instance type (requires stop/start)', + 'risk_level': RemediationRisk.MEDIUM.value, + 'parameters': ['instance_id', 'new_instance_type'] + }, + { + 'action_type': 'terminate_instance', + 'description': 'Terminate an EC2 instance (DESTRUCTIVE)', + 'risk_level': RemediationRisk.HIGH.value, + 'parameters': ['instance_id'] + } + ] + + def add_tags( + self, + instance_id: str, + tags: Dict[str, str], + auto_execute: bool = False + ) -> RemediationAction: + """ + Add or update tags on an EC2 instance. 
+ + Args: + instance_id: EC2 instance ID + tags: Dictionary of tag key-value pairs + auto_execute: If True, execute immediately if auto-approval allowed + + Returns: + RemediationAction tracking this operation + """ + action = self.create_action( + resource_id=instance_id, + action_type='add_tags', + description=f"Add {len(tags)} tag(s) to instance {instance_id}", + risk_level=RemediationRisk.LOW, + rollback_info={'instance_id': instance_id, 'tags_to_add': tags} + ) + + # Store parameters for execution + action.result = {'parameters': {'instance_id': instance_id, 'tags': tags}} + + if auto_execute and self.can_auto_approve(action): + self.execute_action(action.action_id) + + return action + + def create_snapshot( + self, + instance_id: str, + description: str = None, + auto_execute: bool = False + ) -> RemediationAction: + """ + Create EBS snapshots for all volumes attached to an instance. + + Args: + instance_id: EC2 instance ID + description: Snapshot description + auto_execute: If True, execute immediately if auto-approval allowed + + Returns: + RemediationAction tracking this operation + """ + if not description: + description = f"Dedo-Duro backup before remediation - {datetime.utcnow().isoformat()}" + + action = self.create_action( + resource_id=instance_id, + action_type='create_snapshot', + description=f"Create snapshots for volumes on {instance_id}", + risk_level=RemediationRisk.LOW, + rollback_info={'instance_id': instance_id, 'description': description} + ) + + action.result = {'parameters': {'instance_id': instance_id, 'description': description}} + + if auto_execute and self.can_auto_approve(action): + self.execute_action(action.action_id) + + return action + + def stop_instance( + self, + instance_id: str, + create_snapshot_first: bool = True, + auto_execute: bool = False + ) -> RemediationAction: + """ + Stop a running EC2 instance. 
+ + Args: + instance_id: EC2 instance ID + create_snapshot_first: If True, create snapshots before stopping + auto_execute: If True, execute immediately if auto-approval allowed + + Returns: + RemediationAction tracking this operation + """ + action = self.create_action( + resource_id=instance_id, + action_type='stop_instance', + description=f"Stop instance {instance_id}", + risk_level=RemediationRisk.MEDIUM, + rollback_info={ + 'instance_id': instance_id, + 'create_snapshot_first': create_snapshot_first, + 'can_restart': True + } + ) + + action.result = {'parameters': { + 'instance_id': instance_id, + 'create_snapshot_first': create_snapshot_first + }} + + if auto_execute and self.can_auto_approve(action): + self.execute_action(action.action_id) + + return action + + def modify_instance_type( + self, + instance_id: str, + new_instance_type: str, + create_snapshot_first: bool = True, + auto_execute: bool = False + ) -> RemediationAction: + """ + Modify the instance type (rightsizing). + + This operation requires the instance to be stopped. 
+ + Args: + instance_id: EC2 instance ID + new_instance_type: New instance type (e.g., 't3.medium') + create_snapshot_first: If True, create snapshots before modifying + auto_execute: If True, execute immediately if auto-approval allowed + + Returns: + RemediationAction tracking this operation + """ + # Get current instance type for rollback + current_type = None + try: + response = self.ec2_client.describe_instances(InstanceIds=[instance_id]) + for reservation in response.get('Reservations', []): + for instance in reservation.get('Instances', []): + current_type = instance.get('InstanceType') + except Exception as e: + log.warning(f"Could not get current instance type for {instance_id}: {e}") + + action = self.create_action( + resource_id=instance_id, + action_type='modify_instance_type', + description=f"Change {instance_id} from {current_type} to {new_instance_type}", + risk_level=RemediationRisk.MEDIUM, + rollback_info={ + 'instance_id': instance_id, + 'original_instance_type': current_type, + 'new_instance_type': new_instance_type + } + ) + + action.result = {'parameters': { + 'instance_id': instance_id, + 'new_instance_type': new_instance_type, + 'create_snapshot_first': create_snapshot_first, + 'original_type': current_type + }} + + if auto_execute and self.can_auto_approve(action): + self.execute_action(action.action_id) + + return action + + def terminate_instance( + self, + instance_id: str, + create_snapshot_first: bool = True + ) -> RemediationAction: + """ + Terminate an EC2 instance. + + WARNING: This is a destructive operation and requires explicit approval. + Auto-execute is not supported for this action. 
+ + Args: + instance_id: EC2 instance ID + create_snapshot_first: If True, create snapshots before terminating + + Returns: + RemediationAction tracking this operation + """ + action = self.create_action( + resource_id=instance_id, + action_type='terminate_instance', + description=f"TERMINATE instance {instance_id} (DESTRUCTIVE)", + risk_level=RemediationRisk.HIGH, + rollback_info={ + 'instance_id': instance_id, + 'create_snapshot_first': create_snapshot_first, + 'warning': 'Termination cannot be undone. AMI/snapshot required for recovery.' + } + ) + + action.result = {'parameters': { + 'instance_id': instance_id, + 'create_snapshot_first': create_snapshot_first + }} + + # Never auto-execute termination + return action + + def _execute_action_impl(self, action: RemediationAction) -> Dict[str, Any]: + """Execute the actual EC2 remediation action.""" + params = action.result.get('parameters', {}) + instance_id = params.get('instance_id') + + if action.action_type == 'add_tags': + return self._execute_add_tags(instance_id, params.get('tags', {})) + + elif action.action_type == 'create_snapshot': + return self._execute_create_snapshot(instance_id, params.get('description')) + + elif action.action_type == 'stop_instance': + if params.get('create_snapshot_first', True): + self._execute_create_snapshot(instance_id, "Pre-stop backup by Dedo-Duro") + return self._execute_stop_instance(instance_id) + + elif action.action_type == 'modify_instance_type': + if params.get('create_snapshot_first', True): + self._execute_create_snapshot(instance_id, "Pre-resize backup by Dedo-Duro") + return self._execute_modify_instance_type( + instance_id, + params.get('new_instance_type') + ) + + elif action.action_type == 'terminate_instance': + if params.get('create_snapshot_first', True): + self._execute_create_snapshot(instance_id, "Pre-termination backup by Dedo-Duro") + return self._execute_terminate_instance(instance_id) + + else: + raise ValueError(f"Unknown action type: 
{action.action_type}") + + def _execute_add_tags( + self, + instance_id: str, + tags: Dict[str, str] + ) -> Dict[str, Any]: + """Execute tag addition.""" + tag_list = [{'Key': k, 'Value': v} for k, v in tags.items()] + + self.ec2_client.create_tags( + Resources=[instance_id], + Tags=tag_list + ) + + return { + 'success': True, + 'instance_id': instance_id, + 'tags_added': len(tags), + 'tags': tags + } + + def _execute_create_snapshot( + self, + instance_id: str, + description: str + ) -> Dict[str, Any]: + """Execute snapshot creation for all volumes.""" + # Get volumes attached to instance + response = self.ec2_client.describe_instances(InstanceIds=[instance_id]) + + volumes = [] + for reservation in response.get('Reservations', []): + for instance in reservation.get('Instances', []): + for mapping in instance.get('BlockDeviceMappings', []): + ebs = mapping.get('Ebs', {}) + if 'VolumeId' in ebs: + volumes.append(ebs['VolumeId']) + + snapshots = [] + for volume_id in volumes: + snapshot = self.ec2_client.create_snapshot( + VolumeId=volume_id, + Description=f"{description} - Volume {volume_id}", + TagSpecifications=[{ + 'ResourceType': 'snapshot', + 'Tags': [ + {'Key': 'CreatedBy', 'Value': 'Dedo-Duro'}, + {'Key': 'SourceInstance', 'Value': instance_id}, + {'Key': 'SourceVolume', 'Value': volume_id} + ] + }] + ) + snapshots.append(snapshot['SnapshotId']) + + return { + 'success': True, + 'instance_id': instance_id, + 'volumes': volumes, + 'snapshots': snapshots + } + + def _execute_stop_instance(self, instance_id: str) -> Dict[str, Any]: + """Execute instance stop.""" + self.ec2_client.stop_instances(InstanceIds=[instance_id]) + + return { + 'success': True, + 'instance_id': instance_id, + 'action': 'stopped', + 'message': f'Instance {instance_id} is stopping' + } + + def _execute_modify_instance_type( + self, + instance_id: str, + new_instance_type: str + ) -> Dict[str, Any]: + """Execute instance type modification.""" + # First, ensure instance is stopped + 
response = self.ec2_client.describe_instances(InstanceIds=[instance_id]) + current_state = None + for reservation in response.get('Reservations', []): + for instance in reservation.get('Instances', []): + current_state = instance.get('State', {}).get('Name') + + if current_state == 'running': + self.ec2_client.stop_instances(InstanceIds=[instance_id]) + # Wait for instance to stop + waiter = self.ec2_client.get_waiter('instance_stopped') + waiter.wait(InstanceIds=[instance_id]) + + # Modify instance type + self.ec2_client.modify_instance_attribute( + InstanceId=instance_id, + InstanceType={'Value': new_instance_type} + ) + + # Restart instance if it was running + if current_state == 'running': + self.ec2_client.start_instances(InstanceIds=[instance_id]) + + return { + 'success': True, + 'instance_id': instance_id, + 'new_instance_type': new_instance_type, + 'was_running': current_state == 'running', + 'restarted': current_state == 'running' + } + + def _execute_terminate_instance(self, instance_id: str) -> Dict[str, Any]: + """Execute instance termination.""" + self.ec2_client.terminate_instances(InstanceIds=[instance_id]) + + return { + 'success': True, + 'instance_id': instance_id, + 'action': 'terminated', + 'warning': 'Instance termination is irreversible' + } diff --git a/requirements.txt b/requirements.txt index b0e0c94..54bc599 100755 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,18 @@ pandas>=1.5.0 matplotlib>=3.6.0 polars>=0.20.0 # Added for CUR analysis pyarrow>=14.0.0 # Added for Parquet support with Polars -PyYAML>=6.0 # Added for Helm chart generation (YAML parsing/dumping) \ No newline at end of file +PyYAML>=6.0 # Added for Helm chart generation (YAML parsing/dumping) +python-dateutil>=2.8.0 # Added for timezone handling + +# Web interface dependencies +fastapi>=0.100.0 +uvicorn>=0.22.0 +pydantic>=2.0.0 + +# Development and testing +mypy>=1.0.0 +pytest>=7.0.0 +pytest-asyncio>=0.21.0 +types-PyYAML>=6.0 +types-python-dateutil>=2.8.0 
+boto3-stubs[essential]>=1.26.0 \ No newline at end of file diff --git a/simulate_complete_report.py b/simulate_complete_report.py new file mode 100644 index 0000000..8ad511c --- /dev/null +++ b/simulate_complete_report.py @@ -0,0 +1,727 @@ +#!/usr/bin/env python3 +""" +Complete simulation script for Dedo-Duro v12.0+ with all new features. + +Simulates: +- All original analyzers (EC2, RDS, S3, EBS, Lambda, etc.) +- Cost Explorer with anomaly detection +- RTO/RPO Analysis +- EKS Session Monitoring +- EKS Deployment Lifecycle +- Multi-account aggregation +- Environment filtering (prod/dev/staging) +- Tag-based grouping +- Remediation recommendations +""" + +import sys +import os +import json +import random +from datetime import datetime, timedelta, timezone +from dataclasses import dataclass +from typing import Dict, List, Any + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from reporters.html_reporter import HTMLReporter + + +@dataclass +class MockReportConfig: + """Mock configuration for report generation.""" + output_format: str = 'html' + output_file: str = 'complete_simulated_report.html' + include_charts: bool = True + output_s3_bucket: str = None + output_s3_prefix: str = '' + grouping_tags: List[str] = None + + def __post_init__(self): + if self.grouping_tags is None: + self.grouping_tags = ['Team', 'Project', 'Environment', 'CostCenter'] + + def get_default_filename(self) -> str: + return self.output_file + + +def generate_tags(team: str = None, project: str = None, env: str = None) -> List[Dict]: + """Generate realistic AWS tags.""" + teams = ['Platform', 'Data', 'Backend', 'Frontend', 'DevOps', 'ML'] + projects = ['Atlas', 'Phoenix', 'Titan', 'Mercury', 'Neptune'] + environments = ['production', 'staging', 'development', 'test'] + + tags = [] + if team or random.random() > 0.3: + tags.append({'Key': 'Team', 'Value': team or random.choice(teams)}) + if project or random.random() > 0.4: + tags.append({'Key': 
'Project', 'Value': project or random.choice(projects)}) + if env or random.random() > 0.2: + tags.append({'Key': 'Environment', 'Value': env or random.choice(environments)}) + if random.random() > 0.5: + tags.append({'Key': 'CostCenter', 'Value': f'CC-{random.randint(1000, 9999)}'}) + return tags + + +def generate_ec2_data() -> List[Dict]: + """Generate EC2 instance data with HTMLReporter-compatible field names.""" + instances = [] + instance_types = ['t3.micro', 't3.small', 't3.medium', 'm5.large', 'm5.xlarge', + 'm5.2xlarge', 'c5.large', 'c5.xlarge', 'r5.large', 'r5.xlarge'] + recommended_types = ['t3.micro', 't3.small', 't4g.micro', 't4g.small', 'm6g.medium', 'm6g.large'] + statuses = ['Oversized', 'Idle', 'Active', 'Graviton Eligible', 'Spot Eligible'] + owners = ['platform-team', 'backend-team', 'data-team', 'ml-team', 'devops-team'] + operating_systems = ['Amazon Linux 2', 'Amazon Linux 2023', 'Ubuntu 22.04', 'Windows Server 2022', 'RHEL 8'] + + for i in range(15): + cpu_avg = random.uniform(2, 85) + status = 'Active' if cpu_avg > 50 else ('Idle' if cpu_avg < 10 else random.choice(statuses[:3])) + cost = random.uniform(20, 800) + savings = 0 if status == 'Active' else cost * random.uniform(0.2, 0.8) + current_type = random.choice(instance_types) + + instances.append({ + 'resource_type': 'EC2 Instance', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'InstanceId': f'i-{random.randint(10000000, 99999999):08x}', + 'resource_id': f'i-{random.randint(10000000, 99999999):08x}', # Keep for compatibility + 'name': f'{random.choice(["prod", "staging", "dev"])}-{random.choice(["web", "api", "worker", "db"])}-{i+1:02d}', + 'region': random.choice(['us-east-1', 'us-west-2', 'eu-west-1']), + 'nike-owner': random.choice(owners), + 'current_type': current_type, + 'recommended_type': random.choice(recommended_types) if status != 'Active' else current_type, + 'operating_system': 
random.choice(operating_systems), + 'cpu_utilization_percentage': round(cpu_avg, 1), + 'network_io_mbps': round(random.uniform(0.5, 150), 1), + 'status': status, + 'recommendation': f'Consider {"downsizing" if status == "Oversized" else "stopping" if status == "Idle" else "Graviton migration" if "Graviton" in status else "no action needed"}', + 'estimated_monthly_cost': round(cost, 2), + 'estimated_monthly_savings': round(savings, 2), + # Keep additional fields for internal use + 'cpu_avg_30d': round(cpu_avg, 1), + 'cpu_max_30d': round(min(cpu_avg * random.uniform(1.2, 2.5), 100), 1), + 'memory_avg_30d': round(random.uniform(10, 90), 1), + 'tags': generate_tags() + }) + return instances + + +def generate_rds_data() -> List[Dict]: + """Generate RDS instance data with HTMLReporter-compatible field names.""" + instances = [] + instance_classes = ['db.t3.micro', 'db.t3.small', 'db.t3.medium', 'db.m5.large', + 'db.m5.xlarge', 'db.r5.large', 'db.r5.xlarge'] + engines = ['mysql', 'postgres', 'aurora-mysql', 'aurora-postgresql'] + owners = ['platform-team', 'backend-team', 'data-team', 'analytics-team'] + + for i in range(8): + cpu_avg = random.uniform(5, 75) + cost = random.uniform(50, 1500) + savings = cost * random.uniform(0, 0.5) if cpu_avg < 30 else 0 + current_class = random.choice(instance_classes) + + instances.append({ + 'resource_type': 'RDS Instance', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'db_instance_identifier': f'db-{random.choice(["prod", "staging", "dev"])}-{random.choice(["main", "replica", "analytics"])}-{i+1:02d}', + 'resource_id': f'db-{i+1:02d}', + 'name': f'database-{i+1}', + 'region': random.choice(['us-east-1', 'us-west-2']), + 'nike-owner': random.choice(owners), + 'current_type': current_class, + 'storage_gb': random.randint(20, 1000), # HTMLReporter expects storage_gb + 'cpu_utilization_percentage': round(cpu_avg, 1), + 'connections': random.randint(5, 500), + 
'status': 'Underutilized' if cpu_avg < 20 else 'Active', + 'recommendation': 'Consider downsizing instance class' if savings > 0 else 'Well utilized', + 'estimated_monthly_cost': round(cost, 2), + 'estimated_monthly_savings': round(savings, 2), + # Keep additional fields + 'engine': random.choice(engines), + 'multi_az': random.choice([True, False]), + 'tags': generate_tags() + }) + return instances + + +def generate_s3_data() -> List[Dict]: + """Generate S3 bucket data with HTMLReporter-compatible field names.""" + buckets = [] + purposes = ['logs', 'backups', 'data-lake', 'static-assets', 'archives', 'temp'] + owners = ['platform-team', 'data-team', 'backend-team', 'devops-team'] + + for i in range(10): + size_gb = random.randint(10, 50000) + has_lifecycle = random.choice([True, False]) + has_intelligent_tiering = random.choice([True, False]) + cost = size_gb * 0.023 # Standard storage cost + savings = cost * 0.6 if not has_lifecycle and size_gb > 1000 else 0 + + buckets.append({ + 'resource_type': 'S3 Bucket', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'bucket_name': f'{random.choice(["prod", "staging", "dev"])}-{random.choice(purposes)}-{random.randint(1000, 9999)}', + 'resource_id': f'bucket-{i+1}', + 'name': f'bucket-{i+1}', + 'region': 'us-east-1', + 'nike-owner': random.choice(owners), + 'storage_gb': size_gb, + 'lifecycle_policy_status': 'Configured' if has_lifecycle else 'Not Configured', # HTMLReporter expects lifecycle_policy_status + 'status': 'Lifecycle Missing' if not has_lifecycle else 'Optimized', + 'recommendation': 'Add lifecycle policy for cost optimization' if not has_lifecycle else 'Well configured', + 'estimated_monthly_cost': round(cost, 2), + 'estimated_monthly_savings': round(savings, 2), + # Keep additional fields + 'storage_class': 'STANDARD', + 'object_count': random.randint(1000, 10000000), + 'has_lifecycle_policy': has_lifecycle, + 
'has_intelligent_tiering': has_intelligent_tiering, + 'last_modified_days': random.randint(1, 365), + 'tags': generate_tags() + }) + return buckets + + +def generate_ebs_data() -> List[Dict]: + """Generate EBS volume data with HTMLReporter-compatible field names.""" + volumes = [] + volume_types = ['gp2', 'gp3', 'io1', 'io2', 'st1', 'sc1'] + owners = ['platform-team', 'backend-team', 'data-team', 'devops-team'] + + for i in range(12): + vol_type = random.choice(volume_types) + size_gb = random.randint(20, 2000) + attached = random.choice([True, True, True, False]) # 75% attached + + cost = size_gb * 0.10 if vol_type == 'gp2' else size_gb * 0.08 + savings = cost if not attached else (cost * 0.2 if vol_type == 'gp2' else 0) + iops = random.randint(3000, 16000) if vol_type in ['io1', 'io2', 'gp3'] else random.randint(100, 3000) + throughput = random.randint(125, 1000) if vol_type == 'gp3' else random.randint(50, 250) + + volumes.append({ + 'resource_type': 'EBS Volume', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'volume_id': f'vol-{random.randint(10000000, 99999999):08x}', + 'resource_id': f'vol-{i+1:02d}', + 'name': f'volume-{random.choice(["data", "root", "backup", "temp"])}-{i+1}', + 'region': random.choice(['us-east-1', 'us-west-2']), + 'nike-owner': random.choice(owners), + 'current_type': vol_type, + 'size_gb': size_gb, + 'iops': iops, + 'throughput_mbps': throughput, # HTMLReporter expects throughput_mbps + 'attached_instance': f'i-{random.randint(10000000, 99999999):08x}' if attached else 'N/A', + 'attached_instance_id': f'i-{random.randint(10000000, 99999999):08x}' if attached else 'N/A', # Must be string 'N/A' not None + 'status': 'Unattached' if not attached else ('Optimize to gp3' if vol_type == 'gp2' else 'Active'), + 'recommendation': 'Delete or snapshot unattached volume' if not attached else 'Migrate to gp3' if vol_type == 'gp2' else 'OK', + 'estimated_monthly_cost': 
round(cost, 2), + 'estimated_monthly_savings': round(savings, 2), + # Keep additional fields + 'attached': attached, + 'tags': generate_tags() + }) + return volumes + + +def generate_lambda_data() -> List[Dict]: + """Generate Lambda function data with HTMLReporter-compatible field names.""" + functions = [] + runtimes = ['python3.9', 'python3.11', 'nodejs18.x', 'nodejs20.x', 'java17', 'go1.x'] + owners = ['platform-team', 'backend-team', 'data-team', 'ml-team'] + + for i in range(10): + memory = random.choice([128, 256, 512, 1024, 2048, 3072]) + invocations = random.randint(0, 10000000) + avg_duration = random.randint(50, 5000) + + cost = (invocations / 1000000) * 0.20 + (invocations * avg_duration / 1000 * memory / 1024) * 0.0000166667 + savings = cost * 0.3 if memory > 1024 and avg_duration < 500 else 0 + + functions.append({ + 'resource_type': 'Lambda Function', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'function_name': f'{random.choice(["prod", "dev"])}-{random.choice(["api", "processor", "handler", "worker"])}-{i+1}', + 'resource_id': f'function-{i+1}', + 'name': f'function-{i+1}', + 'region': 'us-east-1', + 'nike-owner': random.choice(owners), + 'runtime': random.choice(runtimes), + 'memory_size': memory, # HTMLReporter expects memory_size + 'status': 'Oversized' if savings > 0 else ('Unused' if invocations < 100 else 'Active'), + 'recommendation': 'Reduce memory allocation' if savings > 0 else 'Consider removing unused function' if invocations < 100 else 'OK', + 'estimated_monthly_cost': round(cost, 2), + 'estimated_monthly_savings': round(savings, 2), + # Keep additional fields + 'memory_mb': memory, + 'timeout_seconds': random.randint(30, 900), + 'invocations_30d': invocations, + 'avg_duration_ms': avg_duration, + 'error_rate': round(random.uniform(0, 5), 2), + 'tags': generate_tags() + }) + return functions + + +def generate_cost_explorer_data() -> List[Dict]: + """Generate 
Cost Explorer anomaly data with HTMLReporter-compatible field names.""" + anomalies = [] + services = ['Amazon EC2', 'Amazon RDS', 'Amazon S3', 'AWS Lambda', 'Amazon CloudFront'] + + for i in range(5): + expected = random.uniform(1000, 10000) + actual = expected * random.uniform(1.3, 2.5) + + anomalies.append({ + 'resource_type': 'Cost Anomaly', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'resource_id': f'anomaly-{random.randint(1000, 9999)}', + 'name': random.choice(services), + 'region': 'us-east-1', + 'nike-owner': 'finops-team', + 'service': random.choice(services), + 'anomaly_date': (datetime.now(timezone.utc) - timedelta(days=random.randint(1, 14))).strftime('%Y-%m-%d'), + 'expected_cost': round(expected, 2), + 'actual_cost': round(actual, 2), + 'variance_percent': round((actual - expected) / expected * 100, 1), + 'status': 'Cost Spike Detected', + 'estimated_monthly_cost': round(actual, 2), + 'estimated_monthly_savings': round(actual - expected, 2), + 'recommendation': f'Investigate {round((actual-expected)/expected*100, 0)}% cost increase', + 'tags': [] + }) + return anomalies + + +def generate_rto_data() -> List[Dict]: + """Generate RTO/RPO analysis data with HTMLReporter-compatible field names.""" + findings = [] + owners = ['platform-team', 'backend-team', 'data-team', 'devops-team'] + + # RDS backup findings + for i in range(4): + retention = random.randint(1, 35) + has_cross_region = random.choice([True, False]) + + findings.append({ + 'resource_type': 'RTO Analysis', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'resource_id': f'db-{random.choice(["prod", "staging"])}-{i+1}', + 'name': f'database-{i+1}', + 'region': 'us-east-1', + 'nike-owner': random.choice(owners), + 'resource_name': f'database-{i+1}', + 'backup_retention_days': retention, + 'cross_region_backup': has_cross_region, + 
'estimated_rto_hours': random.randint(1, 24), + 'estimated_rpo_hours': 24 if retention < 7 else 1, + 'status': 'RTO Risk' if not has_cross_region else 'Compliant', + 'estimated_monthly_cost': 0, + 'estimated_monthly_savings': 0, + 'recommendation': 'Enable cross-region backup for disaster recovery' if not has_cross_region else 'Backup configuration adequate', + 'tags': generate_tags(env='production') + }) + + # S3 replication findings + for i in range(3): + has_versioning = random.choice([True, False]) + has_replication = random.choice([True, False]) + + findings.append({ + 'resource_type': 'RTO Analysis', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'resource_id': f'bucket-critical-{i+1}', + 'name': f'critical-data-bucket-{i+1}', + 'region': 'us-east-1', + 'nike-owner': random.choice(owners), + 'resource_name': f'critical-data-bucket-{i+1}', + 'versioning_enabled': has_versioning, + 'cross_region_replication': has_replication, + 'estimated_rto_hours': 4 if has_replication else 24, + 'estimated_rpo_hours': 0 if has_replication else 24, + 'status': 'RPO Risk' if not has_replication else 'Compliant', + 'estimated_monthly_cost': 0, + 'estimated_monthly_savings': 0, + 'recommendation': 'Enable cross-region replication for critical data' if not has_replication else 'Replication configured', + 'tags': generate_tags(env='production') + }) + + return findings + + +def generate_eks_session_data() -> List[Dict]: + """Generate EKS session monitoring data with HTMLReporter-compatible field names.""" + sessions = [] + users = ['admin', 'developer1', 'developer2', 'ci-bot', 'monitoring-agent'] + + for i in range(6): + duration = random.randint(60, 28800) # 1 min to 8 hours + is_suspicious = duration > 14400 or random.random() > 0.8 + cluster = random.choice(['prod-cluster', 'staging-cluster', 'dev-cluster']) + + sessions.append({ + 'resource_type': 'EKS Session', + # HTMLReporter expects these exact 
field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'resource_id': f'session-{random.randint(100000, 999999)}', + 'name': f'{cluster}-session-{i+1}', + 'region': 'us-east-1', + 'nike-owner': 'devops-team', + 'cluster_name': cluster, + 'user': random.choice(users), + 'session_duration_seconds': duration, + 'commands_executed': random.randint(1, 500), + 'namespace_accessed': random.choice(['default', 'kube-system', 'production', 'monitoring']), + 'status': 'Suspicious Activity' if is_suspicious else 'Normal', + 'started_at': (datetime.now(timezone.utc) - timedelta(seconds=duration)).isoformat(), + 'estimated_monthly_cost': 0, + 'estimated_monthly_savings': 0, + 'recommendation': 'Review long-running session for security' if is_suspicious else 'Normal session activity', + 'tags': [] + }) + return sessions + + +def generate_eks_deployment_data() -> List[Dict]: + """Generate EKS deployment lifecycle data with HTMLReporter-compatible field names.""" + deployments = [] + apps = ['frontend', 'backend-api', 'worker', 'scheduler', 'gateway'] + owners = ['platform-team', 'backend-team', 'frontend-team', 'devops-team'] + + for app in apps: + age_days = random.randint(1, 365) + last_update_days = random.randint(0, age_days) + restart_count = random.randint(0, 50) + replicas = random.randint(1, 10) + + is_stale = last_update_days > 90 + has_restart_issues = restart_count > 20 + + deployments.append({ + 'resource_type': 'EKS Deployment', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'resource_id': f'{random.choice(["prod", "staging"])}-{app}', + 'name': app, + 'region': 'us-east-1', + 'nike-owner': random.choice(owners), + 'deployment_name': app, + 'cluster_name': random.choice(['prod-cluster', 'staging-cluster']), + 'namespace': random.choice(['production', 'staging', 'default']), + 'replicas': replicas, + 'available_replicas': replicas - random.randint(0, 1), + 
'age_days': age_days, + 'last_update_days': last_update_days, + 'restart_count_30d': restart_count, + 'status': 'Stale Deployment' if is_stale else ('Restart Issues' if has_restart_issues else 'Healthy'), + 'estimated_monthly_cost': round(replicas * random.uniform(50, 200), 2), + 'estimated_monthly_savings': 0, + 'recommendation': 'Review and update stale deployment' if is_stale else ('Investigate restart issues' if has_restart_issues else 'Deployment healthy'), + 'tags': generate_tags(project=app.replace('-', ' ').title()) + }) + return deployments + + +def generate_sagemaker_data() -> List[Dict]: + """Generate SageMaker resource data with HTMLReporter-compatible field names.""" + resources = [] + + # Notebook instances + for i in range(3): + hours_idle = random.randint(0, 168) + cost = random.uniform(50, 500) + + notebook_name = f'notebook-{random.choice(["data-science", "ml-training", "experimentation"])}-{i+1}' + resources.append({ + 'resource_type': 'SageMaker Notebook', + # HTMLReporter expects these exact field names for sagemaker + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'resource_arn': f'arn:aws:sagemaker:us-east-1:123456789012:notebook-instance/{notebook_name}', + 'resource_id': notebook_name, + 'name': f'notebook-{i+1}', + 'region': 'us-east-1', + 'nike-owner': 'ml-team', + 'type': 'Notebook Instance', + 'current_type': random.choice(['ml.t3.medium', 'ml.t3.large', 'ml.m5.xlarge']), + 'instance_type': random.choice(['ml.t3.medium', 'ml.t3.large', 'ml.m5.xlarge']), + 'status': 'Idle' if hours_idle > 24 else 'In Use', + 'hours_idle': hours_idle, + 'estimated_monthly_cost': round(cost, 2), + 'estimated_monthly_savings': round(cost * 0.8, 2) if hours_idle > 48 else 0, + 'recommendation': 'Stop idle notebook instance' if hours_idle > 24 else 'Active notebook', + 'tags': generate_tags(team='ML') + }) + + # Endpoints + for i in range(2): + invocations = random.randint(0, 100000) + cost = random.uniform(200, 2000) + + 
endpoint_name = f'endpoint-{random.choice(["inference", "prediction", "scoring"])}-{i+1}' + resources.append({ + 'resource_type': 'SageMaker Endpoint', + # HTMLReporter expects these exact field names for sagemaker + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'resource_arn': f'arn:aws:sagemaker:us-east-1:123456789012:endpoint/{endpoint_name}', + 'resource_id': endpoint_name, + 'name': f'endpoint-{i+1}', + 'region': 'us-east-1', + 'nike-owner': 'ml-team', + 'type': 'Inference Endpoint', + 'current_type': random.choice(['ml.m5.large', 'ml.c5.xlarge', 'ml.g4dn.xlarge']), + 'instance_type': random.choice(['ml.m5.large', 'ml.c5.xlarge', 'ml.g4dn.xlarge']), + 'instance_count': random.randint(1, 4), + 'invocations_30d': invocations, + 'status': 'Low Utilization' if invocations < 1000 else 'Active', + 'estimated_monthly_cost': round(cost, 2), + 'estimated_monthly_savings': round(cost * 0.7, 2) if invocations < 1000 else 0, + 'recommendation': 'Consider serverless inference' if invocations < 1000 else 'Well utilized endpoint', + 'tags': generate_tags(team='ML') + }) + + return resources + + +def generate_bedrock_data() -> List[Dict]: + """Generate Bedrock/AI service data with HTMLReporter-compatible field names.""" + resources = [] + is_underutilized = random.random() > 0.5 + + # Provisioned throughput + resources.append({ + 'resource_type': 'Bedrock Provisioned Throughput', + # HTMLReporter expects these exact field names + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'resource_id': 'pt-claude-instant-prod', + 'name': 'Claude Instant Production', + 'region': 'us-east-1', + 'nike-owner': 'ml-team', + 'model_id': 'anthropic.claude-instant-v1', + 'model_units': 2, + 'utilization_percent': random.uniform(10, 90), + 'status': 'Underutilized' if is_underutilized else 'Active', + 'estimated_monthly_cost': 2400.00, + 'estimated_monthly_savings': 1200.00 if is_underutilized else 0, + 'recommendation': 'Consider on-demand 
pricing for variable workloads', + 'tags': generate_tags(team='ML') + }) + + return resources + + +def generate_multi_account_summary() -> Dict: + """Generate multi-account summary.""" + accounts = [ + {'account_id': '111111111111', 'alias': 'Production', 'resources': 45, 'monthly_cost': 15000, 'savings': 3500}, + {'account_id': '222222222222', 'alias': 'Staging', 'resources': 25, 'monthly_cost': 5000, 'savings': 2000}, + {'account_id': '333333333333', 'alias': 'Development', 'resources': 35, 'monthly_cost': 8000, 'savings': 4500}, + {'account_id': '444444444444', 'alias': 'Sandbox', 'resources': 15, 'monthly_cost': 2000, 'savings': 1500}, + ] + return { + 'accounts': accounts, + 'total_accounts': len(accounts), + 'total_resources': sum(a['resources'] for a in accounts), + 'total_monthly_cost': sum(a['monthly_cost'] for a in accounts), + 'total_savings': sum(a['savings'] for a in accounts) + } + + +def generate_all_mock_data() -> Dict[str, List[Dict]]: + """Generate all mock data for comprehensive report.""" + return { + 'ec2': generate_ec2_data(), + 'rds': generate_rds_data(), + 's3': generate_s3_data(), + 'ebs': generate_ebs_data(), + 'lambda': generate_lambda_data(), + 'cost_explorer': generate_cost_explorer_data(), + 'rto_analysis': generate_rto_data(), + 'eks_sessions': generate_eks_session_data(), + 'eks_deployments': generate_eks_deployment_data(), + 'sagemaker': generate_sagemaker_data(), + 'bedrock': generate_bedrock_data(), + } + + +def calculate_summary(results: Dict[str, List[Dict]]) -> Dict: + """Calculate summary statistics compatible with HTMLReporter.""" + total_resources = sum(len(v) for v in results.values()) + total_cost = sum(r.get('estimated_monthly_cost', 0) for v in results.values() for r in v) + total_savings = sum(r.get('estimated_monthly_savings', 0) for v in results.values() for r in v) + + # Group by status + status_counts = {} + for resources in results.values(): + for r in resources: + status = r.get('status', 'Unknown') + 
status_counts[status] = status_counts.get(status, 0) + 1 + + # Group by team + team_costs = {} + for resources in results.values(): + for r in resources: + team = 'Untagged' + for tag in r.get('tags', []): + if tag.get('Key') == 'Team': + team = tag.get('Value', 'Untagged') + break + team_costs[team] = team_costs.get(team, 0) + r.get('estimated_monthly_cost', 0) + + # Calculate EC2-specific savings for right-sizing + ec2_savings = sum(r.get('estimated_monthly_savings', 0) for r in results.get('ec2', [])) + + # Calculate spot savings (simulate ~40% of EC2 cost as spot potential) + spot_monthly = ec2_savings * 0.6 + spot_annual = spot_monthly * 12 + + # Calculate old EBS snapshots + old_snapshots = random.randint(5, 25) + old_snapshot_cost = old_snapshots * random.uniform(5, 15) + + return { + # Basic info - required by HTMLReporter + 'account_id': '123456789012', + 'account_alias': 'simulated-account', + 'region': 'us-east-1', + 'analysis_date': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'), + 'analyzed_regions': ['us-east-1', 'us-west-2', 'eu-west-1'], + + # Resource counts + 'total_resources': total_resources, + 'optimization_opportunities': sum(1 for v in results.values() for r in v if r.get('estimated_monthly_savings', 0) > 0), + + # Cost summaries + 'total_monthly_cost': round(total_cost, 2), + 'potential_monthly_savings': round(total_savings, 2), + 'potential_annual_savings': round(total_savings * 12, 2), + + # Spot savings (simulated) + 'spot': { + 'estimated_monthly_savings_spot_priority': round(spot_monthly, 2), + 'estimated_annual_savings_spot_priority': round(spot_annual, 2), + }, + + # Right-sizing savings + 'estimated_monthly_savings_right_sizing_priority': round(ec2_savings, 2), + 'estimated_annual_savings_right_sizing_priority': round(ec2_savings * 12, 2), + + # Schedule-based savings (simulate dev/staging stop outside hours) + 'estimated_annual_savings_schedule': round(total_savings * 0.3 * 12, 2), + + # EBS Snapshots + 'ebs_snapshot': 
{ + 'old_ebs_snapshots_count': old_snapshots, + 'old_ebs_snapshots_estimated_cost': round(old_snapshot_cost, 2), + }, + + # Security findings (simulated) + 'security_findings_count': random.randint(3, 15), + + # Additional data + 'status_breakdown': status_counts, + 'cost_by_team': team_costs, + 'multi_account': generate_multi_account_summary() + } + + +def print_summary(summary: Dict): + """Print summary to console.""" + print("\n" + "=" * 70) + print("DEDO-DURO COMPLETE SIMULATION REPORT v12.0") + print("=" * 70) + + print(f"\n📊 OVERALL SUMMARY") + print("-" * 40) + print(f" Total Resources Analyzed: {summary['total_resources']:,}") + print(f" Optimization Opportunities: {summary['optimization_opportunities']:,}") + print(f" Total Monthly Cost: ${summary['total_monthly_cost']:,.2f}") + print(f" Potential Monthly Savings: ${summary['potential_monthly_savings']:,.2f}") + print(f" Potential Annual Savings: ${summary['potential_annual_savings']:,.2f}") + + print(f"\n💰 SAVINGS BY TEAM") + print("-" * 40) + for team, cost in sorted(summary['cost_by_team'].items(), key=lambda x: -x[1]): + print(f" {team:20s} ${cost:,.2f}") + + print(f"\n🏢 MULTI-ACCOUNT SUMMARY") + print("-" * 40) + ma = summary['multi_account'] + print(f" Total Accounts: {ma['total_accounts']}") + print(f" Total Resources: {ma['total_resources']}") + print(f" Combined Cost: ${ma['total_monthly_cost']:,.2f}/month") + print(f" Combined Savings: ${ma['total_savings']:,.2f}/month") + + print("\n" + "=" * 70) + + +def main(): + print("\n🚀 Starting Dedo-Duro Complete Simulation...") + + # Generate mock data + print("📝 Generating comprehensive mock AWS resource data...") + results = generate_all_mock_data() + + # Calculate summary + print("📊 Calculating summary statistics...") + summary = calculate_summary(results) + + # Print console summary + print_summary(summary) + + # Generate HTML report + print("\n📄 Generating HTML report with Chart.js visualizations...") + config = MockReportConfig() + reporter = 
HTMLReporter(config) + + html_content = reporter.generate(results, summary) + + output_file = config.output_file + with open(output_file, 'w', encoding='utf-8') as f: + f.write(html_content) + + file_size = os.path.getsize(output_file) + print(f"✅ Report saved to: {os.path.abspath(output_file)}") + print(f" Report size: {file_size:,} bytes") + + # Verify charts + print("\n📈 Chart Verification:") + charts_found = { + 'Savings by Resource Type': 'savingsChart' in html_content, + 'Resource Distribution': 'resourceChart' in html_content, + 'Cost by Status': 'statusChart' in html_content or 'chart' in html_content.lower(), + } + for chart_name, found in charts_found.items(): + status = "✅" if found else "❌" + print(f" {status} {chart_name}") + + print("\n" + "=" * 70) + print("🎉 Simulation complete! Open the report in your browser:") + print(f" file://{os.path.abspath(output_file)}") + print("=" * 70 + "\n") + + return summary + + +if __name__ == '__main__': + main() diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..6509c48 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,27 @@ +""" +Pytest configuration for Dedo-Duro tests. + +This conftest.py ensures the project root is in sys.path first, +avoiding conflicts with any other installed versions. 
+""" + +import sys +from pathlib import Path + +# Insert project root at the very beginning of sys.path +# This must happen before any imports from the project +project_root = Path(__file__).parent.parent.resolve() +if str(project_root) in sys.path: + sys.path.remove(str(project_root)) +sys.path.insert(0, str(project_root)) + +# Also remove any conflicting paths that contain another dedo-duro installation +paths_to_remove = [p for p in sys.path if 'myenv' in p and 'dedo-duro' in p] +for p in paths_to_remove: + sys.path.remove(p) + +# Verify we can import from the correct location +import analyzers +expected_path = str(project_root / 'analyzers') +actual_path = str(Path(analyzers.__file__).parent.resolve()) +assert actual_path == expected_path, f"Wrong analyzers path: {actual_path} != {expected_path}" diff --git a/tests/test_comprehensive.py b/tests/test_comprehensive.py new file mode 100644 index 0000000..23e9a4f --- /dev/null +++ b/tests/test_comprehensive.py @@ -0,0 +1,768 @@ +""" +Comprehensive test suite for all Dedo-Duro features. 
+ +Tests cover: +- All new analyzers (Cost Explorer, RTO, EKS Session, EKS Deployment) +- Web API endpoints +- Remediation framework +- Multi-account configuration +- Environment filtering +- CI/CD configuration validation +""" + +import pytest +import json +import os +from datetime import datetime, timezone, timedelta +from unittest.mock import MagicMock, patch +from pathlib import Path + + +# ============================================================================ +# FIXTURES +# ============================================================================ + +@pytest.fixture +def mock_aws_config(): + """Create a mock AWS configuration.""" + from config import AWSConfig + config = MagicMock(spec=AWSConfig) + config.region = 'us-east-1' + config.create_client = MagicMock(return_value=MagicMock()) + return config + + +@pytest.fixture +def mock_analysis_config(): + """Create a mock analysis configuration.""" + from config import AnalysisConfig + return AnalysisConfig() + + +@pytest.fixture +def mock_metrics(): + """Create a mock CloudWatch metrics.""" + from core.metrics import CloudWatchMetrics + return MagicMock(spec=CloudWatchMetrics) + + +# ============================================================================ +# COST EXPLORER ANALYZER TESTS +# ============================================================================ + +class TestCostExplorerAnalyzer: + """Test Cost Explorer analyzer functionality.""" + + def test_module_import(self): + """Test Cost Explorer analyzer can be imported.""" + from analyzers.cost_explorer_analyzer import CostExplorerAnalyzer + assert CostExplorerAnalyzer is not None + + def test_analyzer_initialization(self, mock_aws_config, mock_analysis_config, mock_metrics): + """Test Cost Explorer analyzer can be initialized.""" + from analyzers.cost_explorer_analyzer import CostExplorerAnalyzer + + analyzer = CostExplorerAnalyzer( + mock_aws_config, mock_analysis_config, mock_metrics, '123456789012', 'aws' + ) + assert 
analyzer.get_service_name() == 'cost_explorer' + assert analyzer.get_estimated_time() == "Medium (30-60s)" + + def test_get_costs_by_service(self, mock_aws_config, mock_analysis_config, mock_metrics): + """Test fetching costs grouped by service.""" + from analyzers.cost_explorer_analyzer import CostExplorerAnalyzer + from datetime import date + + mock_ce_client = MagicMock() + mock_ce_client.get_cost_and_usage.return_value = { + 'ResultsByTime': [{ + 'Groups': [ + {'Keys': ['Amazon EC2'], 'Metrics': {'UnblendedCost': {'Amount': '100.50'}}}, + {'Keys': ['Amazon RDS'], 'Metrics': {'UnblendedCost': {'Amount': '50.25'}}} + ] + }] + } + mock_aws_config.create_client.return_value = mock_ce_client + + analyzer = CostExplorerAnalyzer( + mock_aws_config, mock_analysis_config, mock_metrics, '123456789012', 'aws' + ) + analyzer.ce_client = mock_ce_client + + costs = analyzer._get_costs_by_service(date(2024, 1, 1), date(2024, 1, 31)) + + assert mock_ce_client.get_cost_and_usage.called + assert 'Amazon EC2' in costs + assert costs['Amazon EC2'] == 100.50 + + def test_detect_anomalies_no_data(self, mock_aws_config, mock_analysis_config, mock_metrics): + """Test anomaly detection with insufficient data.""" + from analyzers.cost_explorer_analyzer import CostExplorerAnalyzer + + analyzer = CostExplorerAnalyzer( + mock_aws_config, mock_analysis_config, mock_metrics, '123456789012', 'aws' + ) + + # Less than 7 days of data + daily_costs = [{'date': '2024-01-01', 'cost': 100.0}] + anomalies = analyzer._detect_anomalies(daily_costs) + assert len(anomalies) == 0 + + def test_detect_anomalies_with_spike(self, mock_aws_config, mock_analysis_config, mock_metrics): + """Test anomaly detection with cost spike.""" + from analyzers.cost_explorer_analyzer import CostExplorerAnalyzer + + analyzer = CostExplorerAnalyzer( + mock_aws_config, mock_analysis_config, mock_metrics, '123456789012', 'aws' + ) + + # Normal costs with one huge spike + daily_costs = [{'date': f'2024-01-{i:02d}', 'cost': 
100.0} for i in range(1, 30)] + daily_costs.append({'date': '2024-01-30', 'cost': 1000.0}) # Spike! + + anomalies = analyzer._detect_anomalies(daily_costs) + assert len(anomalies) >= 1 + + +# ============================================================================ +# RTO ANALYZER TESTS +# ============================================================================ + +class TestRTOAnalyzer: + """Test RTO/RPO disaster recovery analyzer.""" + + def test_module_import(self): + """Test RTO analyzer can be imported.""" + from analyzers.rto_analyzer import RTOAnalyzer + assert RTOAnalyzer is not None + + def test_analyzer_initialization(self, mock_aws_config, mock_analysis_config, mock_metrics): + """Test RTO analyzer can be initialized.""" + from analyzers.rto_analyzer import RTOAnalyzer + + analyzer = RTOAnalyzer( + mock_aws_config, mock_analysis_config, mock_metrics, '123456789012', 'aws' + ) + assert analyzer.get_service_name() == 'rto_analysis' + + def test_analyze_rds_backups(self, mock_aws_config, mock_analysis_config, mock_metrics): + """Test RDS backup analysis.""" + from analyzers.rto_analyzer import RTOAnalyzer + + mock_rds_client = MagicMock() + mock_rds_client.get_paginator.return_value.paginate.return_value = [{ + 'DBInstances': [{ + 'DBInstanceIdentifier': 'mydb', + 'DBInstanceClass': 'db.t3.medium', + 'Engine': 'mysql', + 'BackupRetentionPeriod': 7, + 'MultiAZ': True, + 'StorageEncrypted': True + }] + }] + + analyzer = RTOAnalyzer( + mock_aws_config, mock_analysis_config, mock_metrics, '123456789012', 'aws' + ) + analyzer.rds_client = mock_rds_client + + issues = analyzer._analyze_rds_backups() + + # MultiAZ with 7 day retention should have minimal findings + assert isinstance(issues, list) + + +# ============================================================================ +# EKS SESSION ANALYZER TESTS +# ============================================================================ + +class TestEKSSessionAnalyzer: + """Test EKS session monitoring 
analyzer.""" + + def test_module_import(self): + """Test EKS Session analyzer can be imported.""" + from analyzers.eks_session_analyzer import EKSSessionAnalyzer + assert EKSSessionAnalyzer is not None + + def test_analyzer_initialization(self, mock_aws_config, mock_analysis_config, mock_metrics): + """Test EKS Session analyzer can be initialized.""" + from analyzers.eks_session_analyzer import EKSSessionAnalyzer + + analyzer = EKSSessionAnalyzer( + mock_aws_config, mock_analysis_config, mock_metrics, '123456789012', 'aws' + ) + assert analyzer.get_service_name() == 'eks_sessions' + + def test_get_active_ssm_sessions(self, mock_aws_config, mock_analysis_config, mock_metrics): + """Test SSM session retrieval.""" + from analyzers.eks_session_analyzer import EKSSessionAnalyzer + + mock_ssm = MagicMock() + mock_ssm.get_paginator.return_value.paginate.return_value = [{ + 'Sessions': [{ + 'SessionId': 'session-123', + 'Target': 'i-12345', + 'Status': 'Connected', + 'StartDate': datetime.now(timezone.utc), + 'Owner': 'arn:aws:iam::123456789012:user/admin' + }] + }] + + analyzer = EKSSessionAnalyzer( + mock_aws_config, mock_analysis_config, mock_metrics, '123456789012', 'aws' + ) + analyzer.ssm_client = mock_ssm + + sessions = analyzer._get_active_ssm_sessions() + assert len(sessions) == 1 + assert sessions[0]['session_id'] == 'session-123' + + +# ============================================================================ +# EKS DEPLOYMENT LIFECYCLE TESTS +# ============================================================================ + +class TestEKSDeploymentLifecycleAnalyzer: + """Test EKS deployment lifecycle analyzer.""" + + def test_module_import(self): + """Test EKS Deployment Lifecycle analyzer can be imported.""" + from analyzers.eks_deployment_lifecycle import EKSDeploymentLifecycleAnalyzer + assert EKSDeploymentLifecycleAnalyzer is not None + + def test_analyzer_initialization(self, mock_aws_config, mock_analysis_config, mock_metrics): + """Test EKS Deployment 
Lifecycle analyzer can be initialized.""" + from analyzers.eks_deployment_lifecycle import EKSDeploymentLifecycleAnalyzer + + analyzer = EKSDeploymentLifecycleAnalyzer( + mock_aws_config, mock_analysis_config, mock_metrics, '123456789012', 'aws' + ) + assert analyzer.get_service_name() == 'eks_deployments' + + +# ============================================================================ +# WEB API TESTS +# ============================================================================ + +class TestWebAPI: + """Test FastAPI web application.""" + + def test_app_imports(self): + """Test that web app can be imported.""" + from web.app import app, AnalysisRequest, AnalysisStatus + assert app is not None + + def test_analysis_request_validation(self): + """Test AnalysisRequest model validation.""" + from web.app import AnalysisRequest + + request = AnalysisRequest( + region='us-east-1', + resource_types=['ec2', 'rds'], + output_format='json' + ) + assert request.region == 'us-east-1' + assert len(request.resource_types) == 2 + + def test_analysis_request_defaults(self): + """Test AnalysisRequest default values.""" + from web.app import AnalysisRequest + + request = AnalysisRequest() + assert request.output_format == 'html' + assert request.multi_region is False + assert request.region is None + + def test_analysis_status_model(self): + """Test AnalysisStatus model.""" + from web.app import AnalysisStatus + + now = datetime.now(timezone.utc) + status = AnalysisStatus( + job_id='test-123', + status='completed', + started_at=now, + completed_at=now, + progress=100, + message='Done', + report_url='/reports/test.html', + error=None + ) + assert status.job_id == 'test-123' + assert status.progress == 100 + + def test_list_analyzers_endpoint(self): + """Test list analyzers endpoint.""" + from fastapi.testclient import TestClient + from web.app import app + + client = TestClient(app) + response = client.get('/api/analyzers') + assert response.status_code == 200 + data = 
response.json() + assert 'analyzers' in data + + +# ============================================================================ +# REMEDIATION FRAMEWORK TESTS +# ============================================================================ + +class TestRemediationFramework: + """Test remediation base classes and EC2 remediation.""" + + def test_remediation_risk_levels(self): + """Test risk level enumeration.""" + from remediation.base import RemediationRisk + + assert RemediationRisk.LOW.value == 'low' + assert RemediationRisk.MEDIUM.value == 'medium' + assert RemediationRisk.HIGH.value == 'high' + assert RemediationRisk.CRITICAL.value == 'critical' + + def test_remediation_status_lifecycle(self): + """Test remediation status states.""" + from remediation.base import RemediationStatus + + assert RemediationStatus.PENDING.value == 'pending' + assert RemediationStatus.APPROVED.value == 'approved' + assert RemediationStatus.IN_PROGRESS.value == 'in_progress' + assert RemediationStatus.COMPLETED.value == 'completed' + assert RemediationStatus.FAILED.value == 'failed' + assert RemediationStatus.CANCELLED.value == 'cancelled' + assert RemediationStatus.DRY_RUN.value == 'dry_run' + + def test_remediation_action_creation(self): + """Test creating a remediation action.""" + from remediation.base import RemediationAction, RemediationRisk, RemediationStatus + + action = RemediationAction( + action_id='action-001', + resource_type='ec2_instance', + resource_id='i-12345', + action_type='stop_instance', + description='Stop idle instance', + risk_level=RemediationRisk.MEDIUM + ) + + assert action.action_id == 'action-001' + assert action.status == RemediationStatus.PENDING + assert action.dry_run is True + + def test_remediation_action_to_dict(self): + """Test serializing action to dictionary.""" + from remediation.base import RemediationAction, RemediationRisk + + action = RemediationAction( + action_id='action-001', + resource_type='ec2_instance', + resource_id='i-12345', + 
action_type='add_tags', + description='Add compliance tags', + risk_level=RemediationRisk.LOW + ) + + result = action.to_dict() + + assert result['action_id'] == 'action-001' + assert result['resource_type'] == 'ec2_instance' + assert result['risk_level'] == 'low' + assert result['status'] == 'pending' + + def test_ec2_remediation_available_actions(self, mock_aws_config): + """Test listing available EC2 remediation actions.""" + from remediation.ec2_remediation import EC2Remediation + + remediation = EC2Remediation(mock_aws_config, dry_run=True) + actions = remediation.get_available_actions() + + action_types = [a['action_type'] for a in actions] + assert 'add_tags' in action_types + assert 'create_snapshot' in action_types + assert 'stop_instance' in action_types + assert 'terminate_instance' in action_types + + def test_ec2_auto_approve_low_risk(self, mock_aws_config): + """Test auto-approval for low risk actions.""" + from remediation.ec2_remediation import EC2Remediation + from remediation.base import RemediationAction, RemediationRisk + + remediation = EC2Remediation(mock_aws_config, dry_run=True) + + action = RemediationAction( + action_id='test-1', + resource_type='ec2_instance', + resource_id='i-12345', + action_type='add_tags', + description='Add tags', + risk_level=RemediationRisk.LOW + ) + + assert remediation.can_auto_approve(action) is True + + def test_ec2_no_auto_approve_high_risk(self, mock_aws_config): + """Test that high risk actions are not auto-approved.""" + from remediation.ec2_remediation import EC2Remediation + from remediation.base import RemediationAction, RemediationRisk + + remediation = EC2Remediation(mock_aws_config, dry_run=True) + + action = RemediationAction( + action_id='test-1', + resource_type='ec2_instance', + resource_id='i-12345', + action_type='terminate_instance', + description='Terminate', + risk_level=RemediationRisk.HIGH + ) + + assert remediation.can_auto_approve(action) is False + + def test_ec2_add_tags_action(self, 
mock_aws_config): + """Test creating add_tags remediation action.""" + from remediation.ec2_remediation import EC2Remediation + + remediation = EC2Remediation(mock_aws_config, dry_run=True) + + action = remediation.add_tags( + instance_id='i-12345', + tags={'Environment': 'test', 'Owner': 'team-a'}, + auto_execute=False + ) + + assert action.action_type == 'add_tags' + assert action.resource_id == 'i-12345' + + def test_ec2_terminate_never_auto_executes(self, mock_aws_config): + """Test that terminate action never auto-executes.""" + from remediation.ec2_remediation import EC2Remediation + from remediation.base import RemediationStatus + + remediation = EC2Remediation(mock_aws_config, dry_run=True) + + action = remediation.terminate_instance( + instance_id='i-12345', + create_snapshot_first=True + ) + + assert action.status == RemediationStatus.PENDING + assert action.action_type == 'terminate_instance' + + def test_approve_action(self, mock_aws_config): + """Test approving a remediation action.""" + from remediation.ec2_remediation import EC2Remediation + from remediation.base import RemediationStatus + + remediation = EC2Remediation(mock_aws_config, dry_run=True) + + action = remediation.add_tags( + instance_id='i-12345', + tags={'CostCenter': 'IT'}, + auto_execute=False + ) + + assert action.status == RemediationStatus.PENDING + + # Approve the action + result = remediation.approve_action(action.action_id, approved_by='admin@example.com') + assert result is True + assert action.status == RemediationStatus.APPROVED + assert action.approved_by == 'admin@example.com' + + def test_get_pending_actions(self, mock_aws_config): + """Test getting pending actions.""" + from remediation.ec2_remediation import EC2Remediation + + remediation = EC2Remediation(mock_aws_config, dry_run=True) + + # Create some actions + remediation.add_tags('i-1', {'Key': 'Value'}) + remediation.add_tags('i-2', {'Key': 'Value'}) + + pending = remediation.get_pending_actions() + assert 
len(pending) == 2 + + def test_cancel_action(self, mock_aws_config): + """Test cancelling a remediation action.""" + from remediation.ec2_remediation import EC2Remediation + from remediation.base import RemediationStatus + + remediation = EC2Remediation(mock_aws_config, dry_run=True) + + action = remediation.add_tags('i-12345', {'Key': 'Value'}) + assert action.status == RemediationStatus.PENDING + + result = remediation.cancel_action(action.action_id, reason='No longer needed') + assert result is True + assert action.status == RemediationStatus.CANCELLED + + +# ============================================================================ +# MULTI-ACCOUNT CONFIGURATION TESTS +# ============================================================================ + +class TestMultiAccountConfig: + """Test multi-account configuration.""" + + def test_default_values(self): + """Test MultiAccountConfig default values.""" + from config import MultiAccountConfig + + config = MultiAccountConfig() + + assert config.enabled is False + assert config.parallel_accounts is True + assert config.session_duration == 3600 + assert config.max_account_workers == 3 + assert config.partition == 'aws' + + def test_arn_generation_default_partition(self): + """Test ARN generation with default partition.""" + from config import MultiAccountConfig + + config = MultiAccountConfig() + arn = config.get_assume_role_arn('123456789012', 'CrossAccountRole') + + assert arn == 'arn:aws:iam::123456789012:role/CrossAccountRole' + + def test_arn_generation_china_partition(self): + """Test ARN generation with China partition.""" + from config import MultiAccountConfig + + config = MultiAccountConfig(partition='aws-cn') + arn = config.get_assume_role_arn('123456789012', 'CrossAccountRole') + + assert arn == 'arn:aws-cn:iam::123456789012:role/CrossAccountRole' + + def test_arn_generation_govcloud_partition(self): + """Test ARN generation with GovCloud partition.""" + from config import MultiAccountConfig + + config 
= MultiAccountConfig(partition='aws-us-gov') + arn = config.get_assume_role_arn('123456789012', 'CrossAccountRole') + + assert arn == 'arn:aws-us-gov:iam::123456789012:role/CrossAccountRole' + + def test_load_accounts_from_json(self, tmp_path): + """Test loading accounts from JSON file.""" + from config import MultiAccountConfig + + accounts_file = tmp_path / "accounts.json" + accounts_data = { + 'accounts': [ + {'account_id': '111111111111', 'role_name': 'Role1', 'alias': 'prod'}, + {'account_id': '222222222222', 'role_name': 'Role2', 'alias': 'dev'} + ] + } + accounts_file.write_text(json.dumps(accounts_data)) + + config = MultiAccountConfig(accounts_file=str(accounts_file)) + loaded = config.load_accounts_from_file() + + assert len(loaded) == 2 + assert loaded[0]['account_id'] == '111111111111' + + +# ============================================================================ +# ENVIRONMENT FILTERING TESTS +# ============================================================================ + +class TestEnvironmentFiltering: + """Test environment filtering in AnalysisConfig.""" + + def test_exact_match(self): + """Test exact environment match.""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter='prod') + tags = [{'Key': 'Environment', 'Value': 'prod'}] + + assert config.matches_environment(tags) is True + + def test_partial_match(self): + """Test partial environment match.""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter='prod') + tags = [{'Key': 'Environment', 'Value': 'production'}] + + assert config.matches_environment(tags) is True + + def test_no_match(self): + """Test when environment doesn't match.""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter='prod') + tags = [{'Key': 'Environment', 'Value': 'development'}] + + assert config.matches_environment(tags) is False + + def test_no_filter_matches_all(self): + """Test that no filter means all environments 
match.""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter=None) + tags = [{'Key': 'Environment', 'Value': 'anything'}] + + assert config.matches_environment(tags) is True + + def test_custom_tag_key(self): + """Test matching with custom environment tag key.""" + from config import AnalysisConfig + + config = AnalysisConfig( + environment_filter='staging', + environment_tags=['Stage', 'Env', 'Environment'] + ) + tags = [{'Key': 'Stage', 'Value': 'staging-east'}] + + assert config.matches_environment(tags) is True + + def test_empty_tags(self): + """Test with empty tags list.""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter='prod') + tags = [] + + assert config.matches_environment(tags) is False + + +# ============================================================================ +# CI/CD CONFIGURATION VALIDATION TESTS +# ============================================================================ + +class TestCICDConfigurations: + """Test CI/CD configuration file validity.""" + + def test_github_actions_yaml_valid(self): + """Test GitHub Actions workflow is valid YAML.""" + import yaml + + workflow_path = Path('.github/workflows/dedo-duro-analysis.yml') + if workflow_path.exists(): + with open(workflow_path) as f: + workflow = yaml.safe_load(f) + + assert 'name' in workflow + # Note: YAML parses 'on' as boolean True, so check for both + assert 'on' in workflow or True in workflow + assert 'jobs' in workflow + + def test_circleci_yaml_valid(self): + """Test CircleCI config is valid YAML.""" + import yaml + + config_path = Path('.circleci/config.yml') + if config_path.exists(): + with open(config_path) as f: + config = yaml.safe_load(f) + + assert 'version' in config + assert 'jobs' in config + + def test_jenkinsfile_exists(self): + """Test Jenkinsfile exists and has content.""" + jenkinsfile_path = Path('ci/Jenkinsfile') + if jenkinsfile_path.exists(): + content = jenkinsfile_path.read_text() + 
assert 'pipeline' in content + assert 'stages' in content + + +# ============================================================================ +# CORE FUNCTIONALITY TESTS +# ============================================================================ + +class TestCoreFunctionality: + """Test core functionality and integrations.""" + + def test_reporter_tag_grouping(self): + """Test tag-based grouping in reporters.""" + from core.reporter import BaseReporter + from config import ReportConfig + + config = ReportConfig() + + class TestReporter(BaseReporter): + def generate(self, results, summary): + return '' + + reporter = TestReporter(config) + + resource = {'tags': [{'Key': 'Team', 'Value': 'Platform'}]} + group = reporter._get_tag_group(resource, ['Team', 'Project']) + assert group == 'Platform' + + def test_reporter_untagged_fallback(self): + """Test fallback to Untagged for resources without matching tags.""" + from core.reporter import BaseReporter + from config import ReportConfig + + config = ReportConfig() + + class TestReporter(BaseReporter): + def generate(self, results, summary): + return '' + + reporter = TestReporter(config) + + resource = {'tags': []} + group = reporter._get_tag_group(resource, ['Team', 'Project']) + assert group == 'Untagged' + + +# ============================================================================ +# INTEGRATION TESTS +# ============================================================================ + +class TestIntegration: + """Integration tests for complete workflows.""" + + def test_all_analyzers_importable(self): + """Test that all analyzer modules can be imported.""" + analyzers_to_test = [ + 'analyzers.cost_explorer_analyzer', + 'analyzers.rto_analyzer', + 'analyzers.eks_session_analyzer', + 'analyzers.eks_deployment_lifecycle', + 'analyzers.ec2', + 'analyzers.rds', + ] + + for module_name in analyzers_to_test: + try: + __import__(module_name) + except ImportError as e: + pytest.fail(f"Failed to import {module_name}: 
{e}") + + def test_all_remediation_modules_importable(self): + """Test that all remediation modules can be imported.""" + modules = [ + 'remediation.base', + 'remediation.ec2_remediation', + ] + + for module_name in modules: + try: + __import__(module_name) + except ImportError as e: + pytest.fail(f"Failed to import {module_name}: {e}") + + def test_web_app_routes_registered(self): + """Test that all expected web routes are registered.""" + from web.app import app + + routes = [route.path for route in app.routes] + + expected_routes = ['/api/analysis', '/api/analyzers', '/api/reports'] + + for expected in expected_routes: + assert any(expected in route for route in routes), f"Route {expected} not found" + + +# ============================================================================ +# MAIN +# ============================================================================ + +if __name__ == '__main__': + pytest.main([__file__, '-v', '--tb=short']) diff --git a/tests/test_new_features.py b/tests/test_new_features.py new file mode 100644 index 0000000..9657aae --- /dev/null +++ b/tests/test_new_features.py @@ -0,0 +1,283 @@ +""" +Tests for new features in the Dedo-Duro roadmap implementation. 
+ +Tests cover: +- Environment filtering +- Multi-account configuration +- Remediation base classes +- Web API models +""" + +import pytest +from datetime import datetime +from unittest.mock import MagicMock, patch + + +class TestEnvironmentFiltering: + """Test environment filtering in AnalysisConfig.""" + + def test_matches_environment_exact_match(self): + """Test exact environment match.""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter='prod') + tags = [{'Key': 'Environment', 'Value': 'prod'}] + assert config.matches_environment(tags) is True + + def test_matches_environment_partial_match(self): + """Test partial environment match (prod matches production).""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter='prod') + tags = [{'Key': 'Environment', 'Value': 'production'}] + assert config.matches_environment(tags) is True + + def test_matches_environment_no_match(self): + """Test when environment doesn't match.""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter='prod') + tags = [{'Key': 'Environment', 'Value': 'development'}] + assert config.matches_environment(tags) is False + + def test_matches_environment_no_filter(self): + """Test that no filter means all environments match.""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter=None) + tags = [{'Key': 'Environment', 'Value': 'anything'}] + assert config.matches_environment(tags) is True + + def test_matches_environment_custom_tag_key(self): + """Test matching with custom environment tag key.""" + from config import AnalysisConfig + + config = AnalysisConfig(environment_filter='test', environment_tags=['Stage']) + tags = [{'Key': 'Stage', 'Value': 'testing'}] + assert config.matches_environment(tags) is True + + +class TestMultiAccountConfig: + """Test multi-account configuration.""" + + def test_default_values(self): + """Test MultiAccountConfig default values.""" + from 
config import MultiAccountConfig + + config = MultiAccountConfig() + assert config.enabled is False + assert config.parallel_accounts is True + assert config.session_duration == 3600 + assert config.max_account_workers == 3 + + def test_get_assume_role_arn(self): + """Test ARN generation.""" + from config import MultiAccountConfig + + config = MultiAccountConfig() + arn = config.get_assume_role_arn('123456789012', 'MyRole') + assert arn == 'arn:aws:iam::123456789012:role/MyRole' + + def test_get_assume_role_arn_custom_partition(self): + """Test ARN generation with custom partition.""" + from config import MultiAccountConfig + + config = MultiAccountConfig(partition='aws-cn') + arn = config.get_assume_role_arn('123456789012', 'MyRole') + assert arn == 'arn:aws-cn:iam::123456789012:role/MyRole' + + +class TestRemediationBase: + """Test remediation base classes.""" + + def test_remediation_action_creation(self): + """Test creating a remediation action.""" + from remediation.base import RemediationAction, RemediationRisk, RemediationStatus + + action = RemediationAction( + action_id='test123', + resource_type='ec2_instance', + resource_id='i-12345', + action_type='add_tags', + description='Add tags to instance', + risk_level=RemediationRisk.LOW + ) + + assert action.action_id == 'test123' + assert action.status == RemediationStatus.PENDING + assert action.dry_run is True + + def test_remediation_action_to_dict(self): + """Test converting action to dictionary.""" + from remediation.base import RemediationAction, RemediationRisk + + action = RemediationAction( + action_id='test123', + resource_type='ec2_instance', + resource_id='i-12345', + action_type='stop_instance', + description='Stop instance', + risk_level=RemediationRisk.MEDIUM + ) + + result = action.to_dict() + assert result['action_id'] == 'test123' + assert result['risk_level'] == 'medium' + assert result['status'] == 'pending' + + def test_risk_level_ordering(self): + """Test risk level values.""" + from 
remediation.base import RemediationRisk + + assert RemediationRisk.LOW.value == 'low' + assert RemediationRisk.MEDIUM.value == 'medium' + assert RemediationRisk.HIGH.value == 'high' + assert RemediationRisk.CRITICAL.value == 'critical' + + +class TestWebAPIModels: + """Test web API Pydantic models.""" + + def test_analysis_request_defaults(self): + """Test AnalysisRequest default values.""" + from web.app import AnalysisRequest + + request = AnalysisRequest() + assert request.region is None + assert request.resource_types is None + assert request.output_format == 'html' + assert request.multi_region is False + + def test_analysis_request_with_values(self): + """Test AnalysisRequest with custom values.""" + from web.app import AnalysisRequest + + request = AnalysisRequest( + region='us-west-2', + resource_types=['ec2', 'rds'], + output_format='json', + multi_region=True + ) + assert request.region == 'us-west-2' + assert request.resource_types == ['ec2', 'rds'] + assert request.output_format == 'json' + assert request.multi_region is True + + def test_analysis_status_model(self): + """Test AnalysisStatus model.""" + from web.app import AnalysisStatus + + status = AnalysisStatus( + job_id='abc123', + status='running', + started_at=datetime.utcnow(), + completed_at=None, + progress=50, + message='Analyzing...', + report_url=None, + error=None + ) + assert status.job_id == 'abc123' + assert status.status == 'running' + assert status.progress == 50 + + +class TestReporterGrouping: + """Test tag-based grouping in reporters.""" + + def test_get_tag_group_found(self): + """Test finding a tag group.""" + from core.reporter import BaseReporter + from config import ReportConfig + + config = ReportConfig() + reporter = type('TestReporter', (BaseReporter,), { + 'generate': lambda self, r, s: '' + })(config) + + resource = {'tags': [{'Key': 'Team', 'Value': 'Platform'}]} + group = reporter._get_tag_group(resource, ['Team', 'Project']) + assert group == 'Platform' + + def 
test_get_tag_group_untagged(self): + """Test fallback to Untagged.""" + from core.reporter import BaseReporter + from config import ReportConfig + + config = ReportConfig() + reporter = type('TestReporter', (BaseReporter,), { + 'generate': lambda self, r, s: '' + })(config) + + resource = {'tags': []} + group = reporter._get_tag_group(resource, ['Team', 'Project']) + assert group == 'Untagged' + + +class TestEC2Remediation: + """Test EC2 remediation actions.""" + + def test_get_available_actions(self): + """Test listing available EC2 remediation actions.""" + from remediation.ec2_remediation import EC2Remediation + from unittest.mock import MagicMock + + mock_config = MagicMock() + mock_config.create_client = MagicMock(return_value=MagicMock()) + + remediation = EC2Remediation(mock_config, dry_run=True) + actions = remediation.get_available_actions() + + action_types = [a['action_type'] for a in actions] + assert 'add_tags' in action_types + assert 'create_snapshot' in action_types + assert 'stop_instance' in action_types + assert 'terminate_instance' in action_types + + def test_can_auto_approve_low_risk(self): + """Test auto-approval for low risk actions.""" + from remediation.ec2_remediation import EC2Remediation + from remediation.base import RemediationRisk, RemediationAction + from unittest.mock import MagicMock + + mock_config = MagicMock() + mock_config.create_client = MagicMock(return_value=MagicMock()) + + remediation = EC2Remediation(mock_config, dry_run=True) + + action = RemediationAction( + action_id='test', + resource_type='ec2_instance', + resource_id='i-12345', + action_type='add_tags', + description='Test', + risk_level=RemediationRisk.LOW + ) + + assert remediation.can_auto_approve(action) is True + + def test_cannot_auto_approve_high_risk(self): + """Test that high risk actions cannot be auto-approved.""" + from remediation.ec2_remediation import EC2Remediation + from remediation.base import RemediationRisk, RemediationAction + from 
unittest.mock import MagicMock + + mock_config = MagicMock() + mock_config.create_client = MagicMock(return_value=MagicMock()) + + remediation = EC2Remediation(mock_config, dry_run=True) + + action = RemediationAction( + action_id='test', + resource_type='ec2_instance', + resource_id='i-12345', + action_type='terminate_instance', + description='Test', + risk_level=RemediationRisk.HIGH + ) + + assert remediation.can_auto_approve(action) is False + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/web/app.py b/web/app.py new file mode 100644 index 0000000..3d21422 --- /dev/null +++ b/web/app.py @@ -0,0 +1,454 @@ +""" +Dedo-Duro Web Interface. + +A FastAPI application for running AWS resource analysis via web interface +and REST API. + +Security Note: This app uses asyncio.create_subprocess_exec() which passes +arguments as a list, avoiding shell injection vulnerabilities. +""" + +import os +import logging +import asyncio +from datetime import datetime +from typing import Dict, List, Any, Optional +from uuid import uuid4 +from pathlib import Path + +from fastapi import FastAPI, HTTPException, BackgroundTasks, Request +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates +from fastapi.responses import HTMLResponse, FileResponse, JSONResponse +from pydantic import BaseModel, Field + +# Setup logging +logging.basicConfig(level=logging.INFO) +log = logging.getLogger(__name__) + +# Get the web directory path +WEB_DIR = Path(__file__).parent +TEMPLATES_DIR = WEB_DIR / "templates" +STATIC_DIR = WEB_DIR / "static" +REPORTS_DIR = WEB_DIR.parent / "reports" + +# Ensure directories exist +REPORTS_DIR.mkdir(exist_ok=True) +STATIC_DIR.mkdir(exist_ok=True) +TEMPLATES_DIR.mkdir(exist_ok=True) + +# Create FastAPI app +app = FastAPI( + title="Dedo-Duro API", + description="AWS Resource Utilization Analyzer - Web Interface", + version="1.0.0", + docs_url="/api/docs", + redoc_url="/api/redoc" +) + +# Mount static files +if 
STATIC_DIR.exists(): + app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static") + +# Setup templates +templates = Jinja2Templates(directory=str(TEMPLATES_DIR)) + +# In-memory storage for analysis jobs +analysis_jobs: Dict[str, Dict[str, Any]] = {} + + +# Pydantic models for request/response +class AnalysisRequest(BaseModel): + """Request model for starting an analysis.""" + region: Optional[str] = Field(None, description="AWS region to analyze") + resource_types: Optional[List[str]] = Field(None, description="List of analyzer keys") + output_format: str = Field("html", description="Output format (html, json, csv)") + multi_region: bool = Field(False, description="Analyze all regions") + environment_filter: Optional[str] = Field(None, description="Filter by environment tag") + profile: Optional[str] = Field(None, description="AWS profile to use") + + +class AnalysisStatus(BaseModel): + """Status of an analysis job.""" + job_id: str + status: str # pending, running, completed, failed + started_at: Optional[datetime] + completed_at: Optional[datetime] + progress: int # 0-100 + message: Optional[str] + report_url: Optional[str] + error: Optional[str] + + +class AnalysisResponse(BaseModel): + """Response after starting an analysis.""" + job_id: str + message: str + status_url: str + + +class ReportListItem(BaseModel): + """A report in the list.""" + filename: str + created_at: datetime + size_bytes: int + format: str + download_url: str + + +async def run_analysis_task(job_id: str, request: AnalysisRequest): + """ + Background task to run the analysis. + + Uses create_subprocess_exec with argument list (not shell) for security. + """ + import sys + + job = analysis_jobs.get(job_id) + if not job: + return + + job['status'] = 'running' + job['started_at'] = datetime.utcnow() + job['progress'] = 10 + job['message'] = 'Starting analysis...' 
+ + try: + # Build command as list (safe from injection) + cmd = [sys.executable, 'main.py'] + + if request.region: + cmd.extend(['--region', request.region]) + + if request.resource_types: + cmd.extend(['--resource-types', ','.join(request.resource_types)]) + + cmd.extend(['--output-format', request.output_format]) + + # Generate unique output filename + timestamp = datetime.utcnow().strftime('%Y%m%d_%H%M%S') + output_file = REPORTS_DIR / f"analysis_{job_id}_{timestamp}.{request.output_format}" + cmd.extend(['--output-file', str(output_file)]) + + if request.multi_region: + cmd.append('--multi-region') + + if request.environment_filter: + cmd.extend(['--environment', request.environment_filter]) + + if request.profile: + cmd.extend(['--profile', request.profile]) + + job['progress'] = 20 + job['message'] = 'Running analyzers...' + + # Run the analysis using create_subprocess_exec (arguments as list, no shell) + # This is safe from command injection as arguments are passed directly + process = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + cwd=str(WEB_DIR.parent) + ) + + # Wait for completion with timeout + try: + stdout, stderr = await asyncio.wait_for( + process.communicate(), + timeout=1800 # 30 minute timeout + ) + except asyncio.TimeoutError: + process.kill() + raise + + # Update progress during execution + job['progress'] = 90 + + if process.returncode == 0: + job['status'] = 'completed' + job['progress'] = 100 + job['message'] = 'Analysis completed successfully' + job['completed_at'] = datetime.utcnow() + + if output_file.exists(): + job['report_url'] = f"/api/reports/{output_file.name}" + else: + # Try to find the default output file + default_file = WEB_DIR.parent / f"aws-optimization-report.{request.output_format}" + if default_file.exists(): + # Move to reports directory + import shutil + final_path = REPORTS_DIR / f"analysis_{job_id}_{timestamp}.{request.output_format}" + 
shutil.move(str(default_file), str(final_path)) + job['report_url'] = f"/api/reports/{final_path.name}" + else: + job['status'] = 'failed' + job['error'] = stderr.decode() if stderr else 'Analysis failed' + job['message'] = 'Analysis failed' + + except asyncio.TimeoutError: + job['status'] = 'failed' + job['error'] = 'Analysis timed out after 30 minutes' + job['message'] = 'Analysis timed out' + except Exception as e: + job['status'] = 'failed' + job['error'] = str(e) + job['message'] = f'Error: {str(e)}' + log.exception(f"Analysis job {job_id} failed") + + +# API Routes +@app.get("/", response_class=HTMLResponse) +async def home(request: Request): + """Render the home page.""" + template_path = TEMPLATES_DIR / "index.html" + if not template_path.exists(): + # Return a basic HTML page if template doesn't exist + return HTMLResponse(content=""" + + + + Dedo-Duro - AWS Resource Analyzer + + + +
+

Dedo-Duro

+

AWS Resource Utilization Analyzer

+
+

API Documentation:

+ +
+

Quick Start

+

Start an analysis via the API:

+
POST /api/analysis
+{
+  "region": "us-east-1",
+  "output_format": "html"
+}
+

Or use the command line:

+
python main.py --region us-east-1 --output-format html
+
+ + + """) + return templates.TemplateResponse("index.html", {"request": request}) + + +@app.post("/api/analysis", response_model=AnalysisResponse) +async def start_analysis(request: AnalysisRequest, background_tasks: BackgroundTasks): + """Start a new analysis job.""" + job_id = str(uuid4())[:8] + + # Validate inputs to prevent any injection attempts + if request.region and not request.region.replace('-', '').replace('_', '').isalnum(): + raise HTTPException(status_code=400, detail="Invalid region format") + + if request.environment_filter and not request.environment_filter.isalpha(): + raise HTTPException(status_code=400, detail="Invalid environment filter") + + if request.output_format not in ['html', 'json', 'csv']: + raise HTTPException(status_code=400, detail="Invalid output format") + + if request.resource_types: + for rt in request.resource_types: + if not rt.replace('_', '').isalnum(): + raise HTTPException(status_code=400, detail=f"Invalid resource type: {rt}") + + if request.profile and not request.profile.replace('-', '').replace('_', '').isalnum(): + raise HTTPException(status_code=400, detail="Invalid profile name") + + # Create job entry + analysis_jobs[job_id] = { + 'job_id': job_id, + 'status': 'pending', + 'started_at': None, + 'completed_at': None, + 'progress': 0, + 'message': 'Job queued', + 'report_url': None, + 'error': None, + 'request': request.model_dump() + } + + # Start background task + background_tasks.add_task(run_analysis_task, job_id, request) + + return AnalysisResponse( + job_id=job_id, + message="Analysis started", + status_url=f"/api/analysis/{job_id}/status" + ) + + +@app.get("/api/analysis/{job_id}/status", response_model=AnalysisStatus) +async def get_analysis_status(job_id: str): + """Get the status of an analysis job.""" + # Validate job_id format + if not job_id.isalnum() or len(job_id) > 16: + raise HTTPException(status_code=400, detail="Invalid job ID format") + + job = analysis_jobs.get(job_id) + if not job: + raise 
HTTPException(status_code=404, detail="Job not found") + + return AnalysisStatus( + job_id=job['job_id'], + status=job['status'], + started_at=job['started_at'], + completed_at=job['completed_at'], + progress=job['progress'], + message=job['message'], + report_url=job['report_url'], + error=job['error'] + ) + + +@app.get("/api/analysis", response_model=List[AnalysisStatus]) +async def list_analyses(): + """List all analysis jobs.""" + return [ + AnalysisStatus( + job_id=job['job_id'], + status=job['status'], + started_at=job['started_at'], + completed_at=job['completed_at'], + progress=job['progress'], + message=job['message'], + report_url=job['report_url'], + error=job['error'] + ) + for job in analysis_jobs.values() + ] + + +@app.get("/api/reports", response_model=List[ReportListItem]) +async def list_reports(): + """List all available reports.""" + reports = [] + + if REPORTS_DIR.exists(): + for file in REPORTS_DIR.iterdir(): + if file.is_file() and file.suffix in ['.html', '.json', '.csv']: + stat = file.stat() + reports.append(ReportListItem( + filename=file.name, + created_at=datetime.fromtimestamp(stat.st_mtime), + size_bytes=stat.st_size, + format=file.suffix[1:], + download_url=f"/api/reports/{file.name}" + )) + + # Sort by creation time, newest first + reports.sort(key=lambda x: x.created_at, reverse=True) + return reports + + +@app.get("/api/reports/{filename}") +async def download_report(filename: str): + """Download a specific report.""" + # Validate filename to prevent path traversal + if '..' 
in filename or '/' in filename or '\\' in filename: + raise HTTPException(status_code=400, detail="Invalid filename") + + # Additional validation: only allow alphanumeric, underscore, hyphen, and dot + import re + if not re.match(r'^[a-zA-Z0-9_\-\.]+$', filename): + raise HTTPException(status_code=400, detail="Invalid filename characters") + + file_path = REPORTS_DIR / filename + if not file_path.exists(): + raise HTTPException(status_code=404, detail="Report not found") + + # Ensure the file is within REPORTS_DIR (defense in depth) + try: + file_path.resolve().relative_to(REPORTS_DIR.resolve()) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid file path") + + # Determine content type + content_types = { + '.html': 'text/html', + '.json': 'application/json', + '.csv': 'text/csv' + } + content_type = content_types.get(file_path.suffix, 'application/octet-stream') + + return FileResponse( + path=str(file_path), + media_type=content_type, + filename=filename + ) + + +@app.delete("/api/reports/{filename}") +async def delete_report(filename: str): + """Delete a specific report.""" + # Validate filename + if '..' 
in filename or '/' in filename or '\\' in filename: + raise HTTPException(status_code=400, detail="Invalid filename") + + import re + if not re.match(r'^[a-zA-Z0-9_\-\.]+$', filename): + raise HTTPException(status_code=400, detail="Invalid filename characters") + + file_path = REPORTS_DIR / filename + if not file_path.exists(): + raise HTTPException(status_code=404, detail="Report not found") + + # Ensure the file is within REPORTS_DIR + try: + file_path.resolve().relative_to(REPORTS_DIR.resolve()) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid file path") + + file_path.unlink() + return {"message": f"Report {filename} deleted"} + + +@app.get("/api/analyzers") +async def list_analyzers(): + """List available analyzers.""" + return { + "analyzers": [ + {"key": "ec2", "desc": "EC2 instance analysis"}, + {"key": "rds", "desc": "RDS database analysis"}, + {"key": "s3", "desc": "S3 bucket analysis"}, + {"key": "ebs", "desc": "EBS volume analysis"}, + {"key": "elb", "desc": "Load balancer analysis"}, + {"key": "lambda", "desc": "Lambda function analysis"}, + {"key": "dynamodb", "desc": "DynamoDB table analysis"}, + {"key": "elasticache", "desc": "ElastiCache analysis"}, + {"key": "eks_sessions", "desc": "EKS session monitoring"}, + {"key": "eks_deployments", "desc": "EKS deployment analysis"}, + {"key": "cost_explorer", "desc": "Cost Explorer analysis"}, + {"key": "rto_analysis", "desc": "DR/RTO analysis"}, + ] + } + + +@app.get("/api/health") +async def health_check(): + """Health check endpoint.""" + return { + "status": "healthy", + "version": "1.0.0", + "active_jobs": len([j for j in analysis_jobs.values() if j['status'] == 'running']) + } + + +# Run with: uvicorn web.app:app --reload --host 0.0.0.0 --port 8000 +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000)