diff --git a/tools/azure/.gitignore b/tools/azure/.gitignore
new file mode 100644
index 00000000000..c98637f1475
--- /dev/null
+++ b/tools/azure/.gitignore
@@ -0,0 +1,39 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+venv/
+ENV/
+env/
+.venv
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Output files
+*.csv
+*.txt
+*.log
diff --git a/tools/azure/README.md b/tools/azure/README.md
new file mode 100644
index 00000000000..9be1e650eb4
--- /dev/null
+++ b/tools/azure/README.md
@@ -0,0 +1,303 @@
+# Azure Log Analytics Tools
+
+This directory contains Python tools for querying and analyzing Azure Log Analytics data for HPCC Systems deployments on Azure Kubernetes Service (AKS).
+
+## Overview
+
+These tools help analyze Kubernetes pod and node inventory data from Azure Log Analytics to understand resource usage, identify which HPCC components were running, and support cost analysis and optimization.
+
+### Tools
+
+1. **azure_log_analytics_fetch.py** - Queries Azure Log Analytics using KQL
+2. **azure_log_analytics_analyze.py** - Analyzes the fetched data and categorizes components
+
+## Prerequisites
+
+- Python 3.7 or higher
+- Azure subscription with Log Analytics workspace
+- Appropriate Azure credentials (see Authentication section below)
+
+## Installation
+
+Install required Python packages:
+
+```bash
+pip install -r requirements.txt
+```
+
+Or install packages individually:
+
+```bash
+pip install azure-identity azure-monitor-query
+```
+
+## Authentication
+
+The tools support two authentication methods:
+
+### 1. Default Azure Credentials (Recommended)
+
+Uses Azure CLI, managed identity, or environment variables. Set up Azure CLI:
+
+```bash
+az login
+az account set --subscription <subscription-id>
+```
+
+### 2. Service Principal
+
+Use client credentials by providing tenant ID, client ID, and client secret:
+
+```bash
+# Set environment variables
+export AZURE_TENANT_ID="your-tenant-id"
+export AZURE_CLIENT_ID="your-client-id"
+export AZURE_CLIENT_SECRET="your-client-secret"
+```
+
+## Usage
+
+### Step 1: Fetch Data from Azure Log Analytics
+
+Query Azure Log Analytics to fetch Kubernetes node and pod inventory data:
+
+```bash
+./azure_log_analytics_fetch.py \
+    --subscription-id <subscription-id> \
+    --workspace-id <workspace-id> \
+    --aks-name <aks-cluster-name> \
+    --start "2024-01-01T00:00:00Z" \
+    --end "2024-01-01T23:59:59Z" \
+    --output results.csv \
+    --verbose
+```
+
+#### Optional Parameters
+
+**Filter by specific namespaces:**
+```bash
+--namespaces "default,hpcc,monitoring"
+```
+
+**Include resource group in metadata:**
+```bash
+--resource-group <resource-group>
+```
+
+**Use service principal authentication:**
+```bash
+--tenant-id <tenant-id> \
+--client-id <client-id> \
+--client-secret <client-secret>
+```
+
+#### Output Format
+
+The output CSV file contains:
+- Metadata header with query parameters (as comments)
+- Combined node and pod inventory data
+- RecordType field to distinguish between Node and Pod records
+
+### Step 2: Analyze the Data
+
+Analyze the fetched CSV data to categorize pods into HPCC components:
+
+```bash
+./azure_log_analytics_analyze.py \
+    --input results.csv \
+    --output analysis.csv \
+    --summary summary.txt \
+    --verbose
+```
+
+#### Output Files
+
+1. **analysis.csv** - Time-series data with categorized components
+   - Component categorization (HPCC vs system pods)
+   - Node usage information
+   - Suitable for visualization (e.g., Gantt charts)
+
+2. 
**summary.txt** (optional) - Human-readable summary report + - Statistics on pod distribution + - Category and component breakdowns + - Node usage analysis + - Namespace distribution + +## HPCC Component Categories + +The analysis tool recognizes the following HPCC components based on the helm/hpcc chart: + +- **dali** - Distributed Array Logical Index +- **thor** - Thor cluster (manager, workers, agents) +- **roxie** - Roxie cluster (server, agent, toposerver) +- **esp** - Enterprise Services Platform (eclwatch, eclservices, eclqueries, esdl-sandbox) +- **eclagent** - ECL Agent +- **eclccserver** - ECL CC Server +- **eclscheduler** - ECL Scheduler +- **dfuserver** - Distributed File Utility Server +- **sasha** - Storage Archive Service +- **dafilesrv** - Dali File Server + +Non-HPCC system components are also categorized: +- kubernetes-system (kube-*) +- monitoring (prometheus, grafana) +- logging (fluentd, elasticsearch) +- ingress (nginx, traefik) +- azure-system (aks-*, omsagent, azure-disk, azure-file) +- And others... + +## Example Workflow + +Complete example for analyzing a 24-hour period: + +```bash +# Step 1: Fetch data +./azure_log_analytics_fetch.py \ + --subscription-id "12345678-1234-1234-1234-123456789012" \ + --workspace-id "abcd1234-5678-90ab-cdef-1234567890ab" \ + --aks-name "my-hpcc-cluster" \ + --start "2024-11-01T00:00:00Z" \ + --end "2024-11-02T00:00:00Z" \ + --output hpcc_inventory_nov1.csv \ + --verbose + +# Step 2: Analyze data +./azure_log_analytics_analyze.py \ + --input hpcc_inventory_nov1.csv \ + --output hpcc_analysis_nov1.csv \ + --summary hpcc_summary_nov1.txt \ + --verbose + +# Step 3: Review the results +cat hpcc_summary_nov1.txt +``` + +## Output Analysis and Visualization + +The analysis output is designed to support: + +### 1. Cost Analysis +- Identify which components were running on which nodes +- Correlate with Azure VM costs to determine component-level expenses +- Understand which components kept VMs active + +### 2. Time-Series Visualization +The `analysis.csv` output contains time-series data with: +- TimeGenerated: Timestamp of the record +- Component: HPCC component name or category +- NodeName: Node where the pod was running +- PodStatus: Status of the pod (Running, Pending, etc.) + +This data can be imported into visualization tools to create: +- **Gantt charts** showing component lifecycles +- **Resource usage timelines** by component +- **Node utilization charts** showing which nodes were active + +### 3. Visualization Examples + +#### Using Python (Pandas + Matplotlib) +```python +import pandas as pd +import matplotlib.pyplot as plt + +# Load the analysis +df = pd.read_csv('hpcc_analysis_nov1.csv', comment='#') +df['TimeGenerated'] = pd.to_datetime(df['TimeGenerated']) + +# Filter for HPCC components +hpcc_df = df[df['IsHPCC'] == 'Yes'] + +# Group by component and time +component_timeline = hpcc_df.groupby(['Component', 'TimeGenerated']).size() + +# Create visualization +# ... (customize based on your needs) +``` + +#### Using Excel/Google Sheets +1. Import the CSV file +2. Filter by IsHPCC='Yes' for HPCC components +3. Create pivot tables for component analysis +4. 
Generate charts for timeline visualization
+
+## Troubleshooting
+
+### Authentication Issues
+
+If you encounter authentication errors:
+
+```bash
+# Verify Azure CLI login
+az account show
+
+# Re-login if needed
+az login
+
+# Verify workspace access
+az monitor log-analytics workspace show \
+    --workspace-name <workspace-name> \
+    --resource-group <resource-group>
+```
+
+### No Data Returned
+
+If the query returns no results:
+
+1. Verify the date range is correct (use ISO 8601 format with 'Z' for UTC)
+2. Check that the AKS cluster name matches exactly
+3. Verify the cluster is sending data to the Log Analytics workspace
+4. Ensure KubeNodeInventory and KubePodInventory tables exist in the workspace
+
+### Large Result Sets
+
+For large time ranges or busy clusters:
+
+1. Query smaller time windows (e.g., 1 day at a time)
+2. Use namespace filtering to reduce data volume
+3. Work within Azure Log Analytics query limits and paginate results where necessary
+
+## Integration with Cost Analysis
+
+To integrate with Azure cost analysis:
+
+1. Export Azure cost data for the same time period
+2. Match VM resources to node names from the analysis
+3. Correlate component usage with VM costs
+4. Calculate per-component cost allocation
+
+Example workflow:
+```bash
+# Get VM costs from Azure
+az consumption usage list \
+    --start-date "2024-11-01" \
+    --end-date "2024-11-02" \
+    --query "[?contains(instanceName, 'aks')]" > vm_costs.json
+
+# Correlate with component analysis
+# ... (custom script to join data)
+```
+
+## Contributing
+
+When adding new HPCC components or improving categorization:
+
+1. Update the component patterns in `azure_log_analytics_analyze.py`
+2. Refer to `helm/hpcc/templates/` for component naming conventions
+3. Test with sample data to ensure correct categorization
+4. Update this README with any new features
+
+## License
+
+Copyright (C) 2024 HPCC Systems®
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/tools/azure/azure_log_analytics_analyze.py b/tools/azure/azure_log_analytics_analyze.py
new file mode 100755
index 00000000000..713b49199fb
--- /dev/null
+++ b/tools/azure/azure_log_analytics_analyze.py
@@ -0,0 +1,501 @@
+#!/usr/bin/env python3
+################################################################################
+# HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################ + +""" +Azure Log Analytics Analyze Tool + +This tool analyzes the CSV output from azure_log_analytics_fetch.py and categorizes +pods into HPCC components based on their naming patterns. It creates a time-series +analysis showing which components were running on which nodes over time, enabling +visualization as Gantt charts and cost analysis. + +HPCC Component Categories (derived from helm/hpcc chart): +- dali: Distributed Array Logical Index +- thor: Thor cluster (manager, workers) +- roxie: Roxie cluster (server, agent) +- esp: Enterprise Services Platform (eclwatch, eclservices, etc.) +- eclagent: ECL Agent +- eclccserver: ECL CC Server +- eclscheduler: ECL Scheduler +- dfuserver: Distributed File Utility Server +- sasha: Storage Archive Service +- dafilesrv: Dali File Server +""" + +import argparse +import sys +import csv +from datetime import datetime, timezone +from typing import List, Dict, Set, Optional +from collections import defaultdict +import re + + +# HPCC component patterns based on helm chart naming conventions +HPCC_COMPONENT_PATTERNS = { + 'dali': [r'.*-dali(-.*)?$', r'^dali(-.*)?$'], + 'thor': [r'.*-thor(-.*)?$', r'^thor(-.*)?$', r'.*-thoragent(-.*)?$', r'.*-thormanager(-.*)?$', r'.*-thorworker(-.*)?$'], + 'roxie': [r'.*-roxie(-.*)?$', r'^roxie(-.*)?$', r'.*-roxieagent(-.*)?$', r'.*-roxieserver(-.*)?$', r'.*-toposerver(-.*)?$'], + 'esp': [r'.*-esp(-.*)?$', r'^esp(-.*)?$', r'.*-eclwatch(-.*)?$', r'.*-eclservices(-.*)?$', r'.*-eclqueries(-.*)?$', r'.*-esdl-sandbox(-.*)?$'], + 'eclagent': [r'.*-eclagent(-.*)?$', r'^eclagent(-.*)?$'], + 'eclccserver': [r'.*-eclccserver(-.*)?$', r'^eclccserver(-.*)?$'], + 'eclscheduler': [r'.*-eclscheduler(-.*)?$', r'^eclscheduler(-.*)?$'], + 'dfuserver': [r'.*-dfuserver(-.*)?$', r'^dfuserver(-.*)?$'], + 'sasha': [r'.*-sasha(-.*)?$', r'^sasha(-.*)?$'], + 'dafilesrv': [r'.*-dafilesrv(-.*)?$', r'^dafilesrv(-.*)?$'], +} + +# Common non-HPCC system pods +SYSTEM_COMPONENT_PATTERNS = { + 'azure-system': [r'^aks-.*', r'.*-azuremonitor(-.*)?$', r'^omsagent(-.*)?$', r'.*-omsagent(-.*)?$', r'.*-azuredisk(-.*)?$', r'.*-azurefile(-.*)?$'], + 'monitoring': [r'^prometheus(-.*)?$', r'.*-prometheus(-.*)?$', r'^grafana(-.*)?$', r'.*-grafana(-.*)?$', r'^alertmanager(-.*)?$', r'.*-alertmanager(-.*)?$'], + 'logging': [r'^fluentd(-.*)?$', r'.*-fluentd(-.*)?$', r'^fluent-bit(-.*)?$', r'.*-fluent-bit(-.*)?$', r'^elasticsearch(-.*)?$', r'.*-elasticsearch(-.*)?$', r'^kibana(-.*)?$', r'.*-kibana(-.*)?$', r'^logstash(-.*)?$', r'.*-logstash(-.*)?$'], + 'ingress': [r'.*-ingress(-.*)?$', r'.*-nginx(-.*)?$', r'.*-traefik(-.*)?$'], + 'storage': [r'.*-csi(-.*)?$', r'.*-nfs(-.*)?$'], + 'networking': [r'.*-calico(-.*)?$', r'.*-flannel(-.*)?$', r'.*-weave(-.*)?$', r'.*-cilium(-.*)?$'], + 'security': [r'.*-vault(-.*)?$', r'.*-cert-manager(-.*)?$'], + 'kubernetes-system': [r'^kube-.*', r'.*-kube-.*', r'^coredns(-.*)?$'], +} + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Analyze Azure Log Analytics CSV output and categorize HPCC components', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic analysis + %(prog)s --input results.csv --output analysis.csv + + # Detailed analysis with verbose output + %(prog)s --input results.csv --output analysis.csv --verbose + + # Generate summary report + %(prog)s --input results.csv --output analysis.csv --summary summary.txt + """ + ) + + 
parser.add_argument('--input', '-i', required=True, + help='Input CSV file from azure_log_analytics_fetch.py') + parser.add_argument('--output', '-o', required=True, + help='Output CSV file with categorized component analysis') + parser.add_argument('--summary', '-s', + help='Optional summary report file (text format)') + parser.add_argument('--verbose', '-v', action='store_true', + help='Enable verbose output') + + return parser.parse_args() + + +def categorize_pod(pod_name: str, namespace: str, controller_name: str) -> tuple: + """ + Categorize a pod into HPCC component or system category. + + Args: + pod_name: Name of the pod + namespace: Kubernetes namespace + controller_name: Controller name (deployment, statefulset, etc.) + + Returns: + Tuple of (category, component_type, is_hpcc) + """ + # Normalize names for pattern matching + search_name = pod_name.lower() + search_controller = (controller_name or "").lower() + + # Check HPCC components first + for component, patterns in HPCC_COMPONENT_PATTERNS.items(): + for pattern in patterns: + if re.match(pattern, search_name) or re.match(pattern, search_controller): + return ('hpcc', component, True) + + # Check system components + for category, patterns in SYSTEM_COMPONENT_PATTERNS.items(): + for pattern in patterns: + if re.match(pattern, search_name): + return (category, 'system', False) + + # Check namespace-based categorization for system pods + if namespace in ['kube-system', 'kube-public', 'kube-node-lease']: + return ('kubernetes-system', 'system', False) + + # Uncategorized + return ('unknown', 'unknown', False) + + +def parse_csv_input(input_path: str, verbose: bool = False) -> tuple: + """ + Parse the input CSV file and extract metadata and records. + + Args: + input_path: Path to input CSV file + verbose: Enable verbose output + + Returns: + Tuple of (metadata dict, list of pod records, list of node records) + """ + metadata = {} + pod_records = [] + node_records = [] + + with open(input_path, 'r', encoding='utf-8') as csvfile: + # Read all lines first + lines = csvfile.readlines() + + # Parse metadata from comment lines + data_start_idx = 0 + for i, line in enumerate(lines): + if line.startswith('#'): + # Parse metadata + if ':' in line: + key_value = line[1:].strip().split(':', 1) + if len(key_value) == 2: + key = key_value[0].strip().lower().replace(' ', '_') + value = key_value[1].strip() + metadata[key] = value + else: + data_start_idx = i + break + + # Parse CSV data + if data_start_idx < len(lines): + csv_lines = lines[data_start_idx:] + reader = csv.DictReader(csv_lines) + for row in reader: + record_type = row.get('RecordType', '') + if record_type == 'Pod': + pod_records.append(row) + elif record_type == 'Node': + node_records.append(row) + + if verbose: + print(f"Parsed metadata: {len(metadata)} items") + print(f"Found {len(pod_records)} pod records") + print(f"Found {len(node_records)} node records") + + return metadata, pod_records, node_records + + +def analyze_pods(pod_records: List[Dict], verbose: bool = False) -> List[Dict]: + """ + Analyze pod records and categorize them. 
+ + Args: + pod_records: List of pod record dictionaries + verbose: Enable verbose output + + Returns: + List of analyzed pod records with categorization + """ + analyzed_records = [] + category_counts = defaultdict(int) + component_counts = defaultdict(int) + + for record in pod_records: + pod_name = record.get('Name', '') + namespace = record.get('Namespace', '') + controller_name = record.get('ControllerName', '') + + category, component, is_hpcc = categorize_pod(pod_name, namespace, controller_name) + + analyzed_record = record.copy() + analyzed_record['Category'] = category + analyzed_record['Component'] = component + analyzed_record['IsHPCC'] = 'Yes' if is_hpcc else 'No' + + analyzed_records.append(analyzed_record) + + category_counts[category] += 1 + if is_hpcc: + component_counts[component] += 1 + + if verbose: + print("\nCategorization Summary:") + print(f" Total pods analyzed: {len(pod_records)}") + print(f" HPCC components found: {sum(1 for r in analyzed_records if r['IsHPCC'] == 'Yes')}") + print("\nCategory breakdown:") + for category, count in sorted(category_counts.items()): + print(f" {category}: {count}") + print("\nHPCC component breakdown:") + for component, count in sorted(component_counts.items()): + print(f" {component}: {count}") + + return analyzed_records + + +def create_time_series(analyzed_pods: List[Dict], node_records: List[Dict], + verbose: bool = False) -> List[Dict]: + """ + Create time-series data showing components and their node usage over time. + + Args: + analyzed_pods: List of analyzed pod records + node_records: List of node records + verbose: Enable verbose output + + Returns: + List of time-series records + """ + time_series = [] + + # Create a mapping of computer/node to its details + node_map = {} + for node in node_records: + computer = node.get('Computer', '') + if computer: + node_map[computer] = node + + # Process each pod record as a time-series entry + for pod in analyzed_pods: + time_generated = pod.get('TimeGenerated', '') + component = pod.get('Component', '') + category = pod.get('Category', '') + is_hpcc = pod.get('IsHPCC', 'No') + pod_name = pod.get('Name', '') + namespace = pod.get('Namespace', '') + computer = pod.get('Computer', '') + pod_status = pod.get('PodStatus', '') + controller_name = pod.get('ControllerName', '') + controller_kind = pod.get('ControllerKind', '') + + # Get node information if available + node_info = node_map.get(computer, {}) + node_status = node_info.get('Status', 'Unknown') + node_zone = node_info.get('KubernetesZone', 'Unknown') + + time_series_entry = { + 'TimeGenerated': time_generated, + 'Component': component, + 'Category': category, + 'IsHPCC': is_hpcc, + 'PodName': pod_name, + 'Namespace': namespace, + 'NodeName': computer, + 'NodeStatus': node_status, + 'NodeZone': node_zone, + 'PodStatus': pod_status, + 'ControllerName': controller_name, + 'ControllerKind': controller_kind, + } + + time_series.append(time_series_entry) + + # Sort by time + time_series.sort(key=lambda x: x['TimeGenerated']) + + if verbose: + print(f"\nCreated time-series with {len(time_series)} entries") + + return time_series + + +def write_analysis_output(time_series: List[Dict], output_path: str, + metadata: Dict, verbose: bool = False): + """ + Write analyzed time-series data to CSV file. 
+ + Args: + time_series: List of time-series records + output_path: Output file path + metadata: Original metadata from input + verbose: Enable verbose output + """ + if not time_series: + print("No data to write", file=sys.stderr) + return + + with open(output_path, 'w', newline='', encoding='utf-8') as csvfile: + # Write metadata header + csvfile.write("# Azure Log Analytics Component Analysis\n") + csvfile.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n") + csvfile.write(f"# Source AKS Cluster: {metadata.get('aks_cluster', 'N/A')}\n") + csvfile.write(f"# Source Time Range: {metadata.get('time_range', 'N/A')}\n") + csvfile.write(f"# Total Records: {len(time_series)}\n") + csvfile.write("#\n") + csvfile.write("# This file contains a time-series analysis of Kubernetes components\n") + csvfile.write("# suitable for visualization (e.g., Gantt chart) and cost analysis.\n") + csvfile.write("#\n") + + # Get columns + columns = list(time_series[0].keys()) if time_series else [] + + # Write CSV + writer = csv.DictWriter(csvfile, fieldnames=columns) + writer.writeheader() + writer.writerows(time_series) + + if verbose: + print(f"Analysis written to {output_path}") + + +def generate_summary_report(analyzed_pods: List[Dict], time_series: List[Dict], + summary_path: str, metadata: Dict, verbose: bool = False): + """ + Generate a human-readable summary report. + + Args: + analyzed_pods: List of analyzed pod records + time_series: List of time-series records + summary_path: Output file path for summary + metadata: Original metadata from input + verbose: Enable verbose output + """ + with open(summary_path, 'w', encoding='utf-8') as f: + f.write("=" * 80 + "\n") + f.write("Azure Log Analytics Component Analysis Summary\n") + f.write("=" * 80 + "\n\n") + + f.write(f"Generated: {datetime.now(timezone.utc).isoformat()}\n") + f.write(f"Source AKS Cluster: {metadata.get('aks_cluster', 'N/A')}\n") + f.write(f"Source Time Range: {metadata.get('time_range', 'N/A')}\n\n") + + # Overall statistics + f.write("-" * 80 + "\n") + f.write("Overall Statistics\n") + f.write("-" * 80 + "\n") + f.write(f"Total pod records: {len(analyzed_pods)}\n") + hpcc_pods = [p for p in analyzed_pods if p['IsHPCC'] == 'Yes'] + f.write(f"HPCC pods: {len(hpcc_pods)}\n") + f.write(f"Non-HPCC pods: {len(analyzed_pods) - len(hpcc_pods)}\n\n") + + # Category breakdown + f.write("-" * 80 + "\n") + f.write("Category Breakdown\n") + f.write("-" * 80 + "\n") + category_counts = defaultdict(int) + for pod in analyzed_pods: + category_counts[pod['Category']] += 1 + + for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True): + percentage = (count / len(analyzed_pods)) * 100 if analyzed_pods else 0 + f.write(f" {category:30s}: {count:5d} ({percentage:5.1f}%)\n") + f.write("\n") + + # HPCC component breakdown + if hpcc_pods: + f.write("-" * 80 + "\n") + f.write("HPCC Component Breakdown\n") + f.write("-" * 80 + "\n") + component_counts = defaultdict(int) + for pod in hpcc_pods: + component_counts[pod['Component']] += 1 + + for component, count in sorted(component_counts.items(), key=lambda x: x[1], reverse=True): + percentage = (count / len(hpcc_pods)) * 100 + f.write(f" {component:30s}: {count:5d} ({percentage:5.1f}%)\n") + f.write("\n") + + # Node usage + f.write("-" * 80 + "\n") + f.write("Node Usage\n") + f.write("-" * 80 + "\n") + node_counts = defaultdict(int) + for entry in time_series: + node_counts[entry['NodeName']] += 1 + + f.write(f"Unique nodes: {len(node_counts)}\n") + f.write("\nTop 10 
nodes by pod count:\n") + for node, count in sorted(node_counts.items(), key=lambda x: x[1], reverse=True)[:10]: + f.write(f" {node:50s}: {count:5d} pod records\n") + f.write("\n") + + # Namespace breakdown + f.write("-" * 80 + "\n") + f.write("Namespace Breakdown\n") + f.write("-" * 80 + "\n") + namespace_counts = defaultdict(int) + for pod in analyzed_pods: + namespace = pod.get('Namespace', 'unknown') + namespace_counts[namespace] += 1 + + for namespace, count in sorted(namespace_counts.items(), key=lambda x: x[1], reverse=True): + percentage = (count / len(analyzed_pods)) * 100 if analyzed_pods else 0 + f.write(f" {namespace:30s}: {count:5d} ({percentage:5.1f}%)\n") + f.write("\n") + + f.write("=" * 80 + "\n") + f.write("End of Report\n") + f.write("=" * 80 + "\n") + + if verbose: + print(f"Summary report written to {summary_path}") + + +def main(): + """Main entry point.""" + args = parse_args() + + try: + # Parse input CSV + if args.verbose: + print(f"Reading input from {args.input}") + + metadata, pod_records, node_records = parse_csv_input(args.input, args.verbose) + + if not pod_records: + print("Warning: No pod records found in input file", file=sys.stderr) + + # Analyze pods + if args.verbose: + print("\nAnalyzing pods...") + + analyzed_pods = analyze_pods(pod_records, args.verbose) + + # Create time-series + if args.verbose: + print("\nCreating time-series...") + + time_series = create_time_series(analyzed_pods, node_records, args.verbose) + + # Write output + if args.verbose: + print(f"\nWriting analysis to {args.output}") + + write_analysis_output(time_series, args.output, metadata, args.verbose) + + # Generate summary if requested + if args.summary: + if args.verbose: + print(f"\nGenerating summary report to {args.summary}") + + generate_summary_report(analyzed_pods, time_series, args.summary, + metadata, args.verbose) + + print(f"\nAnalysis complete!") + print(f" Analyzed {len(pod_records)} pod records") + print(f" Generated {len(time_series)} time-series entries") + print(f" Output written to {args.output}") + if args.summary: + print(f" Summary report written to {args.summary}") + + except FileNotFoundError as e: + print(f"Error: Input file not found: {e}", file=sys.stderr) + sys.exit(1) + except KeyboardInterrupt: + print("\nOperation cancelled by user", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error: {str(e)}", file=sys.stderr) + if args.verbose: + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/tools/azure/azure_log_analytics_fetch.py b/tools/azure/azure_log_analytics_fetch.py new file mode 100755 index 00000000000..3f7719a1448 --- /dev/null +++ b/tools/azure/azure_log_analytics_fetch.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +################################################################################ +# HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +""" +Azure Log Analytics Fetch Tool + +This tool queries Azure Log Analytics using KQL (Kusto Query Language) to fetch +information about Kubernetes nodes and pods from a specified AKS cluster for a +given date/time range. + +It queries KubeNodeInventory and KubePodInventory tables and outputs the results +in CSV format with metadata about the query in a comment header. +""" + +import argparse +import sys +import csv +from datetime import datetime, timezone +from typing import List, Dict, Optional +import json + +try: + from azure.identity import DefaultAzureCredential, ClientSecretCredential + from azure.monitor.query import LogsQueryClient, LogsQueryStatus +except ImportError: + print("Error: Azure SDK packages not installed.", file=sys.stderr) + print("Please install required packages: pip install azure-identity azure-monitor-query", file=sys.stderr) + sys.exit(1) + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Fetch Kubernetes node and pod inventory from Azure Log Analytics', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Fetch data for the last 24 hours + %(prog)s --subscription-id --resource-group --aks-name \\ + --workspace-id --start "2024-01-01T00:00:00Z" \\ + --end "2024-01-01T23:59:59Z" --output results.csv + + # Fetch data for specific namespaces only + %(prog)s --subscription-id --resource-group --aks-name \\ + --workspace-id --start "2024-01-01T00:00:00Z" \\ + --end "2024-01-01T23:59:59Z" --namespaces default,kube-system \\ + --output results.csv + + # Use client credentials for authentication + %(prog)s --subscription-id --workspace-id \\ + --tenant-id --client-id --client-secret \\ + --aks-name --start "2024-01-01T00:00:00Z" \\ + --end "2024-01-01T23:59:59Z" --output results.csv + """ + ) + + # Authentication arguments + auth_group = parser.add_argument_group('authentication') + auth_group.add_argument('--tenant-id', help='Azure tenant ID (optional, for service principal auth)') + auth_group.add_argument('--client-id', help='Azure client ID (optional, for service principal auth)') + auth_group.add_argument('--client-secret', help='Azure client secret (optional, for service principal auth)') + + # Required arguments + required = parser.add_argument_group('required arguments') + required.add_argument('--subscription-id', required=True, help='Azure subscription ID') + required.add_argument('--workspace-id', required=True, help='Log Analytics workspace ID') + required.add_argument('--aks-name', required=True, help='AKS cluster name') + required.add_argument('--start', required=True, help='Start datetime (ISO 8601 format, e.g., 2024-01-01T00:00:00Z)') + required.add_argument('--end', required=True, help='End datetime (ISO 8601 format, e.g., 2024-01-01T23:59:59Z)') + + # Optional arguments + parser.add_argument('--resource-group', help='Azure resource group name (optional, used in metadata)') + parser.add_argument('--namespaces', help='Comma-separated list of namespaces to filter (default: all)') + parser.add_argument('--output', '-o', default='azure_log_analytics_output.csv', + help='Output CSV file path (default: azure_log_analytics_output.csv)') + parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output') + + return parser.parse_args() + + +def create_credential(args): + """Create Azure credential based on provided arguments.""" + if args.tenant_id and 
args.client_id and args.client_secret: + if args.verbose: + print("Using ClientSecretCredential for authentication") + return ClientSecretCredential( + tenant_id=args.tenant_id, + client_id=args.client_id, + client_secret=args.client_secret + ) + else: + if args.verbose: + print("Using DefaultAzureCredential for authentication") + return DefaultAzureCredential() + + +def build_kql_query(aks_name: str, start_time: str, end_time: str, namespaces: Optional[List[str]] = None) -> str: + """ + Build the KQL query to fetch node and pod inventory data. + + Args: + aks_name: Name of the AKS cluster + start_time: Start time in ISO 8601 format + end_time: End time in ISO 8601 format + namespaces: Optional list of namespaces to filter + + Returns: + KQL query string + """ + namespace_filter = "" + if namespaces: + namespace_list = "', '".join(namespaces) + namespace_filter = f"| where Namespace in ('{namespace_list}')" + + query = f""" +let startTime = datetime({start_time}); +let endTime = datetime({end_time}); +let clusterName = "{aks_name}"; +// Get node inventory +let nodes = KubeNodeInventory +| where TimeGenerated between (startTime .. endTime) +| where ClusterName == clusterName +| project TimeGenerated, ClusterName, Computer, Status, + KubeletVersion, KubeProxyVersion, CreationTimeStamp, + Labels, KubernetesProviderID, KubernetesZone +| extend RecordType = "Node"; +// Get pod inventory +let pods = KubePodInventory +| where TimeGenerated between (startTime .. endTime) +| where ClusterName == clusterName +{namespace_filter} +| project TimeGenerated, ClusterName, Namespace, Name, PodLabel, + PodStatus, PodCreationTimeStamp, Computer, ControllerName, + ControllerKind, ContainerName, ContainerStatus, PodIp, + ServiceName, PodUid +| extend RecordType = "Pod"; +// Union results +union nodes, pods +| order by TimeGenerated asc +""" + return query + + +def query_log_analytics(credential, workspace_id: str, query: str, verbose: bool = False) -> List[Dict]: + """ + Execute KQL query against Log Analytics workspace. + + Args: + credential: Azure credential object + workspace_id: Log Analytics workspace ID + query: KQL query string + verbose: Enable verbose output + + Returns: + List of result rows as dictionaries + """ + client = LogsQueryClient(credential) + + if verbose: + print(f"Executing query against workspace {workspace_id}") + print(f"Query:\n{query}\n") + + try: + response = client.query_workspace( + workspace_id=workspace_id, + query=query, + timespan=None # timespan is specified in the query + ) + + if response.status == LogsQueryStatus.PARTIAL: + print("Warning: Partial results returned", file=sys.stderr) + if hasattr(response, 'partial_error'): + print(f"Partial error: {response.partial_error}", file=sys.stderr) + + if response.status == LogsQueryStatus.SUCCESS: + # Extract results + results = [] + for table in response.tables: + for row in table.rows: + row_dict = {} + for i, column in enumerate(table.columns): + row_dict[column] = row[i] + results.append(row_dict) + + if verbose: + print(f"Retrieved {len(results)} records") + + return results + else: + print(f"Query failed with status: {response.status}", file=sys.stderr) + return [] + + except Exception as e: + print(f"Error executing query: {str(e)}", file=sys.stderr) + raise + + +def write_csv_output(results: List[Dict], output_path: str, metadata: Dict, verbose: bool = False): + """ + Write query results to CSV file with metadata header. 
+ + Args: + results: List of result rows as dictionaries + output_path: Output CSV file path + metadata: Metadata dictionary for header comments + verbose: Enable verbose output + """ + if not results: + print("No results to write", file=sys.stderr) + return + + # Get all unique column names from results + all_columns = set() + for row in results: + all_columns.update(row.keys()) + + # Sort columns for consistent output + columns = sorted(all_columns) + + with open(output_path, 'w', newline='', encoding='utf-8') as csvfile: + # Write metadata as comments + csvfile.write("# Azure Log Analytics Query Results\n") + csvfile.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n") + csvfile.write(f"# Subscription ID: {metadata.get('subscription_id', 'N/A')}\n") + if metadata.get('resource_group'): + csvfile.write(f"# Resource Group: {metadata['resource_group']}\n") + csvfile.write(f"# AKS Cluster: {metadata.get('aks_name', 'N/A')}\n") + csvfile.write(f"# Workspace ID: {metadata.get('workspace_id', 'N/A')}\n") + csvfile.write(f"# Time Range: {metadata.get('start_time', 'N/A')} to {metadata.get('end_time', 'N/A')}\n") + if metadata.get('namespaces'): + csvfile.write(f"# Namespaces: {metadata['namespaces']}\n") + else: + csvfile.write("# Namespaces: All\n") + csvfile.write(f"# Total Records: {len(results)}\n") + csvfile.write("#\n") + + # Write CSV data + writer = csv.DictWriter(csvfile, fieldnames=columns, extrasaction='ignore') + writer.writeheader() + writer.writerows(results) + + if verbose: + print(f"Results written to {output_path}") + + +def main(): + """Main entry point.""" + args = parse_args() + + # Validate datetime formats + try: + datetime.fromisoformat(args.start.replace('Z', '+00:00')) + datetime.fromisoformat(args.end.replace('Z', '+00:00')) + except ValueError as e: + print(f"Error: Invalid datetime format: {e}", file=sys.stderr) + print("Please use ISO 8601 format (e.g., 2024-01-01T00:00:00Z)", file=sys.stderr) + sys.exit(1) + + # Parse namespaces if provided + namespaces = None + if args.namespaces: + namespaces = [ns.strip() for ns in args.namespaces.split(',')] + + try: + # Create credential + credential = create_credential(args) + + # Build query + query = build_kql_query(args.aks_name, args.start, args.end, namespaces) + + # Execute query + results = query_log_analytics(credential, args.workspace_id, query, args.verbose) + + if not results: + print("No results found for the specified query", file=sys.stderr) + sys.exit(1) + + # Prepare metadata + metadata = { + 'subscription_id': args.subscription_id, + 'resource_group': args.resource_group, + 'aks_name': args.aks_name, + 'workspace_id': args.workspace_id, + 'start_time': args.start, + 'end_time': args.end, + 'namespaces': args.namespaces + } + + # Write output + write_csv_output(results, args.output, metadata, args.verbose) + + print(f"Successfully fetched {len(results)} records and saved to {args.output}") + + except KeyboardInterrupt: + print("\nOperation cancelled by user", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error: {str(e)}", file=sys.stderr) + if args.verbose: + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/tools/azure/requirements.txt b/tools/azure/requirements.txt new file mode 100644 index 00000000000..018515eba5a --- /dev/null +++ b/tools/azure/requirements.txt @@ -0,0 +1,9 @@ +# Azure Log Analytics Tools - Python Dependencies +# +# Install with: pip install -r requirements.txt + +# Azure SDK for authentication 
+azure-identity>=1.14.0 + +# Azure Monitor Query for Log Analytics +azure-monitor-query>=1.2.0 diff --git a/tools/azure/test_categorization.py b/tools/azure/test_categorization.py new file mode 100755 index 00000000000..eea44a259ae --- /dev/null +++ b/tools/azure/test_categorization.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +################################################################################ +# HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +""" +Test script for categorization logic in azure_log_analytics_analyze.py +""" + +import sys +import re + +# Import the patterns from the analyze script +sys.path.insert(0, '.') +from azure_log_analytics_analyze import categorize_pod, HPCC_COMPONENT_PATTERNS, SYSTEM_COMPONENT_PATTERNS + + +def test_hpcc_components(): + """Test HPCC component categorization.""" + print("Testing HPCC Component Categorization") + print("=" * 80) + + test_cases = [ + # (pod_name, namespace, controller_name, expected_category, expected_component) + ('myhpcc-dali-0', 'default', 'myhpcc-dali', 'hpcc', 'dali'), + ('cluster-dali-0', 'default', 'cluster-dali', 'hpcc', 'dali'), + ('myhpcc-thor-thormanager-0', 'default', 'myhpcc-thor-thormanager', 'hpcc', 'thor'), + ('myhpcc-thor-thorworker-0', 'default', 'myhpcc-thor-thorworker', 'hpcc', 'thor'), + ('my-thor-0', 'default', 'my-thor', 'hpcc', 'thor'), + ('myhpcc-roxie-server-0', 'default', 'myhpcc-roxie-server', 'hpcc', 'roxie'), + ('prod-roxie-agent-1', 'default', 'prod-roxie-agent', 'hpcc', 'roxie'), + ('myhpcc-toposerver-0', 'default', 'myhpcc-toposerver', 'hpcc', 'roxie'), + ('myhpcc-eclservices-0', 'default', 'myhpcc-eclservices', 'hpcc', 'esp'), + ('myhpcc-eclwatch-0', 'default', 'myhpcc-eclwatch', 'hpcc', 'esp'), + ('test-esp-0', 'default', 'test-esp', 'hpcc', 'esp'), + ('myhpcc-eclagent-0', 'default', 'myhpcc-eclagent', 'hpcc', 'eclagent'), + ('myhpcc-eclccserver-0', 'default', 'myhpcc-eclccserver', 'hpcc', 'eclccserver'), + ('myhpcc-eclscheduler-0', 'default', 'myhpcc-eclscheduler', 'hpcc', 'eclscheduler'), + ('myhpcc-dfuserver-0', 'default', 'myhpcc-dfuserver', 'hpcc', 'dfuserver'), + ('myhpcc-sasha-0', 'default', 'myhpcc-sasha', 'hpcc', 'sasha'), + ('myhpcc-dafilesrv-0', 'default', 'myhpcc-dafilesrv', 'hpcc', 'dafilesrv'), + ] + + passed = 0 + failed = 0 + + for pod_name, namespace, controller_name, expected_category, expected_component in test_cases: + category, component, is_hpcc = categorize_pod(pod_name, namespace, controller_name) + + if category == expected_category and component == expected_component and is_hpcc: + print(f"✓ PASS: {pod_name:40s} -> {component}") + passed += 1 + else: + print(f"✗ FAIL: {pod_name:40s} -> Expected: {expected_component}, Got: {component}") + failed += 1 + + print(f"\nHPCC Components: {passed} passed, {failed} failed") + print() + return failed == 0 + + +def test_system_components(): + """Test system component categorization.""" + 
print("Testing System Component Categorization") + print("=" * 80) + + test_cases = [ + # (pod_name, namespace, controller_name, expected_category) + ('kube-proxy-abc123', 'kube-system', 'kube-proxy', 'kubernetes-system'), + ('kube-dns-xyz', 'kube-system', 'kube-dns', 'kubernetes-system'), + ('coredns-12345', 'kube-system', 'coredns', 'kubernetes-system'), + ('omsagent-xyz789', 'kube-system', 'omsagent', 'azure-system'), + ('aks-link-abc', 'default', 'aks-link', 'azure-system'), + ('nginx-ingress-controller-abc', 'ingress-nginx', 'nginx-ingress', 'ingress'), + ('prometheus-server-0', 'monitoring', 'prometheus-server', 'monitoring'), + ('grafana-0', 'monitoring', 'grafana', 'monitoring'), + ('fluentd-abc', 'logging', 'fluentd', 'logging'), + ('elasticsearch-master-0', 'logging', 'elasticsearch-master', 'logging'), + ] + + passed = 0 + failed = 0 + + for pod_name, namespace, controller_name, expected_category in test_cases: + category, component, is_hpcc = categorize_pod(pod_name, namespace, controller_name) + + if category == expected_category and not is_hpcc: + print(f"✓ PASS: {pod_name:40s} -> {category}") + passed += 1 + else: + print(f"✗ FAIL: {pod_name:40s} -> Expected: {expected_category}, Got: {category}") + failed += 1 + + print(f"\nSystem Components: {passed} passed, {failed} failed") + print() + return failed == 0 + + +def test_unknown_components(): + """Test unknown component categorization.""" + print("Testing Unknown Component Categorization") + print("=" * 80) + + test_cases = [ + # (pod_name, namespace, controller_name) + ('my-custom-app-0', 'default', 'my-custom-app'), + ('random-service-123', 'myapp', 'random-service'), + ] + + passed = 0 + failed = 0 + + for pod_name, namespace, controller_name in test_cases: + category, component, is_hpcc = categorize_pod(pod_name, namespace, controller_name) + + if category == 'unknown' and not is_hpcc: + print(f"✓ PASS: {pod_name:40s} -> unknown") + passed += 1 + else: + print(f"✗ FAIL: {pod_name:40s} -> Expected: unknown, Got: {category}") + failed += 1 + + print(f"\nUnknown Components: {passed} passed, {failed} failed") + print() + return failed == 0 + + +def main(): + """Run all tests.""" + print("\n" + "=" * 80) + print("Azure Log Analytics Analyze - Categorization Tests") + print("=" * 80 + "\n") + + all_passed = True + + all_passed = test_hpcc_components() and all_passed + all_passed = test_system_components() and all_passed + all_passed = test_unknown_components() and all_passed + + print("=" * 80) + if all_passed: + print("✓ ALL TESTS PASSED") + print("=" * 80) + return 0 + else: + print("✗ SOME TESTS FAILED") + print("=" * 80) + return 1 + + +if __name__ == '__main__': + sys.exit(main())