From 5de12c6889ad47a49cb6723581eb8265c562140c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:16:24 +0000 Subject: [PATCH 1/7] Initial plan From 0a4d3d56e7f86de34e4c1fd5d5bf679dd82dffe5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:22:41 +0000 Subject: [PATCH 2/7] Add pod and node inventory scripts with analysis Co-authored-by: jakesmith <902700+jakesmith@users.noreply.github.com> --- tools/azure/analyze_pod_node_inventory.py | 470 ++++++++++++++++++++++ tools/azure/get_pod_node_inventory.py | 435 ++++++++++++++++++++ 2 files changed, 905 insertions(+) create mode 100755 tools/azure/analyze_pod_node_inventory.py create mode 100755 tools/azure/get_pod_node_inventory.py diff --git a/tools/azure/analyze_pod_node_inventory.py b/tools/azure/analyze_pod_node_inventory.py new file mode 100755 index 00000000000..5914e9d521f --- /dev/null +++ b/tools/azure/analyze_pod_node_inventory.py @@ -0,0 +1,470 @@ +#!/usr/bin/env python3 +""" +Analyze Kubernetes pod and node inventory from get_pod_node_inventory.py output. + +Identifies HPCC components from pod names, cross-references pods to nodes, and +calculates resource consumption per component over time. + +Input: + CSV output from get_pod_node_inventory.py (via stdin or file) + Format: TimeGenerated,RecordType,Name,Namespace,PodStatus,Computer,... + +Usage: + get_pod_node_inventory.py ... | python3 analyze_pod_node_inventory.py [options] + python3 analyze_pod_node_inventory.py inventory.csv [options] + +Examples: + # Basic analysis + cat inventory.csv | python3 analyze_pod_node_inventory.py + + # Text report + python3 analyze_pod_node_inventory.py inventory.csv --format text + + # Filter by time range + python3 analyze_pod_node_inventory.py inventory.csv --start-time "2025-11-04 12:00" \ + --end-time "2025-11-04 18:00" + + # Component-level analysis + python3 analyze_pod_node_inventory.py inventory.csv --format text --by-component +""" + +import sys +import argparse +import csv +from datetime import datetime +from collections import defaultdict +from typing import List, Dict, Optional, Set + + +def parse_datetime(dt_str: str) -> datetime: + """Parse datetime string in multiple formats.""" + formats = [ + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M', + '%Y-%m-%d' + ] + + for fmt in formats: + try: + return datetime.strptime(dt_str, fmt) + except ValueError: + continue + + raise ValueError(f"Invalid datetime format: {dt_str}") + + +def identify_component(pod_name: str) -> str: + """Identify HPCC component from pod name. + + HPCC pod naming conventions: + - -dali- + - -esp- + - -eclccserver- + - -roxie-- + - -thor--thormanager- + - -thor--thorworker- + - -sasha- + - etc. + + Returns component name or "Other" if not recognized + """ + parts = pod_name.split('-') + + # Handle common HPCC components + if len(parts) >= 2: + # Check for standard components (dali, esp, sasha, etc.) 
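+        # Membership tests on the split name tolerate any release prefix,
+        # e.g. "myhpcc-dali-0".split('-') contains the token 'dali'.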
+ if 'dali' in parts: + return 'dali' + elif 'esp' in parts: + return 'esp' + elif 'eclccserver' in parts: + return 'eclccserver' + elif 'sasha' in parts: + return 'sasha' + elif 'dfuserver' in parts: + return 'dfuserver' + elif 'eclagent' in parts: + return 'eclagent' + + # Thor has more complex naming + elif 'thor' in parts: + if 'thormanager' in parts or 'manager' in parts: + # Extract cluster name if available + thor_idx = parts.index('thor') + if thor_idx + 1 < len(parts) and 'thormanager' not in parts[thor_idx + 1] and 'manager' not in parts[thor_idx + 1]: + cluster_name = parts[thor_idx + 1] + return f'thor-{cluster_name}' + return 'thor' + elif 'thorworker' in parts or 'worker' in parts: + # Extract cluster name if available + thor_idx = parts.index('thor') + if thor_idx + 1 < len(parts) and 'thorworker' not in parts[thor_idx + 1] and 'worker' not in parts[thor_idx + 1]: + cluster_name = parts[thor_idx + 1] + return f'thor-{cluster_name}-worker' + return 'thor-worker' + else: + return 'thor' + + # Roxie + elif 'roxie' in parts: + # Extract cluster name if available + roxie_idx = parts.index('roxie') + if roxie_idx + 1 < len(parts): + cluster_name = parts[roxie_idx + 1] + return f'roxie-{cluster_name}' + return 'roxie' + + # If not recognized, return "Other" + return 'Other' + + +def read_inventory(input_source: Optional[str], start_time: Optional[datetime], + end_time: Optional[datetime]) -> tuple: + """Read and filter pod and node inventory records. + + Returns: (pods, nodes) where: + pods: List of pod records (dicts) + nodes: List of node records (dicts) + """ + pods = [] + nodes = [] + + # Open input source + if input_source: + f = open(input_source, 'r') + else: + f = sys.stdin + + try: + # Skip comment lines + lines = [line for line in f if not line.strip().startswith('#')] + + # Parse CSV + reader = csv.DictReader(lines) + for row in reader: + # Parse timestamp + try: + timestamp = parse_datetime(row.get('TimeGenerated', '')) + except ValueError: + continue # Skip rows with invalid timestamps + + # Apply time filters + if start_time and timestamp < start_time: + continue + if end_time and timestamp >= end_time: + continue + + # Add timestamp to record + row['_timestamp'] = timestamp + + # Separate pods and nodes + record_type = row.get('RecordType', '') + if record_type == 'Pod': + pods.append(row) + elif record_type == 'Node': + nodes.append(row) + finally: + if input_source: + f.close() + + return pods, nodes + + +def analyze_component_usage(pods: List[Dict], nodes: List[Dict]) -> Dict: + """Analyze component resource usage. 
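+
+    Groups pods by identified HPCC component and cross-references each pod to
+    the node (Computer) it ran on.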
+ + Returns dict with: + - component_pods: {component: [pod_records]} + - component_nodes: {component: set(node_names)} + - pod_to_node: {pod_name: node_name} + - node_info: {node_name: node_record} + """ + # Build pod to node mapping + pod_to_node = {} + for pod in pods: + pod_name = pod.get('Name', '') + node_name = pod.get('Computer', '') + if pod_name and node_name: + pod_to_node[pod_name] = node_name + + # Build node info mapping + node_info = {} + for node in nodes: + node_name = node.get('Name', '') + if node_name: + node_info[node_name] = node + + # Group pods by component + component_pods = defaultdict(list) + for pod in pods: + pod_name = pod.get('Name', '') + component = identify_component(pod_name) + component_pods[component].append(pod) + + # Build component to nodes mapping + component_nodes = defaultdict(set) + for component, pod_list in component_pods.items(): + for pod in pod_list: + pod_name = pod.get('Name', '') + node_name = pod_to_node.get(pod_name) + if node_name: + component_nodes[component].add(node_name) + + return { + 'component_pods': dict(component_pods), + 'component_nodes': dict(component_nodes), + 'pod_to_node': pod_to_node, + 'node_info': node_info + } + + +def calculate_durations(pods: List[Dict], start_time: datetime, end_time: datetime) -> Dict: + """Calculate how long each component consumed resources. + + Returns dict with: + - component_duration: {component: duration_hours} + - component_pod_hours: {component: pod_hours} + """ + component_duration = defaultdict(float) + component_pod_hours = defaultdict(float) + + # Total time window + total_hours = (end_time - start_time).total_seconds() / 3600.0 + + # Calculate pod hours for each component + component_pod_count = defaultdict(int) + for pod in pods: + pod_name = pod.get('Name', '') + component = identify_component(pod_name) + component_pod_count[component] += 1 + + # Assume all pods were running for the entire time window + # (This is a simplification; more accurate would require time-series data) + for component, count in component_pod_count.items(): + component_pod_hours[component] = count * total_hours + component_duration[component] = total_hours + + return { + 'component_duration': dict(component_duration), + 'component_pod_hours': dict(component_pod_hours) + } + + +def output_csv(analysis: Dict, pods: List[Dict], nodes: List[Dict], + start_time: datetime, end_time: datetime, args) -> None: + """Output analysis results in CSV format.""" + # Metadata header + print("# Generated by: analyze_pod_node_inventory.py") + print(f"# Date generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print(f"# Time range: {start_time.strftime('%Y-%m-%d %H:%M:%S')} to {end_time.strftime('%Y-%m-%d %H:%M:%S')}") + print(f"# Total pods: {len(pods)}") + print(f"# Total nodes: {len(nodes)}") + print("#") + + component_pods = analysis['component_pods'] + component_nodes = analysis['component_nodes'] + + # Calculate durations + duration_info = calculate_durations(pods, start_time, end_time) + component_duration = duration_info['component_duration'] + component_pod_hours = duration_info['component_pod_hours'] + + # Output CSV + print("Component,PodCount,NodeCount,DurationHours,PodHours") + + for component in sorted(component_pods.keys()): + pod_count = len(component_pods[component]) + node_count = len(component_nodes.get(component, set())) + duration = component_duration.get(component, 0.0) + pod_hours = component_pod_hours.get(component, 0.0) + + 
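+        # One data row per component: pod/node counts plus estimated duration and pod-hours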
print(f"{component},{pod_count},{node_count},{duration:.2f},{pod_hours:.2f}") + + +def output_text(analysis: Dict, pods: List[Dict], nodes: List[Dict], + start_time: datetime, end_time: datetime, args) -> None: + """Output analysis results in human-readable text format.""" + print("=" * 80) + print("POD AND NODE INVENTORY ANALYSIS") + print("=" * 80) + print() + + # Summary + print("SUMMARY") + print("-" * 80) + print(f"Time Range: {start_time.strftime('%Y-%m-%d %H:%M:%S')} to {end_time.strftime('%Y-%m-%d %H:%M:%S')}") + + total_hours = (end_time - start_time).total_seconds() / 3600.0 + print(f"Duration: {total_hours:.2f} hours") + print(f"Total Pods: {len(pods)}") + print(f"Total Nodes: {len(nodes)}") + print() + + component_pods = analysis['component_pods'] + component_nodes = analysis['component_nodes'] + pod_to_node = analysis['pod_to_node'] + + # Calculate durations + duration_info = calculate_durations(pods, start_time, end_time) + component_duration = duration_info['component_duration'] + component_pod_hours = duration_info['component_pod_hours'] + + # Component breakdown + print("COMPONENT BREAKDOWN") + print("-" * 80) + print(f"{'Component':<30s} {'Pods':>8s} {'Nodes':>8s} {'Duration':>12s} {'Pod-Hours':>12s}") + print("-" * 80) + + for component in sorted(component_pods.keys()): + pod_count = len(component_pods[component]) + node_count = len(component_nodes.get(component, set())) + duration = component_duration.get(component, 0.0) + pod_hours = component_pod_hours.get(component, 0.0) + + print(f"{component:<30s} {pod_count:>8d} {node_count:>8d} {duration:>10.2f}h {pod_hours:>10.2f}h") + + print() + + # By-component detailed view + if args.by_component: + print("DETAILED COMPONENT ANALYSIS") + print("-" * 80) + + for component in sorted(component_pods.keys()): + print(f"\n{component}:") + print(f" Pods: {len(component_pods[component])}") + print(f" Nodes: {len(component_nodes.get(component, set()))}") + + # List pods + if len(component_pods[component]) <= 20: + print(" Pod List:") + for pod in sorted(component_pods[component], key=lambda p: p.get('Name', '')): + pod_name = pod.get('Name', '') + node_name = pod_to_node.get(pod_name, 'Unknown') + status = pod.get('PodStatus', '') + print(f" - {pod_name:<50s} -> {node_name:<30s} [{status}]") + else: + print(f" Pod List: (showing first 20 of {len(component_pods[component])})") + for pod in sorted(component_pods[component], key=lambda p: p.get('Name', ''))[:20]: + pod_name = pod.get('Name', '') + node_name = pod_to_node.get(pod_name, 'Unknown') + status = pod.get('PodStatus', '') + print(f" - {pod_name:<50s} -> {node_name:<30s} [{status}]") + + # List nodes + nodes_for_component = component_nodes.get(component, set()) + if nodes_for_component: + print(f" Nodes used: {', '.join(sorted(nodes_for_component))}") + + print() + + # Node utilization + print("NODE UTILIZATION") + print("-" * 80) + + # Count pods per node + node_pod_count = defaultdict(int) + for pod_name, node_name in pod_to_node.items(): + node_pod_count[node_name] += 1 + + print(f"{'Node Name':<50s} {'Pod Count':>10s}") + print("-" * 80) + + for node_name in sorted(node_pod_count.keys()): + count = node_pod_count[node_name] + print(f"{node_name:<50s} {count:>10d}") + + print() + print("=" * 80) + + +def main(): + parser = argparse.ArgumentParser( + description='Analyze Kubernetes pod and node inventory', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +Examples: + # Basic CSV analysis + cat inventory.csv | %(prog)s + + # Text report + %(prog)s 
inventory.csv --format text + + # Detailed component analysis + %(prog)s inventory.csv --format text --by-component + + # Time-filtered analysis + %(prog)s inventory.csv --start-time "2025-11-04 12:00" --end-time "2025-11-04 18:00" + ''' + ) + + parser.add_argument( + 'input', + nargs='?', + help='Input CSV file from get_pod_node_inventory.py (or read from stdin)' + ) + + parser.add_argument( + '--start-time', + help='Start time filter (YYYY-MM-DD or YYYY-MM-DD HH:MM)' + ) + + parser.add_argument( + '--end-time', + help='End time filter (YYYY-MM-DD or YYYY-MM-DD HH:MM)' + ) + + parser.add_argument( + '--format', + choices=['csv', 'text'], + default='csv', + help='Output format (default: csv)' + ) + + parser.add_argument( + '--by-component', + action='store_true', + help='Show detailed breakdown by component (text format only)' + ) + + args = parser.parse_args() + + # Parse time filters + start_time = parse_datetime(args.start_time) if args.start_time else None + end_time = parse_datetime(args.end_time) if args.end_time else None + + # Read inventory + pods, nodes = read_inventory(args.input, start_time, end_time) + + if not pods and not nodes: + print("No records found matching filters.", file=sys.stderr) + return 1 + + # Determine time range from data if not specified + if pods: + pod_times = [p['_timestamp'] for p in pods if '_timestamp' in p] + if pod_times and not start_time: + start_time = min(pod_times) + if pod_times and not end_time: + end_time = max(pod_times) + + if not start_time or not end_time: + print("Error: Could not determine time range. Please specify --start-time and --end-time", file=sys.stderr) + return 1 + + # Analyze + analysis = analyze_component_usage(pods, nodes) + + # Output + if args.format == 'csv': + output_csv(analysis, pods, nodes, start_time, end_time, args) + else: + output_text(analysis, pods, nodes, start_time, end_time, args) + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/tools/azure/get_pod_node_inventory.py b/tools/azure/get_pod_node_inventory.py new file mode 100755 index 00000000000..71207a44680 --- /dev/null +++ b/tools/azure/get_pod_node_inventory.py @@ -0,0 +1,435 @@ +#!/usr/bin/env python3 +""" +Get Azure Kubernetes Pod and Node Inventory from Log Analytics + +This script queries the KubePodInventory and KubeNodeInventory tables in Azure +Log Analytics to retrieve information about pods and nodes running in a specific +namespace during a given time range. 
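+
+Pod and node records are unioned into a single CSV stream (one row per pod and
+one per node) so downstream tools such as analyze_pod_node_inventory.py can
+cross-reference them.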
+ +Usage: + ./get_pod_node_inventory.py --start-time "2025-11-04 12:00" [options] + +Examples: + # Query using workspace ID directly + ./get_pod_node_inventory.py --workspace-id \ + --start-time "2025-11-04 12:00" -n hpcc + + # Auto-discover from AKS cluster + ./get_pod_node_inventory.py --cluster my-aks-cluster \ + --resource-group my-rg --start-time "2025-11-04 12:00" -n hpcc + + # Query specific namespace with time range + ./get_pod_node_inventory.py --workspace-id \ + --start-time "2025-11-04 09:00" --end-time "2025-11-04 17:00" -n hpcc + + # Query all namespaces + ./get_pod_node_inventory.py --workspace-id \ + --start-time "2025-11-04 12:00" --all-namespaces + + # Export to CSV file + ./get_pod_node_inventory.py --workspace-id \ + --start-time "2025-11-04 12:00" -n hpcc > inventory.csv + +DateTime format: YYYY-MM-DD HH:MM (defaults to UTC) +""" + +import sys +import json +import subprocess +import argparse +from datetime import datetime, timedelta + + +def query_log_analytics(workspace_id, query, verbose=False): + """Execute a KQL query against Log Analytics workspace using REST API.""" + + # Use the Log Analytics Query API + uri = f"https://api.loganalytics.io/v1/workspaces/{workspace_id}/query" + + request_body = { + "query": query + } + + cmd = [ + 'az', 'rest', + '--method', 'POST', + '--uri', uri, + '--body', json.dumps(request_body), + '--headers', 'Content-Type=application/json', + '--output', 'json' + ] + + if verbose: + print("=== API Request ===", file=sys.stderr) + print(f"URI: {uri}", file=sys.stderr) + print(f"Command: {' '.join(cmd)}", file=sys.stderr) + print("===================\n", file=sys.stderr) + + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + + if verbose: + print("=== Raw API Response ===", file=sys.stderr) + print(result.stdout[:2000], file=sys.stderr) # First 2000 chars + if len(result.stdout) > 2000: + print(f"... 
(truncated, total length: {len(result.stdout)} chars)", file=sys.stderr) + print("========================\n", file=sys.stderr) + + data = json.loads(result.stdout) + + if verbose: + print("=== Parsed Response ===", file=sys.stderr) + print(f"Response keys: {list(data.keys())}", file=sys.stderr) + print(f"Number of tables: {len(data.get('tables', []))}", file=sys.stderr) + + # Extract tables from response + tables = data.get('tables', []) + if not tables: + if verbose: + print("WARNING: No tables in response!", file=sys.stderr) + print(f"Full response: {json.dumps(data, indent=2)}", file=sys.stderr) + print("=======================\n", file=sys.stderr) + return [] + + # Get first table (query results) + table = tables[0] + columns = [col['name'] for col in table.get('columns', [])] + rows = table.get('rows', []) + + if verbose: + print(f"Columns: {columns}", file=sys.stderr) + print(f"Number of rows: {len(rows)}", file=sys.stderr) + print("=======================\n", file=sys.stderr) + + # Convert rows to dict format + results = [] + for row in rows: + row_dict = {} + for i, col_name in enumerate(columns): + row_dict[col_name] = row[i] if i < len(row) else None + results.append(row_dict) + + return results + + except subprocess.CalledProcessError as e: + print(f"Error querying Log Analytics: {e.stderr}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error parsing JSON response: {e}", file=sys.stderr) + print(f"Raw stdout: {result.stdout}", file=sys.stderr) + sys.exit(1) + + +def get_workspace_info(subscription_id, resource_group, cluster_name, verbose=False): + """Get Log Analytics workspace information from AKS cluster. + + Args: + subscription_id: Azure subscription ID or name (None to use current az account) + resource_group: Azure resource group containing the cluster + cluster_name: AKS cluster name + verbose: Enable debug output + + Returns: + tuple: (workspace_id, resource_group, subscription_id) + """ + + if verbose: + if subscription_id: + print(f"Querying AKS cluster '{cluster_name}' in subscription '{subscription_id}', RG '{resource_group}'...", file=sys.stderr) + else: + print(f"Querying AKS cluster '{cluster_name}' in RG '{resource_group}' (using current az account)...", file=sys.stderr) + + # Build command + cmd = ['az', 'aks', 'show'] + if subscription_id: + cmd.extend(['--subscription', subscription_id]) + cmd.extend([ + '-n', cluster_name, + '-g', resource_group, + '--query', '{id:id, workspace:addonProfiles.omsagent.config.logAnalyticsWorkspaceResourceID}', + '-o', 'json' + ]) + + # Query the specific cluster for its workspace + try: + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + print(f"Error: Could not find AKS cluster '{cluster_name}' in resource group '{resource_group}'", file=sys.stderr) + print(f"Subscription: {subscription_id}", file=sys.stderr) + print(f"Details: {result.stderr}", file=sys.stderr) + sys.exit(1) + + cluster_info = json.loads(result.stdout) + workspace_resource_id = cluster_info.get('workspace', '').strip() + + if not workspace_resource_id: + print(f"Error: Cluster '{cluster_name}' does not have Container Insights (omsagent addon) configured.", file=sys.stderr) + print("To fix this:", file=sys.stderr) + print(f" Enable Container Insights on the cluster via Azure Portal or CLI", file=sys.stderr) + sys.exit(1) + + if verbose: + print(f"Workspace resource ID: {workspace_resource_id}", file=sys.stderr) + + # Extract workspace details from resource ID + # Format: 
/subscriptions/.../resourceGroups/.../providers/Microsoft.OperationalInsights/workspaces/NAME + parts = workspace_resource_id.split('/') + workspace_name = parts[-1] + workspace_rg = parts[4] + workspace_sub = parts[2] + + if verbose: + print(f"Workspace name: {workspace_name}, RG: {workspace_rg}", file=sys.stderr) + + # Get workspace customer ID (needed for Log Analytics API) + result = subprocess.run( + ['az', 'monitor', 'log-analytics', 'workspace', 'show', + '--subscription', workspace_sub, + '-g', workspace_rg, + '-n', workspace_name, + '--query', 'customerId', + '-o', 'tsv'], + capture_output=True, + text=True, + check=True + ) + + workspace_id = result.stdout.strip() + + if verbose: + print(f"Auto-discovered workspace for cluster '{cluster_name}': {workspace_id}", file=sys.stderr) + + return workspace_id, resource_group, subscription_id + + except subprocess.CalledProcessError as e: + print(f"Error querying AKS cluster: {e.stderr}", file=sys.stderr) + sys.exit(1) + except (json.JSONDecodeError, IndexError, KeyError) as e: + print(f"Error parsing cluster information: {e}", file=sys.stderr) + sys.exit(1) + + +def build_query(namespace, start_time, end_time): + """Build the KQL query for pod and node inventory.""" + + # Format times for KQL + start_str = start_time.strftime('%Y-%m-%dT%H:%M:%SZ') + end_str = end_time.strftime('%Y-%m-%dT%H:%M:%SZ') + + query = f""" +let startTime = datetime({start_str}); +let endTime = datetime({end_str}); +// Get pod inventory +let pods = KubePodInventory +| where TimeGenerated >= startTime and TimeGenerated < endTime +""" + + # Add namespace filter only if specified + if namespace: + query += f"| where Namespace == '{namespace}'\n" + + query += """| summarize arg_max(TimeGenerated, *) by Name, Computer +| project TimeGenerated, RecordType="Pod", Name, Namespace, PodStatus, Computer, + ContainerStatus, PodCreationTimeStamp, ControllerName, ControllerKind, PodUid, + ServiceName="", NodeStatus="", KubeletVersion=""; +// Get node inventory +let nodes = KubeNodeInventory +| where TimeGenerated >= startTime and TimeGenerated < endTime +| summarize arg_max(TimeGenerated, *) by Computer +| project TimeGenerated, RecordType="Node", Name=Computer, Namespace="", PodStatus="", + Computer, ContainerStatus="", PodCreationTimeStamp="", ControllerName="", + ControllerKind="", PodUid="", ServiceName="", NodeStatus=Status, + KubeletVersion; +// Union both results +union pods, nodes +| sort by RecordType asc, Name asc +""" + + return query + + +def format_csv_output(results, metadata=None): + """Format the query results as CSV.""" + + lines = [] + + # Add metadata as comments if provided + if metadata: + lines.append("# Generated by: get_pod_node_inventory.py") + lines.append(f"# Date generated: {metadata.get('date_generated', '')}") + lines.append(f"# Command: {metadata.get('command', '')}") + lines.append(f"# Workspace ID: {metadata.get('workspace_id', '')}") + lines.append(f"# Time range: {metadata.get('start_time', '')} to {metadata.get('end_time', '')}") + if metadata.get('namespace'): + lines.append(f"# Namespace: {metadata.get('namespace', '')}") + else: + lines.append("# Namespace: All") + lines.append("#") + + if not results: + return '\n'.join(lines) if lines else "" + + # CSV header - fixed column order + headers = [ + 'TimeGenerated', 'RecordType', 'Name', 'Namespace', 'PodStatus', + 'Computer', 'ContainerStatus', 'PodCreationTimeStamp', 'ControllerName', + 'ControllerKind', 'PodUid', 'ServiceName', 'NodeStatus', 'KubeletVersion' + ] + 
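+    # Header row first, then one row per record in the same fixed column order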
lines.append(','.join(headers)) + + # CSV rows + for row in results: + values = [str(row.get(h, '')) for h in headers] + lines.append(','.join(values)) + + return '\n'.join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description='Query Kubernetes pod and node inventory from Azure Log Analytics', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Using cluster discovery + %(prog)s --cluster my-aks-cluster --resource-group my-resource-group -n hpcc --start-time "2025-11-04 12:00" + %(prog)s --cluster --resource-group --subscription -n hpcc --start-time "2025-11-04 09:00" --end-time "2025-11-04 17:00" + + # Using workspace ID directly + %(prog)s --workspace-id xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx -n hpcc --start-time "2025-11-04 12:00" + %(prog)s --workspace-id --all-namespaces --start-time "2025-11-04 12:00" + %(prog)s --workspace-id -n hpcc --start-time "2025-11-04 12:00" --duration 120 + %(prog)s --workspace-id -n hpcc --start-time "2025-11-04 12:00" > inventory.csv + """ + ) + + # Workspace identification (mutually exclusive) + parser.add_argument('--workspace-id', help='Log Analytics workspace ID (alternative to cluster discovery)') + + parser.add_argument('--start-time', required=True, metavar='DATETIME', help='Start time (YYYY-MM-DD or YYYY-MM-DD HH:MM)') + parser.add_argument('--end-time', metavar='DATETIME', help='End time (YYYY-MM-DD or YYYY-MM-DD HH:MM). If specified, --duration is ignored.') + + # Namespace (mutually exclusive) + namespace_group = parser.add_mutually_exclusive_group(required=True) + namespace_group.add_argument('-n', '--namespace', help='Kubernetes namespace to query') + namespace_group.add_argument('--all-namespaces', action='store_true', help='Query all namespaces') + + parser.add_argument('--duration', type=int, default=60, metavar='MINUTES', + help='Time window duration in minutes from start time (default: 60). Ignored if --end-time is specified.') + parser.add_argument('--subscription', help='Azure subscription ID or name (defaults to current az account)') + parser.add_argument('--resource-group', help='Azure resource group containing AKS cluster (required with --cluster)') + parser.add_argument('--cluster', help='AKS cluster name (alternative to --workspace-id)') + parser.add_argument('--verbose', action='store_true', help='Print the KQL query for debugging') + + args = parser.parse_args() + + # Validate workspace arguments - must provide either workspace-id OR cluster+resource-group + if not args.workspace_id and not args.cluster: + print("Error: Must provide either --workspace-id OR --cluster with --resource-group", file=sys.stderr) + sys.exit(1) + + if args.workspace_id and args.cluster: + print("Error: Cannot specify both --workspace-id and --cluster (use one or the other)", file=sys.stderr) + sys.exit(1) + + # Validate cluster arguments + if args.cluster and not args.resource_group: + print("Error: --resource-group is required when using --cluster", file=sys.stderr) + sys.exit(1) + + # Get namespace from args + if args.all_namespaces: + namespace = None # Query all namespaces + else: + namespace = args.namespace + + # Parse datetime - support both date and date+time formats + try: + dt = datetime.strptime(args.start_time, '%Y-%m-%d %H:%M') + except ValueError: + try: + # Date only - assume start of day (00:00) + dt = datetime.strptime(args.start_time + ' 00:00', '%Y-%m-%d %H:%M') + except ValueError: + print(f"Error: Invalid start-time format. 
Use YYYY-MM-DD or YYYY-MM-DD HH:MM", file=sys.stderr) + sys.exit(1) + + start_time = dt + + # Parse end time if provided + if args.end_time: + try: + end_dt = datetime.strptime(args.end_time, '%Y-%m-%d %H:%M') + except ValueError: + try: + # Date only - assume end of day (23:59) + end_dt = datetime.strptime(args.end_time + ' 23:59', '%Y-%m-%d %H:%M') + except ValueError: + print(f"Error: Invalid end-time format. Use YYYY-MM-DD or YYYY-MM-DD HH:MM", file=sys.stderr) + sys.exit(1) + + end_time = end_dt + else: + # Use duration + end_time = dt + timedelta(minutes=args.duration) + + # Get workspace information + if args.workspace_id: + workspace_id = args.workspace_id + resource_group = None + subscription_id = None + else: + workspace_id, resource_group, subscription_id = get_workspace_info( + subscription_id=args.subscription, + resource_group=args.resource_group, + cluster_name=args.cluster, + verbose=args.verbose + ) + + if namespace: + print(f"Querying pod and node inventory for namespace '{namespace}'", file=sys.stderr) + else: + print(f"Querying pod and node inventory for ALL namespaces", file=sys.stderr) + print(f"Time window: {start_time.strftime('%Y-%m-%d %H:%M')} UTC to {end_time.strftime('%Y-%m-%d %H:%M')} UTC", file=sys.stderr) + if subscription_id: + print(f"Subscription: {subscription_id}", file=sys.stderr) + print(f"Workspace: {workspace_id}", file=sys.stderr) + print(file=sys.stderr) + + # Build and execute query + query = build_query(namespace, start_time, end_time) + + if args.verbose: + print("=== KQL Query ===", file=sys.stderr) + print(query, file=sys.stderr) + print("=================\n", file=sys.stderr) + + results = query_log_analytics(workspace_id, query, args.verbose) + + # Prepare metadata for CSV comments + from datetime import datetime as dt_mod + metadata = { + 'date_generated': dt_mod.now().strftime('%Y-%m-%d %H:%M:%S'), + 'command': ' '.join(sys.argv), + 'workspace_id': workspace_id, + 'start_time': start_time.strftime('%Y-%m-%d %H:%M:%S UTC'), + 'end_time': end_time.strftime('%Y-%m-%d %H:%M:%S UTC'), + 'namespace': args.namespace + } + + # Format output as CSV + output = format_csv_output(results, metadata=metadata) + + # Display summary + pod_count = sum(1 for r in results if r.get('RecordType') == 'Pod') + node_count = sum(1 for r in results if r.get('RecordType') == 'Node') + print(f"Found {pod_count} pods and {node_count} nodes", file=sys.stderr) + print(file=sys.stderr) + + # Print output + print(output) + + +if __name__ == '__main__': + main() From db69b606898686486f873d1546e3da0b87f33333 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:24:49 +0000 Subject: [PATCH 3/7] Address code review feedback: improve validation and documentation Co-authored-by: jakesmith <902700+jakesmith@users.noreply.github.com> --- tools/azure/analyze_pod_node_inventory.py | 8 ++++++-- tools/azure/get_pod_node_inventory.py | 21 +++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/tools/azure/analyze_pod_node_inventory.py b/tools/azure/analyze_pod_node_inventory.py index 5914e9d521f..bdcc7048da7 100755 --- a/tools/azure/analyze_pod_node_inventory.py +++ b/tools/azure/analyze_pod_node_inventory.py @@ -33,7 +33,7 @@ import csv from datetime import datetime from collections import defaultdict -from typing import List, Dict, Optional, Set +from typing import List, Dict, Optional def parse_datetime(dt_str: str) -> datetime: @@ -223,6 +223,11 @@ def 
analyze_component_usage(pods: List[Dict], nodes: List[Dict]) -> Dict: def calculate_durations(pods: List[Dict], start_time: datetime, end_time: datetime) -> Dict: """Calculate how long each component consumed resources. + IMPORTANT: This function assumes all pods were running for the entire time window. + This is a simplification based on snapshot data from KubePodInventory. For more + accurate duration tracking, time-series data would be needed to track pod lifecycle + events (start/stop times). + Returns dict with: - component_duration: {component: duration_hours} - component_pod_hours: {component: pod_hours} @@ -241,7 +246,6 @@ def calculate_durations(pods: List[Dict], start_time: datetime, end_time: dateti component_pod_count[component] += 1 # Assume all pods were running for the entire time window - # (This is a simplification; more accurate would require time-series data) for component, count in component_pod_count.items(): component_pod_hours[component] = count * total_hours component_duration[component] = total_hours diff --git a/tools/azure/get_pod_node_inventory.py b/tools/azure/get_pod_node_inventory.py index 71207a44680..0f252b4d2f9 100755 --- a/tools/azure/get_pod_node_inventory.py +++ b/tools/azure/get_pod_node_inventory.py @@ -211,8 +211,19 @@ def get_workspace_info(subscription_id, resource_group, cluster_name, verbose=Fa def build_query(namespace, start_time, end_time): - """Build the KQL query for pod and node inventory.""" - + """Build the KQL query for pod and node inventory. + + Args: + namespace: Kubernetes namespace to filter (None for all namespaces) + start_time: Start datetime object + end_time: End datetime object + """ + # Validate inputs + if not isinstance(start_time, datetime): + raise ValueError("start_time must be a datetime object") + if not isinstance(end_time, datetime): + raise ValueError("end_time must be a datetime object") + # Format times for KQL start_str = start_time.strftime('%Y-%m-%dT%H:%M:%SZ') end_str = end_time.strftime('%Y-%m-%dT%H:%M:%SZ') @@ -227,6 +238,9 @@ def build_query(namespace, start_time, end_time): # Add namespace filter only if specified if namespace: + # Validate namespace to prevent KQL injection (alphanumeric, hyphens, dots only) + if not all(c.isalnum() or c in '-_.' for c in namespace): + raise ValueError(f"Invalid namespace: {namespace}. 
Only alphanumeric characters, hyphens, underscores, and dots are allowed.") query += f"| where Namespace == '{namespace}'\n" query += """| summarize arg_max(TimeGenerated, *) by Name, Computer @@ -408,9 +422,8 @@ def main(): results = query_log_analytics(workspace_id, query, args.verbose) # Prepare metadata for CSV comments - from datetime import datetime as dt_mod metadata = { - 'date_generated': dt_mod.now().strftime('%Y-%m-%d %H:%M:%S'), + 'date_generated': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'command': ' '.join(sys.argv), 'workspace_id': workspace_id, 'start_time': start_time.strftime('%Y-%m-%d %H:%M:%S UTC'), From 339d186bcca8d8f5dc68ad461541c511191c3949 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:26:56 +0000 Subject: [PATCH 4/7] Improve error handling and validation based on code review Co-authored-by: jakesmith <902700+jakesmith@users.noreply.github.com> --- tools/azure/analyze_pod_node_inventory.py | 72 +++++++++++------------ tools/azure/get_pod_node_inventory.py | 7 ++- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/tools/azure/analyze_pod_node_inventory.py b/tools/azure/analyze_pod_node_inventory.py index bdcc7048da7..93960af9220 100755 --- a/tools/azure/analyze_pod_node_inventory.py +++ b/tools/azure/analyze_pod_node_inventory.py @@ -131,43 +131,41 @@ def read_inventory(input_source: Optional[str], start_time: Optional[datetime], pods = [] nodes = [] - # Open input source - if input_source: - f = open(input_source, 'r') - else: - f = sys.stdin - + # Open input source with proper error handling try: - # Skip comment lines - lines = [line for line in f if not line.strip().startswith('#')] - - # Parse CSV - reader = csv.DictReader(lines) - for row in reader: - # Parse timestamp - try: - timestamp = parse_datetime(row.get('TimeGenerated', '')) - except ValueError: - continue # Skip rows with invalid timestamps - - # Apply time filters - if start_time and timestamp < start_time: - continue - if end_time and timestamp >= end_time: - continue - - # Add timestamp to record - row['_timestamp'] = timestamp - - # Separate pods and nodes - record_type = row.get('RecordType', '') - if record_type == 'Pod': - pods.append(row) - elif record_type == 'Node': - nodes.append(row) - finally: if input_source: - f.close() + with open(input_source, 'r') as f: + lines = [line for line in f if not line.strip().startswith('#')] + else: + lines = [line for line in sys.stdin if not line.strip().startswith('#')] + except IOError as e: + print(f"Error reading input: {e}", file=sys.stderr) + return ([], []) + + # Parse CSV + reader = csv.DictReader(lines) + for row in reader: + # Parse timestamp + try: + timestamp = parse_datetime(row.get('TimeGenerated', '')) + except ValueError: + continue # Skip rows with invalid timestamps + + # Apply time filters + if start_time and timestamp < start_time: + continue + if end_time and timestamp >= end_time: + continue + + # Add timestamp to record + row['_timestamp'] = timestamp + + # Separate pods and nodes + record_type = row.get('RecordType', '') + if record_type == 'Pod': + pods.append(row) + elif record_type == 'Node': + nodes.append(row) return pods, nodes @@ -464,10 +462,10 @@ def main(): # Output if args.format == 'csv': output_csv(analysis, pods, nodes, start_time, end_time, args) + return 0 else: output_text(analysis, pods, nodes, start_time, end_time, args) - - return 0 + return 0 if __name__ == '__main__': diff --git 
a/tools/azure/get_pod_node_inventory.py b/tools/azure/get_pod_node_inventory.py index 0f252b4d2f9..0c0524ddc92 100755 --- a/tools/azure/get_pod_node_inventory.py +++ b/tools/azure/get_pod_node_inventory.py @@ -238,9 +238,10 @@ def build_query(namespace, start_time, end_time): # Add namespace filter only if specified if namespace: - # Validate namespace to prevent KQL injection (alphanumeric, hyphens, dots only) - if not all(c.isalnum() or c in '-_.' for c in namespace): - raise ValueError(f"Invalid namespace: {namespace}. Only alphanumeric characters, hyphens, underscores, and dots are allowed.") + # Validate namespace to prevent KQL injection + # Kubernetes namespaces: lowercase letters, numbers, and hyphens only + if not all(c.islower() or c.isdigit() or c == '-' for c in namespace): + raise ValueError(f"Invalid namespace: {namespace}. Only lowercase letters, numbers, and hyphens are allowed.") query += f"| where Namespace == '{namespace}'\n" query += """| summarize arg_max(TimeGenerated, *) by Name, Computer From 515abefd5978072af4cfd3fe2d81be9a174587f4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:30:36 +0000 Subject: [PATCH 5/7] Final code review fixes: proper error handling and exit codes Co-authored-by: jakesmith <902700+jakesmith@users.noreply.github.com> --- tools/azure/get_pod_node_inventory.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/azure/get_pod_node_inventory.py b/tools/azure/get_pod_node_inventory.py index 0c0524ddc92..e7d563b9189 100755 --- a/tools/azure/get_pod_node_inventory.py +++ b/tools/azure/get_pod_node_inventory.py @@ -116,7 +116,6 @@ def query_log_analytics(workspace_id, query, verbose=False): sys.exit(1) except json.JSONDecodeError as e: print(f"Error parsing JSON response: {e}", file=sys.stderr) - print(f"Raw stdout: {result.stdout}", file=sys.stderr) sys.exit(1) @@ -443,7 +442,9 @@ def main(): # Print output print(output) + + return 0 if __name__ == '__main__': - main() + sys.exit(main()) From fbc3082b3d8006d5526aed05d1ce8fbf675d57c6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:33:37 +0000 Subject: [PATCH 6/7] Final improvements: better validation, clearer naming, refactored duplicate code Co-authored-by: jakesmith <902700+jakesmith@users.noreply.github.com> --- tools/azure/analyze_pod_node_inventory.py | 48 ++++++++++++++--------- tools/azure/get_pod_node_inventory.py | 11 ++++-- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/tools/azure/analyze_pod_node_inventory.py b/tools/azure/analyze_pod_node_inventory.py index 93960af9220..7ff0a763ae4 100755 --- a/tools/azure/analyze_pod_node_inventory.py +++ b/tools/azure/analyze_pod_node_inventory.py @@ -55,6 +55,27 @@ def parse_datetime(dt_str: str) -> datetime: raise ValueError(f"Invalid datetime format: {dt_str}") +def extract_thor_cluster_name(parts: List[str]) -> str: + """Extract Thor cluster name from pod name parts. 
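+
+    For example, ['hpcc', 'thor', 'mycluster', 'thorworker', '0'] yields
+    'mycluster', while ['hpcc', 'thor', 'thormanager', '0'] yields '' because
+    no cluster name precedes the manager/worker suffix.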
+ + Args: + parts: Pod name split by '-' + + Returns: + Cluster name or empty string if not found + """ + try: + thor_idx = parts.index('thor') + # Check if next part is cluster name (not thormanager/thorworker/manager/worker) + if thor_idx + 1 < len(parts): + next_part = parts[thor_idx + 1] + if next_part not in ('thormanager', 'thorworker', 'manager', 'worker'): + return next_part + except (ValueError, IndexError): + pass + return '' + + def identify_component(pod_name: str) -> str: """Identify HPCC component from pod name. @@ -90,20 +111,11 @@ def identify_component(pod_name: str) -> str: # Thor has more complex naming elif 'thor' in parts: + cluster_name = extract_thor_cluster_name(parts) if 'thormanager' in parts or 'manager' in parts: - # Extract cluster name if available - thor_idx = parts.index('thor') - if thor_idx + 1 < len(parts) and 'thormanager' not in parts[thor_idx + 1] and 'manager' not in parts[thor_idx + 1]: - cluster_name = parts[thor_idx + 1] - return f'thor-{cluster_name}' - return 'thor' + return f'thor-{cluster_name}' if cluster_name else 'thor' elif 'thorworker' in parts or 'worker' in parts: - # Extract cluster name if available - thor_idx = parts.index('thor') - if thor_idx + 1 < len(parts) and 'thorworker' not in parts[thor_idx + 1] and 'worker' not in parts[thor_idx + 1]: - cluster_name = parts[thor_idx + 1] - return f'thor-{cluster_name}-worker' - return 'thor-worker' + return f'thor-{cluster_name}-worker' if cluster_name else 'thor-worker' else: return 'thor' @@ -218,8 +230,8 @@ def analyze_component_usage(pods: List[Dict], nodes: List[Dict]) -> Dict: } -def calculate_durations(pods: List[Dict], start_time: datetime, end_time: datetime) -> Dict: - """Calculate how long each component consumed resources. +def estimate_durations_from_snapshots(pods: List[Dict], start_time: datetime, end_time: datetime) -> Dict: + """Estimate how long each component consumed resources based on snapshot data. IMPORTANT: This function assumes all pods were running for the entire time window. This is a simplification based on snapshot data from KubePodInventory. 
For more @@ -268,8 +280,8 @@ def output_csv(analysis: Dict, pods: List[Dict], nodes: List[Dict], component_pods = analysis['component_pods'] component_nodes = analysis['component_nodes'] - # Calculate durations - duration_info = calculate_durations(pods, start_time, end_time) + # Calculate estimated durations from snapshot data + duration_info = estimate_durations_from_snapshots(pods, start_time, end_time) component_duration = duration_info['component_duration'] component_pod_hours = duration_info['component_pod_hours'] @@ -308,8 +320,8 @@ def output_text(analysis: Dict, pods: List[Dict], nodes: List[Dict], component_nodes = analysis['component_nodes'] pod_to_node = analysis['pod_to_node'] - # Calculate durations - duration_info = calculate_durations(pods, start_time, end_time) + # Calculate estimated durations from snapshot data + duration_info = estimate_durations_from_snapshots(pods, start_time, end_time) component_duration = duration_info['component_duration'] component_pod_hours = duration_info['component_pod_hours'] diff --git a/tools/azure/get_pod_node_inventory.py b/tools/azure/get_pod_node_inventory.py index e7d563b9189..b89dfb7a67c 100755 --- a/tools/azure/get_pod_node_inventory.py +++ b/tools/azure/get_pod_node_inventory.py @@ -238,9 +238,14 @@ def build_query(namespace, start_time, end_time): # Add namespace filter only if specified if namespace: # Validate namespace to prevent KQL injection - # Kubernetes namespaces: lowercase letters, numbers, and hyphens only - if not all(c.islower() or c.isdigit() or c == '-' for c in namespace): - raise ValueError(f"Invalid namespace: {namespace}. Only lowercase letters, numbers, and hyphens are allowed.") + # Kubernetes namespaces (RFC 1123): lowercase letters, numbers, hyphens, and dots + # Must start and end with alphanumeric character + if not namespace: + raise ValueError("Namespace cannot be empty") + if not (namespace[0].isalnum() and namespace[-1].isalnum()): + raise ValueError(f"Invalid namespace: {namespace}. Must start and end with alphanumeric character.") + if not all(c.islower() or c.isdigit() or c in '.-' for c in namespace): + raise ValueError(f"Invalid namespace: {namespace}. 
Only lowercase letters, numbers, hyphens, and dots are allowed.") query += f"| where Namespace == '{namespace}'\n" query += """| summarize arg_max(TimeGenerated, *) by Name, Computer From f5e8bdc95514cb1e666e75e5a9b4f6e574dd756b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:36:27 +0000 Subject: [PATCH 7/7] Update README with documentation for new pod/node inventory scripts Co-authored-by: jakesmith <902700+jakesmith@users.noreply.github.com> --- tools/azure/README.md | 241 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 240 insertions(+), 1 deletion(-) diff --git a/tools/azure/README.md b/tools/azure/README.md index caf23a3950e..81d3e937c44 100644 --- a/tools/azure/README.md +++ b/tools/azure/README.md @@ -15,12 +15,14 @@ This directory contains scripts that interact with Azure services to collect ope - [Data Collectors](#data-collectors) - [get_costs.py](#get_costspy) - Azure Cost Management API - [get_storage_usage.py](#get_storage_usagepy) - Storage metrics from Azure Monitor - - [get_pod_inventory.py](#get_pod_inventorypy) - Kubernetes pod inventory from Log Analytics + - [get_pod_inventory.py](#get_pod_inventorypy) - Kubernetes pod inventory from Log Analytics (legacy) + - [get_pod_node_inventory.py](#get_pod_node_inventorypy) - Kubernetes pod and node inventory from Log Analytics - [get_ala_thor_timeline.py](#get_ala_thor_timelinepy) - Thor workunit timeline from Log Analytics - [get_vm_pricing.py](#get_vm_pricingpy) - Azure VM pricing from Retail Prices API - [Data Analyzers](#data-analyzers) - [analyze_costs.py](#analyze_costspy) - Cost breakdown and visualization - [analyze_storage_usage.py](#analyze_storage_usagepy) - Storage usage analysis + - [analyze_pod_node_inventory.py](#analyze_pod_node_inventorypy) - Pod and node inventory analysis with HPCC component identification - [analyze_thor_timeline.py](#analyze_thor_timelinepy) - Thor timeline utilization and cost modeling - [Quick Start](#quick-start) - [Authentication](#authentication) @@ -314,6 +316,103 @@ hpcc-dali-0,Running,aks-agentpool-12345,ready,2025-11-01T08:30:00Z --- +### get_pod_node_inventory.py + +Get Kubernetes pod and node inventory from Azure Log Analytics. + +#### Purpose + +Query both `KubePodInventory` and `KubeNodeInventory` tables in Azure Log Analytics to retrieve comprehensive information about pods and nodes in a namespace during a given time range. This tool provides unified pod and node data for component-level resource analysis. 
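The emitted CSV can also be post-processed directly without the analyzer. A minimal sketch (assuming the output was saved as `inventory.csv`, as in the examples below; column names match the Output Format section):

```python
import csv

# Metadata comment lines start with '#'; the first remaining line is the CSV header.
with open('inventory.csv') as f:
    reader = csv.DictReader(line for line in f if not line.startswith('#'))
    rows = list(reader)

# Split the unified stream back into pod and node records.
pods = [r for r in rows if r['RecordType'] == 'Pod']
nodes = [r for r in rows if r['RecordType'] == 'Node']

# Count pods per node via the Computer column.
per_node = {}
for pod in pods:
    per_node[pod['Computer']] = per_node.get(pod['Computer'], 0) + 1

print(f"{len(pods)} pods, {len(nodes)} nodes")
for node, count in sorted(per_node.items()):
    print(f"  {node}: {count} pods")
```
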
+ +#### When to Use + +- Analyzing HPCC component resource consumption +- Cross-referencing pods to nodes for capacity planning +- Understanding which components are using which nodes +- Generating data for component-level cost attribution +- Investigating resource allocation patterns + +#### Features + +- Queries both `KubePodInventory` and `KubeNodeInventory` tables via Azure Log Analytics REST API +- Supports workspace ID or AKS cluster discovery for workspace lookup +- Namespace filtering or all-namespaces mode +- Flexible time window specification (start/end or start+duration) +- CSV output format with metadata headers +- Comprehensive validation (RFC 1123 namespace names, datetime validation) +- KQL injection protection + +#### Usage + +```bash +# Using workspace ID directly +./get_pod_node_inventory.py --workspace-id xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx \ + -n hpcc --start-time "2025-11-04 12:00" + +# Using cluster discovery +./get_pod_node_inventory.py --cluster my-aks-cluster \ + --resource-group my-resource-group -n hpcc --start-time "2025-11-04 12:00" + +# With explicit time range +./get_pod_node_inventory.py --workspace-id \ + -n hpcc --start-time "2025-11-04 09:00" --end-time "2025-11-04 17:00" + +# Query all namespaces +./get_pod_node_inventory.py --workspace-id \ + --all-namespaces --start-time "2025-11-04 12:00" + +# Save to CSV file +./get_pod_node_inventory.py --workspace-id \ + -n hpcc --start-time "2025-11-04 12:00" > inventory.csv +``` + +#### Command-Line Options + +**Required Arguments:** +- `--start-time DATETIME` - Start time (YYYY-MM-DD or YYYY-MM-DD HH:MM) +- Either `-n, --namespace NAME` or `--all-namespaces` - Namespace to query +- Either `--workspace-id ID` or `--cluster NAME` with `--resource-group RG` + +**Workspace Identification (choose one):** +- `--workspace-id ID` - Log Analytics workspace ID (customer ID) directly +- `--cluster NAME` - AKS cluster name (discovers workspace from cluster) + - Requires: `--resource-group RG` + - Optional: `--subscription ID` + +**Optional Arguments:** +- `--end-time DATETIME` - End time (YYYY-MM-DD or YYYY-MM-DD HH:MM) +- `--duration MINUTES` - Time window in minutes from start (default: 60) +- `--verbose` - Print KQL query for debugging + +#### Output Format + +CSV format with metadata header: +```csv +# Generated by: get_pod_node_inventory.py +# Date generated: 2025-12-12 09:00:00 +# Workspace ID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +# Time range: 2025-11-04 12:00:00 UTC to 2025-11-04 14:00:00 UTC +# Namespace: hpcc +# +TimeGenerated,RecordType,Name,Namespace,PodStatus,Computer,ContainerStatus,... +2025-11-04T12:00:00Z,Pod,hpcc-dali-0,hpcc,Running,aks-node-123,ready,... +2025-11-04T12:00:00Z,Node,aks-node-123,,,aks-node-123,,,,... +``` + +#### Dependencies + +- Python 3.7+ +- Azure CLI (`az`) installed and authenticated +- Container Insights enabled on AKS cluster (if using cluster discovery) +- Standard library modules only + +#### Required Permissions + +- Reader role on the AKS cluster resource (if using cluster discovery) +- Log Analytics Reader role on the workspace + +--- + ### get_ala_thor_timeline.py Extract Thor workunit timeline from Azure Log Analytics audit logs. @@ -749,6 +848,146 @@ Read storage usage CSV data (from `get_storage_usage.py` or compatible source) a --- +### analyze_pod_node_inventory.py + +Analyze pod and node inventory data with HPCC component identification. 
+ +#### Purpose + +Analyze the CSV output from `get_pod_node_inventory.py` to identify HPCC components, cross-reference pods to nodes, and calculate resource consumption per component over time. This tool provides insights into which HPCC components are consuming which resources. + +#### When to Use + +- Understanding HPCC component resource consumption patterns +- Identifying which nodes are running which components +- Calculating pod-hours and duration for cost attribution +- Analyzing component deployment patterns +- Generating component-level resource usage reports + +#### Features + +- Identifies HPCC components from pod naming conventions + - dali, esp, eclccserver, sasha, dfuserver, eclagent + - Thor clusters (manager and worker pods) + - Roxie clusters +- Cross-references pods to nodes (computers) +- Calculates pod count and node count per component +- Estimates resource consumption duration (pod-hours) +- Multiple output formats (CSV, text) +- Detailed component breakdown option (--by-component) +- Time range filtering + +#### Component Identification + +The tool identifies HPCC components based on pod naming conventions: +- `hpcc-dali-*` → dali +- `hpcc-esp-*` → esp +- `hpcc-thor--thormanager-*` → thor- +- `hpcc-thor--thorworker-*` → thor--worker +- `hpcc-roxie--*` → roxie- +- Other standard HPCC components + +#### Usage + +```bash +# Basic CSV analysis +cat inventory.csv | ./analyze_pod_node_inventory.py + +# From file with CSV output +./analyze_pod_node_inventory.py inventory.csv + +# Human-readable text report +./analyze_pod_node_inventory.py inventory.csv --format text + +# Detailed component breakdown +./analyze_pod_node_inventory.py inventory.csv --format text --by-component + +# Time-filtered analysis +./analyze_pod_node_inventory.py inventory.csv \ + --start-time "2025-11-04 12:00" --end-time "2025-11-04 18:00" + +# Pipeline from collector +./get_pod_node_inventory.py --workspace-id -n hpcc \ + --start-time "2025-11-04 12:00" | ./analyze_pod_node_inventory.py +``` + +#### Command-Line Options + +**Positional Arguments:** +- `input` - Input CSV file from get_pod_node_inventory.py (or read from stdin if omitted) + +**Optional Arguments:** +- `--start-time DATETIME` - Start time filter (YYYY-MM-DD or YYYY-MM-DD HH:MM) +- `--end-time DATETIME` - End time filter (YYYY-MM-DD or YYYY-MM-DD HH:MM) +- `--format {csv,text}` - Output format (default: csv) +- `--by-component` - Show detailed breakdown by component (text format only) + +#### Output Formats + +**CSV (default):** +```csv +# Generated by: analyze_pod_node_inventory.py +# Date generated: 2025-12-12 09:00:00 +# Time range: 2025-11-04 12:00:00 to 2025-11-04 14:00:00 +# Total pods: 42 +# Total nodes: 10 +# +Component,PodCount,NodeCount,DurationHours,PodHours +dali,1,1,2.00,2.00 +esp,3,3,2.00,6.00 +thor-mycluster,1,1,2.00,2.00 +thor-mycluster-worker,8,8,2.00,16.00 +roxie-cluster1,4,4,2.00,8.00 +``` + +**Text:** +``` +================================================================================ +POD AND NODE INVENTORY ANALYSIS +================================================================================ + +SUMMARY +-------------------------------------------------------------------------------- +Time Range: 2025-11-04 12:00:00 to 2025-11-04 14:00:00 +Duration: 2.00 hours +Total Pods: 42 +Total Nodes: 10 + +COMPONENT BREAKDOWN +-------------------------------------------------------------------------------- +Component Pods Nodes Duration Pod-Hours 
+-------------------------------------------------------------------------------- +dali 1 1 2.00h 2.00h +esp 3 3 2.00h 6.00h +thor-mycluster 1 1 2.00h 2.00h +thor-mycluster-worker 8 8 2.00h 16.00h +roxie-cluster1 4 4 2.00h 8.00h + +NODE UTILIZATION +-------------------------------------------------------------------------------- +Node Name Pod Count +-------------------------------------------------------------------------------- +aks-nodepool1-12345 5 +aks-nodepool1-12346 4 +... +``` + +**Text with --by-component:** +Includes detailed pod-to-node mapping for each component. + +#### Important Notes + +- **Duration Calculation:** The tool estimates durations based on snapshot data from KubePodInventory. It assumes all pods in the snapshot were running for the entire time window. For more accurate pod lifecycle tracking, time-series data would be needed. +- **Component Identification:** Based on standard HPCC pod naming conventions. Non-HPCC pods are categorized as "Other". + +#### Dependencies + +- Python 3.7+ +- Standard library modules only (csv, datetime, collections) +- No Azure CLI required (works offline on saved CSV files) + +--- + ### analyze_thor_timeline.py Thor cluster utilization analysis and cost modeling from timeline data.