From 35a26e4d412933e27435f3c39918eb3be1e16f59 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Wed, 25 Feb 2026 01:54:19 +0100 Subject: [PATCH 1/3] Add CPU-only partition support and GPU memory querying Support CPU-only partitions by returning 0 from get_total_gpus_per_node instead of defaulting to 8, and raise a clear ValueError when GPU resources are requested on a partition with no GPUs. Add gpu_mem() API and gpu-mem CLI command to query GPU VRAM via nvidia-smi and rocm-smi, with a new GPUMemInfo NamedTuple following existing patterns. Closes #79, Closes #80 --- clusterscope/__init__.py | 2 + clusterscope/cli.py | 13 ++++ clusterscope/cluster_info.py | 133 +++++++++++++++++++++++++++++++-- clusterscope/lib.py | 13 ++++ tests/test_cluster_info.py | 139 ++++++++++++++++++++++++++++++++++- 5 files changed, 293 insertions(+), 7 deletions(-) diff --git a/clusterscope/__init__.py b/clusterscope/__init__.py index 125360a..3f60f91 100644 --- a/clusterscope/__init__.py +++ b/clusterscope/__init__.py @@ -10,6 +10,7 @@ cpus, get_job, get_tmp_dir, + gpu_mem, job_gen_task_slurm, local_node_gpu_generation_and_count, mem, @@ -21,6 +22,7 @@ "slurm_version", "cpus", "mem", + "gpu_mem", "local_node_gpu_generation_and_count", "get_job", "job_gen_task_slurm", diff --git a/clusterscope/cli.py b/clusterscope/cli.py index 4b00845..8de678e 100644 --- a/clusterscope/cli.py +++ b/clusterscope/cli.py @@ -103,6 +103,19 @@ def mem(partition: str): click.echo(f"{mem.mem_total_MB}, {mem.mem_total_GB}, {mem.partition}") +@cli.command(name="gpu-mem") +def gpu_mem(): + """Show GPU memory per GPU model on the current node.""" + unified_info = UnifiedInfo() + gpu_mem_info = unified_info.get_gpu_mem_MB() + if not gpu_mem_info: + click.echo("No GPU memory information found") + return + click.echo("GPU Gen, GPU Vendor, Mem total MB, Mem total GB:") + for gm in gpu_mem_info: + click.echo(f"{gm.gpu_gen}, {gm.vendor}, {gm.mem_total_MB}, {gm.mem_total_GB}") + + @cli.command() @click.option( "--partition", diff --git a/clusterscope/cluster_info.py b/clusterscope/cluster_info.py index 3a2009b..c6c012a 100644 --- a/clusterscope/cluster_info.py +++ b/clusterscope/cluster_info.py @@ -113,6 +113,16 @@ class GPUInfo(NamedTuple): partition: Optional[str] = None +class GPUMemInfo(NamedTuple): + """Represents GPU memory information for a device.""" + + mem_total_MB: int + mem_total_GB: int + vendor: str + gpu_gen: str + partition: Optional[str] = None + + class MemInfo(NamedTuple): """Represents memory information for a host.""" @@ -228,6 +238,19 @@ def get_mem_per_node_MB(self) -> list[MemInfo] | MemInfo: return self.slurm_cluster_info.get_mem_per_node_MB() return self.local_node_info.get_mem_MB() + def get_gpu_mem_MB(self) -> list[GPUMemInfo]: + """Get GPU memory for each GPU model available on the current node. + + GPU memory is a hardware property not exposed by sinfo, so this always + queries the local node via nvidia-smi or rocm-smi. + + Returns: + list[GPUMemInfo]: GPU memory information per GPU model. + """ + if self.has_nvidia_gpus or self.has_amd_gpus: + return self.local_node_info.get_gpu_mem_MB() + return [] + def get_gpu_generation_and_count(self) -> list[GPUInfo]: """Get the number of GPUs on the slurm cluster node. @@ -258,18 +281,16 @@ def get_total_gpus_per_node(self) -> int: """Get the total number of GPUs available per node. Returns: - int: Total number of GPUs per node. Returns 8 as default if no GPUs are detected. + int: Total number of GPUs per node. Returns 0 for CPU-only partitions. """ gpus = self.get_gpu_generation_and_count() if not gpus: - # Default to 8 if no GPUs detected (common configuration) - return 8 + return 0 # Use maximum GPU count across node types in the partition. # This handles heterogeneous partitions where some nodes may have # fewer GPUs than others (e.g., due to hardware issues). - max_gpus = max(g.gpu_count for g in gpus) - return max(max_gpus, 1) # Ensure at least 1 to avoid division by zero + return max(g.gpu_count for g in gpus) def get_task_resource_requirements( self, @@ -326,6 +347,12 @@ def get_task_resource_requirements( elif gpus_per_task is not None: total_gpus_per_node = self.get_total_gpus_per_node() + if total_gpus_per_node == 0: + raise ValueError( + f"Partition '{partition}' has no GPUs. " + "Use cpus_per_task for CPU-only partitions." + ) + cpu_cores_per_gpu = total_cpus_per_node.cpu_count / total_gpus_per_node total_required_cpu_cores_per_task = math.floor( cpu_cores_per_gpu * gpus_per_task @@ -570,6 +597,45 @@ def get_nvidia_gpu_info(self, timeout: int = 60) -> list[GPUInfo]: except RuntimeError as e: raise RuntimeError(f"Failed to get NVIDIA GPU information: {str(e)}") + def _get_nvidia_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]: + """Get NVIDIA GPU memory using nvidia-smi.""" + try: + result = run_cli( + [ + "nvidia-smi", + "--query-gpu=gpu_name,memory.total", + "--format=csv,noheader,nounits", + ], + text=True, + timeout=timeout, + ) + + gpu_mem: Dict[str, int] = defaultdict(int) + for line in result.strip().split("\n"): + if not line or ", " not in line: + continue + gpu_name, mem_mb_str = line.rsplit(", ", 1) + gpu_name_upper = gpu_name.strip().upper() + gpu_gen = gpu_name_upper + for gpu_key, gpu_pattern in NVIDIA_GPU_TYPES.items(): + if gpu_pattern in gpu_name_upper: + gpu_gen = gpu_key + break + mem_mb = int(mem_mb_str.strip()) + gpu_mem[gpu_gen] = max(gpu_mem[gpu_gen], mem_mb) + + return [ + GPUMemInfo( + mem_total_MB=mem_mb, + mem_total_GB=mem_mb // 1024, + vendor="nvidia", + gpu_gen=gpu_gen, + ) + for gpu_gen, mem_mb in gpu_mem.items() + ] + except RuntimeError as e: + raise RuntimeError(f"Failed to get NVIDIA GPU memory: {str(e)}") + def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]: """Get AMD GPU information using rocm-smi.""" # Check if AMD GPUs are available @@ -633,6 +699,38 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]: except RuntimeError as e: raise RuntimeError(f"Failed to get AMD GPU information: {str(e)}") + def _get_amd_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]: + """Get AMD GPU memory using rocm-smi.""" + try: + result = run_cli( + ["rocm-smi", "--showmeminfo", "vram", "--json"], + text=True, + timeout=timeout, + ) + data = json.loads(result) + gpu_mem: Dict[str, int] = defaultdict(int) + for card_data in data.values(): + vram_total = card_data.get("VRAM Total Memory (B)", 0) + mem_mb = int(vram_total) // (1024 * 1024) + gpu_gen = "AMD" + for gpu_key, gpu_pattern in AMD_GPU_TYPES.items(): + if gpu_pattern in str(card_data): + gpu_gen = gpu_key + break + gpu_mem[gpu_gen] = max(gpu_mem[gpu_gen], mem_mb) + + return [ + GPUMemInfo( + mem_total_MB=mem_mb, + mem_total_GB=mem_mb // 1024, + vendor="amd", + gpu_gen=gpu_gen, + ) + for gpu_gen, mem_mb in gpu_mem.items() + ] + except RuntimeError as e: + raise RuntimeError(f"Failed to get AMD GPU memory: {str(e)}") + def get_gpu_generation_and_count(self, timeout: int = 60) -> list[GPUInfo]: """Get GPU information for all available GPUs on the local node. @@ -666,6 +764,31 @@ def get_gpu_generation_and_count(self, timeout: int = 60) -> list[GPUInfo]: return gpu_info + def get_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]: + """Get GPU memory for all available GPUs on the local node. + + Returns: + list[GPUMemInfo]: GPU memory information per GPU model. + """ + gpu_mem_info = [] + + if self.has_nvidia_gpus(): + try: + gpu_mem_info.extend(self._get_nvidia_gpu_mem_MB(timeout)) + except RuntimeError as e: + logging.warning(f"Failed to get NVIDIA GPU memory: {e}") + + if self.has_amd_gpus(): + try: + gpu_mem_info.extend(self._get_amd_gpu_mem_MB(timeout)) + except RuntimeError as e: + logging.warning(f"Failed to get AMD GPU memory: {e}") + + if not gpu_mem_info: + logging.warning("No GPU memory information found") + + return gpu_mem_info + def has_gpu_type(self, gpu_type: str) -> bool: """Check if a specific GPU type is available on the local node. diff --git a/clusterscope/lib.py b/clusterscope/lib.py index 8cb4e93..14a678c 100644 --- a/clusterscope/lib.py +++ b/clusterscope/lib.py @@ -8,6 +8,7 @@ from clusterscope.cluster_info import ( CPUInfo, GPUInfo, + GPUMemInfo, LocalNodeInfo, MemInfo, UnifiedInfo, @@ -101,6 +102,18 @@ def mem( return mem_info_list +def gpu_mem() -> list[GPUMemInfo]: + """Get GPU memory for each GPU model on the current node. + + Queries the local node via nvidia-smi (NVIDIA) or rocm-smi (AMD). + Returns an empty list on CPU-only nodes. + + Returns: + list[GPUMemInfo]: GPU memory per GPU model. + """ + return get_unified_info().get_gpu_mem_MB() + + def get_tmp_dir(): tmp = get_unified_info().get_tmp_dir() return tmp diff --git a/tests/test_cluster_info.py b/tests/test_cluster_info.py index 660885f..6d59b3e 100644 --- a/tests/test_cluster_info.py +++ b/tests/test_cluster_info.py @@ -12,6 +12,7 @@ CPUInfo, DarwinInfo, GPUInfo, + GPUMemInfo, LinuxInfo, LocalNodeInfo, MemInfo, @@ -988,11 +989,11 @@ def test_get_total_gpus_per_node_with_gpus(self, mock_gpu_info): @patch.object(UnifiedInfo, "get_gpu_generation_and_count") def test_get_total_gpus_per_node_no_gpus_detected(self, mock_gpu_info): - """Test get_total_gpus_per_node defaults to 8 when no GPUs detected.""" + """Test get_total_gpus_per_node returns 0 for CPU-only partitions.""" mock_gpu_info.return_value = [] result = self.unified_info.get_total_gpus_per_node() - self.assertEqual(result, 8) # Default fallback + self.assertEqual(result, 0) # CPU-only partition @patch.object(UnifiedInfo, "get_gpu_generation_and_count") def test_get_total_gpus_per_node_single_gpu_type(self, mock_gpu_info): @@ -1097,5 +1098,139 @@ def test_get_task_resource_requirements_heterogeneous_gpu_partition( self.assertEqual(result.gpus_per_task, 8) +class TestGetTotalGpusPerNodeCPUOnly(unittest.TestCase): + """Test get_total_gpus_per_node returns 0 for CPU-only partitions.""" + + def test_returns_zero_when_no_gpus(self): + unified_info = UnifiedInfo() + unified_info.is_slurm_cluster = False + unified_info.has_nvidia_gpus = False + unified_info.has_amd_gpus = False + self.assertEqual(unified_info.get_total_gpus_per_node(), 0) + + @patch.object(UnifiedInfo, "get_gpu_generation_and_count") + def test_returns_max_gpu_count_when_gpus_present(self, mock_gpu): + mock_gpu.return_value = [ + GPUInfo(gpu_gen="A100", gpu_count=8, vendor="nvidia"), + ] + unified_info = UnifiedInfo() + self.assertEqual(unified_info.get_total_gpus_per_node(), 8) + + @patch.object(UnifiedInfo, "get_gpu_generation_and_count") + def test_returns_max_across_heterogeneous_nodes(self, mock_gpu): + mock_gpu.return_value = [ + GPUInfo(gpu_gen="A100", gpu_count=8, vendor="nvidia"), + GPUInfo(gpu_gen="A100", gpu_count=4, vendor="nvidia"), + ] + unified_info = UnifiedInfo() + self.assertEqual(unified_info.get_total_gpus_per_node(), 8) + + +class TestGetTaskResourceRequirementsCPUOnly(unittest.TestCase): + """Test GPU request on CPU-only partition raises ValueError.""" + + @patch.object(UnifiedInfo, "get_gpu_generation_and_count", return_value=[]) + @patch.object(UnifiedInfo, "get_cpus_per_node") + @patch.object(UnifiedInfo, "get_mem_per_node_MB") + def test_gpu_request_on_cpu_only_partition_raises( + self, mock_mem, mock_cpu, mock_gpu + ): + mock_cpu.return_value = CPUInfo(cpu_count=64, partition="cpu_partition") + mock_mem.return_value = MemInfo( + mem_total_MB=512000, mem_total_GB=500, partition="cpu_partition" + ) + unified_info = UnifiedInfo() + unified_info.is_slurm_cluster = False + with self.assertRaises(ValueError) as ctx: + unified_info.get_task_resource_requirements( + partition="cpu_partition", + gpus_per_task=1, + ) + self.assertIn("no GPUs", str(ctx.exception)) + + @patch.object(UnifiedInfo, "get_cpus_per_node") + @patch.object(UnifiedInfo, "get_mem_per_node_MB") + def test_cpu_request_on_cpu_only_partition_succeeds(self, mock_mem, mock_cpu): + mock_cpu.return_value = CPUInfo(cpu_count=64, partition="cpu_partition") + mock_mem.return_value = MemInfo( + mem_total_MB=512000, mem_total_GB=500, partition="cpu_partition" + ) + unified_info = UnifiedInfo() + unified_info.is_slurm_cluster = False + result = unified_info.get_task_resource_requirements( + partition="cpu_partition", + cpus_per_task=16, + ) + self.assertIsInstance(result, ResourceShape) + self.assertEqual(result.cpus_per_task, 16) + self.assertIsNone(result.gpus_per_task) + + +class TestLocalNodeInfoGPUMem(unittest.TestCase): + """Test GPU memory querying methods.""" + + def setUp(self): + self.local_node_info = LocalNodeInfo() + + @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=True) + @patch("clusterscope.cluster_info.run_cli") + def test_get_nvidia_gpu_mem_MB(self, mock_run_cli, mock_has_nvidia): + mock_run_cli.return_value = ( + "NVIDIA A100-SXM4-40GB, 40960\nNVIDIA A100-SXM4-40GB, 40960" + ) + result = self.local_node_info._get_nvidia_gpu_mem_MB() + self.assertEqual(len(result), 1) + self.assertEqual(result[0].gpu_gen, "A100") + self.assertEqual(result[0].mem_total_MB, 40960) + self.assertEqual(result[0].mem_total_GB, 40) + self.assertEqual(result[0].vendor, "nvidia") + + @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=True) + @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=False) + @patch.object(LocalNodeInfo, "_get_nvidia_gpu_mem_MB") + def test_get_gpu_mem_MB_nvidia_only( + self, mock_nvidia_mem, mock_has_amd, mock_has_nvidia + ): + mock_nvidia_mem.return_value = [ + GPUMemInfo( + mem_total_MB=40960, mem_total_GB=40, vendor="nvidia", gpu_gen="A100" + ) + ] + result = self.local_node_info.get_gpu_mem_MB() + self.assertEqual(len(result), 1) + self.assertEqual(result[0].gpu_gen, "A100") + + @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=False) + @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=False) + def test_get_gpu_mem_MB_no_gpus(self, mock_has_amd, mock_has_nvidia): + result = self.local_node_info.get_gpu_mem_MB() + self.assertEqual(result, []) + + +class TestUnifiedInfoGPUMem(unittest.TestCase): + """Test UnifiedInfo.get_gpu_mem_MB dispatches correctly.""" + + @patch.object(LocalNodeInfo, "get_gpu_mem_MB") + def test_returns_gpu_mem_when_nvidia_present(self, mock_gpu_mem): + mock_gpu_mem.return_value = [ + GPUMemInfo( + mem_total_MB=81920, mem_total_GB=80, vendor="nvidia", gpu_gen="H100" + ) + ] + unified_info = UnifiedInfo() + unified_info.has_nvidia_gpus = True + unified_info.has_amd_gpus = False + result = unified_info.get_gpu_mem_MB() + self.assertEqual(len(result), 1) + self.assertEqual(result[0].gpu_gen, "H100") + + def test_returns_empty_on_cpu_only_node(self): + unified_info = UnifiedInfo() + unified_info.has_nvidia_gpus = False + unified_info.has_amd_gpus = False + result = unified_info.get_gpu_mem_MB() + self.assertEqual(result, []) + + if __name__ == "__main__": unittest.main() From ab420d57f64807e99bfaa1eaa88a219433e571c2 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Wed, 25 Feb 2026 02:34:57 +0100 Subject: [PATCH 2/3] Fix pre-existing bugs and improve code robustness - Remove self.partition mutation in get_task_resource_requirements that could cause stale query results on reused UnifiedInfo instances - Fix unreachable RuntimeError after return in get_mem_per_node_MB and add proper empty-result guard - Initialize gpu_gen variable in get_amd_gpu_info to prevent UnboundLocalError on unknown AMD GPU models - Simplify broken GPU availability guards in get_nvidia_gpu_info and get_amd_gpu_info that silently fell through on success - Add missing partition validation in check-gpu CLI command - Fix test_get_cluster_name to include all valid return values - Fix _get_amd_gpu_mem_MB to query product name separately and filter zero-byte VRAM entries - Add test coverage for AMD GPU memory querying - Add contributor and gpu-mem command to README --- README.md | 3 +- clusterscope/cli.py | 2 + clusterscope/cluster_info.py | 77 +++++++++++++++++++----------------- tests/test_cluster_info.py | 61 +++++++++++++++++++++++----- 4 files changed, 96 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index d4c449d..a6ede95 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ Commands: aws Check if running on AWS and show NCCL settings. check-gpu Check if a specific GPU type exists. cpus Show CPU counts per node. + gpu-mem Show GPU memory per GPU model on the current node. gpus Show GPU information. info Show basic cluster information. job-gen Generate job requirements for different job types. @@ -55,7 +56,7 @@ clusterscope is actively maintained by [Lucca Bertoncini](https://github.com/luc ## Contributors -[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), +[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), [Gustavo Colombini](https://github.com/gustcol), ### License diff --git a/clusterscope/cli.py b/clusterscope/cli.py index 8de678e..168b73c 100644 --- a/clusterscope/cli.py +++ b/clusterscope/cli.py @@ -191,6 +191,8 @@ def check_gpu(gpu_type: str, partition: str): GPU_TYPE: GPU type to check for (e.g., A100, MI300X) """ + if partition is not None: + validate_partition_exists(partition=partition, exit_on_error=True) unified_info = UnifiedInfo(partition=partition) has_gpu = unified_info.has_gpu_type(gpu_type) if has_gpu: diff --git a/clusterscope/cluster_info.py b/clusterscope/cluster_info.py index c6c012a..2b56e38 100644 --- a/clusterscope/cluster_info.py +++ b/clusterscope/cluster_info.py @@ -325,7 +325,6 @@ def get_task_resource_requirements( if tasks_per_node < 1: raise ValueError("tasks_per_node must be at least 1") - self.partition = partition cpus_per_node = self.get_cpus_per_node() total_cpus_per_node = ( cpus_per_node[0] if isinstance(cpus_per_node, list) else cpus_per_node @@ -535,17 +534,8 @@ def get_mem_MB(self, timeout: int = 60) -> MemInfo: def get_nvidia_gpu_info(self, timeout: int = 60) -> list[GPUInfo]: """Get NVIDIA GPU information using nvidia-smi.""" - # Check if NVIDIA GPUs are available if not self.has_nvidia_gpus(): - try: - # Try to run nvidia-smi command - result = run_cli( - ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"], - text=True, - timeout=timeout, - ) - except RuntimeError: - raise RuntimeError("No NVIDIA GPUs found") + raise RuntimeError("No NVIDIA GPUs found") try: result = run_cli( ["nvidia-smi", "--query-gpu=gpu_name,count", "--format=csv,noheader"], @@ -638,17 +628,8 @@ def _get_nvidia_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]: def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]: """Get AMD GPU information using rocm-smi.""" - # Check if AMD GPUs are available if not self.has_amd_gpus(): - try: - # Try to run rocm-smi command - result = run_cli( - ["rocm-smi", "--showproductname"], - text=True, - timeout=timeout, - ) - except RuntimeError: - raise RuntimeError("No AMD GPUs found") + raise RuntimeError("No AMD GPUs found") try: result = run_cli( ["rocm-smi", "--showproductname"], text=True, timeout=timeout @@ -668,6 +649,7 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]: gpu_name_upper = gpu_name.upper() # Check for known AMD GPU types + gpu_gen = "" found_gpu = False for gpu_key, gpu_pattern in AMD_GPU_TYPES.items(): if gpu_pattern in gpu_name_upper: @@ -688,7 +670,7 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]: found_gpu = True break - if not found_gpu and gpu_gen is None: + if not found_gpu and not gpu_gen: gpu_gen = gpu_name_upper gpu_info[gpu_gen] += 1 @@ -700,33 +682,51 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]: raise RuntimeError(f"Failed to get AMD GPU information: {str(e)}") def _get_amd_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]: - """Get AMD GPU memory using rocm-smi.""" + """Get AMD GPU memory using rocm-smi. + + Queries --showproductname for GPU generation and --showmeminfo for VRAM. + """ try: - result = run_cli( + # Identify GPU generation from product name + name_result = run_cli( + ["rocm-smi", "--showproductname"], text=True, timeout=timeout + ) + detected_gen = "AMD" + for line in name_result.strip().split("\n"): + if "GPU" in line and ":" in line: + parts = line.split(":") + if len(parts) >= 2: + gpu_name_upper = parts[-1].strip().upper() + for gpu_key, gpu_pattern in AMD_GPU_TYPES.items(): + if gpu_pattern in gpu_name_upper: + detected_gen = gpu_key + break + if detected_gen != "AMD": + break + + # Get VRAM info + vram_result = run_cli( ["rocm-smi", "--showmeminfo", "vram", "--json"], text=True, timeout=timeout, ) - data = json.loads(result) - gpu_mem: Dict[str, int] = defaultdict(int) + data = json.loads(vram_result) + max_mem_mb = 0 for card_data in data.values(): vram_total = card_data.get("VRAM Total Memory (B)", 0) mem_mb = int(vram_total) // (1024 * 1024) - gpu_gen = "AMD" - for gpu_key, gpu_pattern in AMD_GPU_TYPES.items(): - if gpu_pattern in str(card_data): - gpu_gen = gpu_key - break - gpu_mem[gpu_gen] = max(gpu_mem[gpu_gen], mem_mb) + max_mem_mb = max(max_mem_mb, mem_mb) + + if max_mem_mb == 0: + return [] return [ GPUMemInfo( - mem_total_MB=mem_mb, - mem_total_GB=mem_mb // 1024, + mem_total_MB=max_mem_mb, + mem_total_GB=max_mem_mb // 1024, vendor="amd", - gpu_gen=gpu_gen, + gpu_gen=detected_gen, ) - for gpu_gen, mem_mb in gpu_mem.items() ] except RuntimeError as e: raise RuntimeError(f"Failed to get AMD GPU memory: {str(e)}") @@ -926,8 +926,11 @@ def get_mem_per_node_MB(self) -> list[MemInfo]: partition=partition.strip("* "), ) ) + if not results: + raise RuntimeError( + f"No mem information found in: {result.stdout}" + ) return results - raise RuntimeError(f"No mem information found in: {result.stdout}") except (subprocess.SubprocessError, FileNotFoundError) as e: logging.error(f"Failed to get Slurm memory information: {str(e)}") raise RuntimeError(f"Failed to get Slurm memory information: {str(e)}") diff --git a/tests/test_cluster_info.py b/tests/test_cluster_info.py index 6d59b3e..babee87 100644 --- a/tests/test_cluster_info.py +++ b/tests/test_cluster_info.py @@ -28,12 +28,18 @@ class TestUnifiedInfo(unittest.TestCase): def test_get_cluster_name(self): unified_info = UnifiedInfo() unified_info.is_slurm_cluster = False - self.assertIn(unified_info.get_cluster_name(), ["local-node", "github"]) + self.assertIn( + unified_info.get_cluster_name(), + ["local-node", "github", "macos", "mast"], + ) def test_get_cluster_name_with_partition(self): unified_info = UnifiedInfo(partition="gpu_partition") unified_info.is_slurm_cluster = False - self.assertIn(unified_info.get_cluster_name(), ["local-node", "github"]) + self.assertIn( + unified_info.get_cluster_name(), + ["local-node", "github", "macos", "mast"], + ) self.assertEqual(unified_info.partition, "gpu_partition") def test_get_gpu_generation_and_count(self): @@ -499,8 +505,9 @@ def test_has_amd_gpus_false_called_process_error(self, mock_run): mock_run.side_effect = subprocess.CalledProcessError(1, ["rocm-smi"]) self.assertFalse(self.local_node_info.has_amd_gpus()) + @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=True) @patch("clusterscope.cluster_info.run_cli") - def test_get_nvidia_gpu_info_success(self, mock_run_cli): + def test_get_nvidia_gpu_info_success(self, mock_run_cli, mock_has_nvidia): """Test successful NVIDIA GPU information retrieval.""" mock_run_cli.return_value = "NVIDIA A100-SXM4-40GB, 2\nNVIDIA A100-SXM4-40GB, 2\nTesla V100-SXM2-16GB, 1" @@ -511,8 +518,9 @@ def test_get_nvidia_gpu_info_success(self, mock_run_cli): ] self.assertEqual(result, expected) + @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=True) @patch("clusterscope.cluster_info.run_cli") - def test_get_nvidia_gpu_info_empty_lines(self, mock_run_cli): + def test_get_nvidia_gpu_info_empty_lines(self, mock_run_cli, mock_has_nvidia): """Test NVIDIA GPU info parsing with empty lines.""" mock_run_cli.return_value = ( "NVIDIA A100-SXM4-40GB, 1\n\n\nTesla V100-SXM2-16GB, 1\n" @@ -525,8 +533,9 @@ def test_get_nvidia_gpu_info_empty_lines(self, mock_run_cli): ] self.assertEqual(result, expected) + @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True) @patch("clusterscope.cluster_info.run_cli") - def test_get_amd_gpu_info_mi300x(self, mock_run_cli): + def test_get_amd_gpu_info_mi300x(self, mock_run_cli, mock_has_amd): """Test AMD GPU information retrieval for MI300X.""" mock_run_cli.return_value = """GPU[0]: AMD Instinct MI300X GPU[1]: AMD Instinct MI300X""" @@ -535,8 +544,9 @@ def test_get_amd_gpu_info_mi300x(self, mock_run_cli): expected = [GPUInfo(gpu_gen="MI300X", gpu_count=2, vendor="amd")] self.assertEqual(result, expected) + @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True) @patch("clusterscope.cluster_info.run_cli") - def test_get_amd_gpu_info_mi300a(self, mock_run_cli): + def test_get_amd_gpu_info_mi300a(self, mock_run_cli, mock_has_amd): """Test AMD GPU information retrieval for MI300A.""" mock_run_cli.return_value = "GPU[0]: AMD Instinct MI300A" @@ -544,8 +554,9 @@ def test_get_amd_gpu_info_mi300a(self, mock_run_cli): expected = [GPUInfo(gpu_gen="MI300A", gpu_count=1, vendor="amd")] self.assertEqual(result, expected) + @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True) @patch("clusterscope.cluster_info.run_cli") - def test_get_amd_gpu_info_various_models(self, mock_run_cli): + def test_get_amd_gpu_info_various_models(self, mock_run_cli, mock_has_amd): """Test AMD GPU info parsing with various GPU models.""" mock_run_cli.return_value = """GPU[0]: AMD Instinct MI250X GPU[1]: AMD Instinct MI210 @@ -579,8 +590,9 @@ def test_get_amd_gpu_info_various_models(self, mock_run_cli): for gpu in expected: self.assertIn(gpu, result) + @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True) @patch("clusterscope.cluster_info.run_cli") - def test_get_amd_gpu_info_generic_fallback(self, mock_run_cli): + def test_get_amd_gpu_info_generic_fallback(self, mock_run_cli, mock_has_amd): """Test AMD GPU info parsing with generic fallback for unknown models.""" mock_run_cli.return_value = "GPU[0]: AMD Radeon RX 6800 XT" @@ -589,8 +601,9 @@ def test_get_amd_gpu_info_generic_fallback(self, mock_run_cli): expected = [GPUInfo(gpu_gen="6800", gpu_count=1, vendor="amd", partition=None)] self.assertEqual(result, expected) + @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True) @patch("clusterscope.cluster_info.run_cli") - def test_get_amd_gpu_info_no_gpu_lines(self, mock_run_cli): + def test_get_amd_gpu_info_no_gpu_lines(self, mock_run_cli, mock_has_amd): """Test AMD GPU info parsing with no GPU lines.""" mock_run_cli.return_value = "Some other output\nNo GPU information here" @@ -1200,6 +1213,36 @@ def test_get_gpu_mem_MB_nvidia_only( self.assertEqual(len(result), 1) self.assertEqual(result[0].gpu_gen, "A100") + @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True) + @patch("clusterscope.cluster_info.run_cli") + def test_get_amd_gpu_mem_MB(self, mock_run_cli, mock_has_amd): + mock_run_cli.side_effect = [ + # First call: --showproductname + "GPU[0] : Card model: AMD Instinct MI300X\n" + "GPU[1] : Card model: AMD Instinct MI300X\n", + # Second call: --showmeminfo vram --json + '{"card0": {"VRAM Total Memory (B)": 206158430208}, ' + '"card1": {"VRAM Total Memory (B)": 206158430208}}', + ] + result = self.local_node_info._get_amd_gpu_mem_MB() + self.assertEqual(len(result), 1) + self.assertEqual(result[0].gpu_gen, "MI300X") + self.assertEqual(result[0].mem_total_MB, 196608) + self.assertEqual(result[0].mem_total_GB, 192) + self.assertEqual(result[0].vendor, "amd") + + @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True) + @patch("clusterscope.cluster_info.run_cli") + def test_get_amd_gpu_mem_MB_zero_vram_returns_empty( + self, mock_run_cli, mock_has_amd + ): + mock_run_cli.side_effect = [ + "GPU[0] : Card model: Unknown GPU\n", + '{"card0": {"VRAM Total Memory (B)": 0}}', + ] + result = self.local_node_info._get_amd_gpu_mem_MB() + self.assertEqual(result, []) + @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=False) @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=False) def test_get_gpu_mem_MB_no_gpus(self, mock_has_amd, mock_has_nvidia): From be6fcdbbfe6b29cad2780cd550c0dddc4732186b Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Wed, 25 Feb 2026 02:38:40 +0100 Subject: [PATCH 3/3] Update contributor name --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6ede95..4fd76a1 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ clusterscope is actively maintained by [Lucca Bertoncini](https://github.com/luc ## Contributors -[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), [Gustavo Colombini](https://github.com/gustcol), +[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), [Gustavo Lima](https://github.com/gustcol), ### License