From 35a26e4d412933e27435f3c39918eb3be1e16f59 Mon Sep 17 00:00:00 2001
From: Gustavo Lima <gustcol@gmail.com>
Date: Wed, 25 Feb 2026 01:54:19 +0100
Subject: [PATCH 1/3] Add CPU-only partition support and GPU memory querying

Support CPU-only partitions by returning 0 from get_total_gpus_per_node
instead of defaulting to 8, and raise a clear ValueError when GPU
resources are requested on a partition with no GPUs.

Add gpu_mem() API and gpu-mem CLI command to query GPU VRAM via
nvidia-smi and rocm-smi, with a new GPUMemInfo NamedTuple following
existing patterns.

Closes #79, Closes #80
---
 clusterscope/__init__.py     |   2 +
 clusterscope/cli.py          |  13 ++++
 clusterscope/cluster_info.py | 133 +++++++++++++++++++++++++++++++--
 clusterscope/lib.py          |  13 ++++
 tests/test_cluster_info.py   | 139 ++++++++++++++++++++++++++++++++++-
 5 files changed, 293 insertions(+), 7 deletions(-)

diff --git a/clusterscope/__init__.py b/clusterscope/__init__.py
index 125360a..3f60f91 100644
--- a/clusterscope/__init__.py
+++ b/clusterscope/__init__.py
@@ -10,6 +10,7 @@
     cpus,
     get_job,
     get_tmp_dir,
+    gpu_mem,
     job_gen_task_slurm,
     local_node_gpu_generation_and_count,
     mem,
@@ -21,6 +22,7 @@
     "slurm_version",
     "cpus",
     "mem",
+    "gpu_mem",
     "local_node_gpu_generation_and_count",
     "get_job",
     "job_gen_task_slurm",
diff --git a/clusterscope/cli.py b/clusterscope/cli.py
index 4b00845..8de678e 100644
--- a/clusterscope/cli.py
+++ b/clusterscope/cli.py
@@ -103,6 +103,19 @@ def mem(partition: str):
         click.echo(f"{mem.mem_total_MB}, {mem.mem_total_GB}, {mem.partition}")
 
 
+@cli.command(name="gpu-mem")
+def gpu_mem():
+    """Show GPU memory per GPU model on the current node."""
+    unified_info = UnifiedInfo()
+    gpu_mem_info = unified_info.get_gpu_mem_MB()
+    if not gpu_mem_info:
+        click.echo("No GPU memory information found")
+        return
+    click.echo("GPU Gen, GPU Vendor, Mem total MB, Mem total GB:")
+    for gm in gpu_mem_info:
+        click.echo(f"{gm.gpu_gen}, {gm.vendor}, {gm.mem_total_MB}, {gm.mem_total_GB}")
+
+
 @cli.command()
 @click.option(
     "--partition",
diff --git a/clusterscope/cluster_info.py b/clusterscope/cluster_info.py
index 3a2009b..c6c012a 100644
--- a/clusterscope/cluster_info.py
+++ b/clusterscope/cluster_info.py
@@ -113,6 +113,16 @@ class GPUInfo(NamedTuple):
     partition: Optional[str] = None
 
 
+class GPUMemInfo(NamedTuple):
+    """Represents GPU memory information for a device."""
+
+    mem_total_MB: int
+    mem_total_GB: int
+    vendor: str
+    gpu_gen: str
+    partition: Optional[str] = None
+
+
 class MemInfo(NamedTuple):
     """Represents memory information for a host."""
 
@@ -228,6 +238,19 @@ def get_mem_per_node_MB(self) -> list[MemInfo] | MemInfo:
             return self.slurm_cluster_info.get_mem_per_node_MB()
         return self.local_node_info.get_mem_MB()
 
+    def get_gpu_mem_MB(self) -> list[GPUMemInfo]:
+        """Get GPU memory for each GPU model available on the current node.
+
+        GPU memory is a hardware property not exposed by sinfo, so this always
+        queries the local node via nvidia-smi or rocm-smi.
+
+        Returns:
+            list[GPUMemInfo]: GPU memory information per GPU model.
+        """
+        if self.has_nvidia_gpus or self.has_amd_gpus:
+            return self.local_node_info.get_gpu_mem_MB()
+        return []
+
     def get_gpu_generation_and_count(self) -> list[GPUInfo]:
         """Get the number of GPUs on the slurm cluster node.
 
@@ -258,18 +281,16 @@ def get_total_gpus_per_node(self) -> int:
         """Get the total number of GPUs available per node.
 
         Returns:
-            int: Total number of GPUs per node. Returns 8 as default if no GPUs are detected.
+            int: Total number of GPUs per node. Returns 0 for CPU-only partitions.
         """
         gpus = self.get_gpu_generation_and_count()
         if not gpus:
-            # Default to 8 if no GPUs detected (common configuration)
-            return 8
+            return 0
 
         # Use maximum GPU count across node types in the partition.
         # This handles heterogeneous partitions where some nodes may have
         # fewer GPUs than others (e.g., due to hardware issues).
-        max_gpus = max(g.gpu_count for g in gpus)
-        return max(max_gpus, 1)  # Ensure at least 1 to avoid division by zero
+        return max(g.gpu_count for g in gpus)
 
     def get_task_resource_requirements(
         self,
@@ -326,6 +347,12 @@ def get_task_resource_requirements(
         elif gpus_per_task is not None:
             total_gpus_per_node = self.get_total_gpus_per_node()
 
+            if total_gpus_per_node == 0:
+                raise ValueError(
+                    f"Partition '{partition}' has no GPUs. "
+                    "Use cpus_per_task for CPU-only partitions."
+                )
+
             cpu_cores_per_gpu = total_cpus_per_node.cpu_count / total_gpus_per_node
             total_required_cpu_cores_per_task = math.floor(
                 cpu_cores_per_gpu * gpus_per_task
@@ -570,6 +597,45 @@ def get_nvidia_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
         except RuntimeError as e:
             raise RuntimeError(f"Failed to get NVIDIA GPU information: {str(e)}")
 
+    def _get_nvidia_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
+        """Get NVIDIA GPU memory using nvidia-smi."""
+        try:
+            result = run_cli(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=gpu_name,memory.total",
+                    "--format=csv,noheader,nounits",
+                ],
+                text=True,
+                timeout=timeout,
+            )
+
+            gpu_mem: Dict[str, int] = defaultdict(int)
+            for line in result.strip().split("\n"):
+                if not line or ", " not in line:
+                    continue
+                gpu_name, mem_mb_str = line.rsplit(", ", 1)
+                gpu_name_upper = gpu_name.strip().upper()
+                gpu_gen = gpu_name_upper
+                for gpu_key, gpu_pattern in NVIDIA_GPU_TYPES.items():
+                    if gpu_pattern in gpu_name_upper:
+                        gpu_gen = gpu_key
+                        break
+                mem_mb = int(mem_mb_str.strip())
+                gpu_mem[gpu_gen] = max(gpu_mem[gpu_gen], mem_mb)
+
+            return [
+                GPUMemInfo(
+                    mem_total_MB=mem_mb,
+                    mem_total_GB=mem_mb // 1024,
+                    vendor="nvidia",
+                    gpu_gen=gpu_gen,
+                )
+                for gpu_gen, mem_mb in gpu_mem.items()
+            ]
+        except RuntimeError as e:
+            raise RuntimeError(f"Failed to get NVIDIA GPU memory: {str(e)}")
+
     def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
         """Get AMD GPU information using rocm-smi."""
         # Check if AMD GPUs are available
@@ -633,6 +699,38 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
         except RuntimeError as e:
             raise RuntimeError(f"Failed to get AMD GPU information: {str(e)}")
 
+    def _get_amd_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
+        """Get AMD GPU memory using rocm-smi."""
+        try:
+            result = run_cli(
+                ["rocm-smi", "--showmeminfo", "vram", "--json"],
+                text=True,
+                timeout=timeout,
+            )
+            data = json.loads(result)
+            gpu_mem: Dict[str, int] = defaultdict(int)
+            for card_data in data.values():
+                vram_total = card_data.get("VRAM Total Memory (B)", 0)
+                mem_mb = int(vram_total) // (1024 * 1024)
+                gpu_gen = "AMD"
+                for gpu_key, gpu_pattern in AMD_GPU_TYPES.items():
+                    if gpu_pattern in str(card_data):
+                        gpu_gen = gpu_key
+                        break
+                gpu_mem[gpu_gen] = max(gpu_mem[gpu_gen], mem_mb)
+
+            return [
+                GPUMemInfo(
+                    mem_total_MB=mem_mb,
+                    mem_total_GB=mem_mb // 1024,
+                    vendor="amd",
+                    gpu_gen=gpu_gen,
+                )
+                for gpu_gen, mem_mb in gpu_mem.items()
+            ]
+        except RuntimeError as e:
+            raise RuntimeError(f"Failed to get AMD GPU memory: {str(e)}")
+
     def get_gpu_generation_and_count(self, timeout: int = 60) -> list[GPUInfo]:
         """Get GPU information for all available GPUs on the local node.
 
@@ -666,6 +764,31 @@ def get_gpu_generation_and_count(self, timeout: int = 60) -> list[GPUInfo]:
 
         return gpu_info
 
+    def get_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
+        """Get GPU memory for all available GPUs on the local node.
+
+        Returns:
+            list[GPUMemInfo]: GPU memory information per GPU model.
+        """
+        gpu_mem_info = []
+
+        if self.has_nvidia_gpus():
+            try:
+                gpu_mem_info.extend(self._get_nvidia_gpu_mem_MB(timeout))
+            except RuntimeError as e:
+                logging.warning(f"Failed to get NVIDIA GPU memory: {e}")
+
+        if self.has_amd_gpus():
+            try:
+                gpu_mem_info.extend(self._get_amd_gpu_mem_MB(timeout))
+            except RuntimeError as e:
+                logging.warning(f"Failed to get AMD GPU memory: {e}")
+
+        if not gpu_mem_info:
+            logging.warning("No GPU memory information found")
+
+        return gpu_mem_info
+
     def has_gpu_type(self, gpu_type: str) -> bool:
         """Check if a specific GPU type is available on the local node.
 
diff --git a/clusterscope/lib.py b/clusterscope/lib.py
index 8cb4e93..14a678c 100644
--- a/clusterscope/lib.py
+++ b/clusterscope/lib.py
@@ -8,6 +8,7 @@
 from clusterscope.cluster_info import (
     CPUInfo,
     GPUInfo,
+    GPUMemInfo,
     LocalNodeInfo,
     MemInfo,
     UnifiedInfo,
@@ -101,6 +102,18 @@ def mem(
     return mem_info_list
 
 
+def gpu_mem() -> list[GPUMemInfo]:
+    """Get GPU memory for each GPU model on the current node.
+
+    Queries the local node via nvidia-smi (NVIDIA) or rocm-smi (AMD).
+    Returns an empty list on CPU-only nodes.
+
+    Returns:
+        list[GPUMemInfo]: GPU memory per GPU model.
+    """
+    return get_unified_info().get_gpu_mem_MB()
+
+
 def get_tmp_dir():
     tmp = get_unified_info().get_tmp_dir()
     return tmp
diff --git a/tests/test_cluster_info.py b/tests/test_cluster_info.py
index 660885f..6d59b3e 100644
--- a/tests/test_cluster_info.py
+++ b/tests/test_cluster_info.py
@@ -12,6 +12,7 @@
     CPUInfo,
     DarwinInfo,
     GPUInfo,
+    GPUMemInfo,
     LinuxInfo,
     LocalNodeInfo,
     MemInfo,
@@ -988,11 +989,11 @@ def test_get_total_gpus_per_node_with_gpus(self, mock_gpu_info):
 
     @patch.object(UnifiedInfo, "get_gpu_generation_and_count")
     def test_get_total_gpus_per_node_no_gpus_detected(self, mock_gpu_info):
-        """Test get_total_gpus_per_node defaults to 8 when no GPUs detected."""
+        """Test get_total_gpus_per_node returns 0 for CPU-only partitions."""
         mock_gpu_info.return_value = []
 
         result = self.unified_info.get_total_gpus_per_node()
-        self.assertEqual(result, 8)  # Default fallback
+        self.assertEqual(result, 0)  # CPU-only partition
 
     @patch.object(UnifiedInfo, "get_gpu_generation_and_count")
     def test_get_total_gpus_per_node_single_gpu_type(self, mock_gpu_info):
@@ -1097,5 +1098,139 @@ def test_get_task_resource_requirements_heterogeneous_gpu_partition(
         self.assertEqual(result.gpus_per_task, 8)
 
 
+class TestGetTotalGpusPerNodeCPUOnly(unittest.TestCase):
+    """Test get_total_gpus_per_node returns 0 for CPU-only partitions."""
+
+    def test_returns_zero_when_no_gpus(self):
+        unified_info = UnifiedInfo()
+        unified_info.is_slurm_cluster = False
+        unified_info.has_nvidia_gpus = False
+        unified_info.has_amd_gpus = False
+        self.assertEqual(unified_info.get_total_gpus_per_node(), 0)
+
+    @patch.object(UnifiedInfo, "get_gpu_generation_and_count")
+    def test_returns_max_gpu_count_when_gpus_present(self, mock_gpu):
+        mock_gpu.return_value = [
+            GPUInfo(gpu_gen="A100", gpu_count=8, vendor="nvidia"),
+        ]
+        unified_info = UnifiedInfo()
+        self.assertEqual(unified_info.get_total_gpus_per_node(), 8)
+
+    @patch.object(UnifiedInfo, "get_gpu_generation_and_count")
+    def test_returns_max_across_heterogeneous_nodes(self, mock_gpu):
+        mock_gpu.return_value = [
+            GPUInfo(gpu_gen="A100", gpu_count=8, vendor="nvidia"),
+            GPUInfo(gpu_gen="A100", gpu_count=4, vendor="nvidia"),
+        ]
+        unified_info = UnifiedInfo()
+        self.assertEqual(unified_info.get_total_gpus_per_node(), 8)
+
+
+class TestGetTaskResourceRequirementsCPUOnly(unittest.TestCase):
+    """Test GPU request on CPU-only partition raises ValueError."""
+
+    @patch.object(UnifiedInfo, "get_gpu_generation_and_count", return_value=[])
+    @patch.object(UnifiedInfo, "get_cpus_per_node")
+    @patch.object(UnifiedInfo, "get_mem_per_node_MB")
+    def test_gpu_request_on_cpu_only_partition_raises(
+        self, mock_mem, mock_cpu, mock_gpu
+    ):
+        mock_cpu.return_value = CPUInfo(cpu_count=64, partition="cpu_partition")
+        mock_mem.return_value = MemInfo(
+            mem_total_MB=512000, mem_total_GB=500, partition="cpu_partition"
+        )
+        unified_info = UnifiedInfo()
+        unified_info.is_slurm_cluster = False
+        with self.assertRaises(ValueError) as ctx:
+            unified_info.get_task_resource_requirements(
+                partition="cpu_partition",
+                gpus_per_task=1,
+            )
+        self.assertIn("no GPUs", str(ctx.exception))
+
+    @patch.object(UnifiedInfo, "get_cpus_per_node")
+    @patch.object(UnifiedInfo, "get_mem_per_node_MB")
+    def test_cpu_request_on_cpu_only_partition_succeeds(self, mock_mem, mock_cpu):
+        mock_cpu.return_value = CPUInfo(cpu_count=64, partition="cpu_partition")
+        mock_mem.return_value = MemInfo(
+            mem_total_MB=512000, mem_total_GB=500, partition="cpu_partition"
+        )
+        unified_info = UnifiedInfo()
+        unified_info.is_slurm_cluster = False
+        result = unified_info.get_task_resource_requirements(
+            partition="cpu_partition",
+            cpus_per_task=16,
+        )
+        self.assertIsInstance(result, ResourceShape)
+        self.assertEqual(result.cpus_per_task, 16)
+        self.assertIsNone(result.gpus_per_task)
+
+
+class TestLocalNodeInfoGPUMem(unittest.TestCase):
+    """Test GPU memory querying methods."""
+
+    def setUp(self):
+        self.local_node_info = LocalNodeInfo()
+
+    @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=True)
+    @patch("clusterscope.cluster_info.run_cli")
+    def test_get_nvidia_gpu_mem_MB(self, mock_run_cli, mock_has_nvidia):
+        mock_run_cli.return_value = (
+            "NVIDIA A100-SXM4-40GB, 40960\nNVIDIA A100-SXM4-40GB, 40960"
+        )
+        result = self.local_node_info._get_nvidia_gpu_mem_MB()
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0].gpu_gen, "A100")
+        self.assertEqual(result[0].mem_total_MB, 40960)
+        self.assertEqual(result[0].mem_total_GB, 40)
+        self.assertEqual(result[0].vendor, "nvidia")
+
+    @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=True)
+    @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=False)
+    @patch.object(LocalNodeInfo, "_get_nvidia_gpu_mem_MB")
+    def test_get_gpu_mem_MB_nvidia_only(
+        self, mock_nvidia_mem, mock_has_amd, mock_has_nvidia
+    ):
+        mock_nvidia_mem.return_value = [
+            GPUMemInfo(
+                mem_total_MB=40960, mem_total_GB=40, vendor="nvidia", gpu_gen="A100"
+            )
+        ]
+        result = self.local_node_info.get_gpu_mem_MB()
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0].gpu_gen, "A100")
+
+    @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=False)
+    @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=False)
+    def test_get_gpu_mem_MB_no_gpus(self, mock_has_amd, mock_has_nvidia):
+        result = self.local_node_info.get_gpu_mem_MB()
+        self.assertEqual(result, [])
+
+
+class TestUnifiedInfoGPUMem(unittest.TestCase):
+    """Test UnifiedInfo.get_gpu_mem_MB dispatches correctly."""
+
+    @patch.object(LocalNodeInfo, "get_gpu_mem_MB")
+    def test_returns_gpu_mem_when_nvidia_present(self, mock_gpu_mem):
+        mock_gpu_mem.return_value = [
+            GPUMemInfo(
+                mem_total_MB=81920, mem_total_GB=80, vendor="nvidia", gpu_gen="H100"
+            )
+        ]
+        unified_info = UnifiedInfo()
+        unified_info.has_nvidia_gpus = True
+        unified_info.has_amd_gpus = False
+        result = unified_info.get_gpu_mem_MB()
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0].gpu_gen, "H100")
+
+    def test_returns_empty_on_cpu_only_node(self):
+        unified_info = UnifiedInfo()
+        unified_info.has_nvidia_gpus = False
+        unified_info.has_amd_gpus = False
+        result = unified_info.get_gpu_mem_MB()
+        self.assertEqual(result, [])
+
+
 if __name__ == "__main__":
     unittest.main()

From ab420d57f64807e99bfaa1eaa88a219433e571c2 Mon Sep 17 00:00:00 2001
From: Gustavo Lima <gustcol@gmail.com>
Date: Wed, 25 Feb 2026 02:34:57 +0100
Subject: [PATCH 2/3] Fix pre-existing bugs and improve code robustness

- Remove self.partition mutation in get_task_resource_requirements that
  could cause stale query results on reused UnifiedInfo instances
- Fix unreachable RuntimeError after return in get_mem_per_node_MB and
  add proper empty-result guard
- Initialize gpu_gen variable in get_amd_gpu_info to prevent
  UnboundLocalError on unknown AMD GPU models
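- Simplify the GPU availability guards in get_nvidia_gpu_info and
  get_amd_gpu_info: raise RuntimeError as soon as has_nvidia_gpus() /
  has_amd_gpus() is false, instead of re-probing the CLI and falling
  through to the main query when that probe succeeded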
- Add missing partition validation in check-gpu CLI command
- Fix test_get_cluster_name to include all valid return values
- Fix _get_amd_gpu_mem_MB to query the product name separately, report
  one entry with the largest per-card VRAM, and return an empty list
  when no card reports non-zero VRAM
- Add test coverage for AMD GPU memory querying
- Add contributor and gpu-mem command to README
---
 README.md                    |  3 +-
 clusterscope/cli.py          |  2 +
 clusterscope/cluster_info.py | 77 +++++++++++++++++++-----------------
 tests/test_cluster_info.py   | 61 +++++++++++++++++++++++-----
 4 files changed, 96 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index d4c449d..a6ede95 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ Commands:
   aws        Check if running on AWS and show NCCL settings.
   check-gpu  Check if a specific GPU type exists.
   cpus       Show CPU counts per node.
+  gpu-mem    Show GPU memory per GPU model on the current node.
   gpus       Show GPU information.
   info       Show basic cluster information.
   job-gen    Generate job requirements for different job types.
@@ -55,7 +56,7 @@ clusterscope is actively maintained by [Lucca Bertoncini](https://github.com/luc
 
 ## Contributors
 
-[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), <Feel free to contribute and add your name>
+[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), [Gustavo Colombini](https://github.com/gustcol), <Feel free to contribute and add your name>
 
 ### License
 
diff --git a/clusterscope/cli.py b/clusterscope/cli.py
index 8de678e..168b73c 100644
--- a/clusterscope/cli.py
+++ b/clusterscope/cli.py
@@ -191,6 +191,8 @@ def check_gpu(gpu_type: str, partition: str):
 
     GPU_TYPE: GPU type to check for (e.g., A100, MI300X)
     """
+    if partition is not None:
+        validate_partition_exists(partition=partition, exit_on_error=True)
     unified_info = UnifiedInfo(partition=partition)
     has_gpu = unified_info.has_gpu_type(gpu_type)
     if has_gpu:
diff --git a/clusterscope/cluster_info.py b/clusterscope/cluster_info.py
index c6c012a..2b56e38 100644
--- a/clusterscope/cluster_info.py
+++ b/clusterscope/cluster_info.py
@@ -325,7 +325,6 @@ def get_task_resource_requirements(
         if tasks_per_node < 1:
             raise ValueError("tasks_per_node must be at least 1")
 
-        self.partition = partition
         cpus_per_node = self.get_cpus_per_node()
         total_cpus_per_node = (
             cpus_per_node[0] if isinstance(cpus_per_node, list) else cpus_per_node
@@ -535,17 +534,8 @@ def get_mem_MB(self, timeout: int = 60) -> MemInfo:
 
     def get_nvidia_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
         """Get NVIDIA GPU information using nvidia-smi."""
-        # Check if NVIDIA GPUs are available
         if not self.has_nvidia_gpus():
-            try:
-                # Try to run nvidia-smi command
-                result = run_cli(
-                    ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"],
-                    text=True,
-                    timeout=timeout,
-                )
-            except RuntimeError:
-                raise RuntimeError("No NVIDIA GPUs found")
+            raise RuntimeError("No NVIDIA GPUs found")
         try:
             result = run_cli(
                 ["nvidia-smi", "--query-gpu=gpu_name,count", "--format=csv,noheader"],
@@ -638,17 +628,8 @@ def _get_nvidia_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
 
     def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
         """Get AMD GPU information using rocm-smi."""
-        # Check if AMD GPUs are available
         if not self.has_amd_gpus():
-            try:
-                # Try to run rocm-smi command
-                result = run_cli(
-                    ["rocm-smi", "--showproductname"],
-                    text=True,
-                    timeout=timeout,
-                )
-            except RuntimeError:
-                raise RuntimeError("No AMD GPUs found")
+            raise RuntimeError("No AMD GPUs found")
         try:
             result = run_cli(
                 ["rocm-smi", "--showproductname"], text=True, timeout=timeout
@@ -668,6 +649,7 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
                         gpu_name_upper = gpu_name.upper()
 
                         # Check for known AMD GPU types
+                        gpu_gen = ""
                         found_gpu = False
                         for gpu_key, gpu_pattern in AMD_GPU_TYPES.items():
                             if gpu_pattern in gpu_name_upper:
@@ -688,7 +670,7 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
                                     found_gpu = True
                                     break
 
-                        if not found_gpu and gpu_gen is None:
+                        if not found_gpu and not gpu_gen:
                             gpu_gen = gpu_name_upper
 
                         gpu_info[gpu_gen] += 1
@@ -700,33 +682,51 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
             raise RuntimeError(f"Failed to get AMD GPU information: {str(e)}")
 
     def _get_amd_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
-        """Get AMD GPU memory using rocm-smi."""
+        """Get AMD GPU memory using rocm-smi.
+
+        Queries --showproductname for GPU generation and --showmeminfo for VRAM.
+        """
         try:
-            result = run_cli(
+            # Identify GPU generation from product name
+            name_result = run_cli(
+                ["rocm-smi", "--showproductname"], text=True, timeout=timeout
+            )
+            detected_gen = "AMD"
+            for line in name_result.strip().split("\n"):
+                if "GPU" in line and ":" in line:
+                    parts = line.split(":")
+                    if len(parts) >= 2:
+                        gpu_name_upper = parts[-1].strip().upper()
+                        for gpu_key, gpu_pattern in AMD_GPU_TYPES.items():
+                            if gpu_pattern in gpu_name_upper:
+                                detected_gen = gpu_key
+                                break
+                    if detected_gen != "AMD":
+                        break
+
+            # Get VRAM info
+            vram_result = run_cli(
                 ["rocm-smi", "--showmeminfo", "vram", "--json"],
                 text=True,
                 timeout=timeout,
             )
-            data = json.loads(result)
-            gpu_mem: Dict[str, int] = defaultdict(int)
+            data = json.loads(vram_result)
+            max_mem_mb = 0
             for card_data in data.values():
                 vram_total = card_data.get("VRAM Total Memory (B)", 0)
                 mem_mb = int(vram_total) // (1024 * 1024)
-                gpu_gen = "AMD"
-                for gpu_key, gpu_pattern in AMD_GPU_TYPES.items():
-                    if gpu_pattern in str(card_data):
-                        gpu_gen = gpu_key
-                        break
-                gpu_mem[gpu_gen] = max(gpu_mem[gpu_gen], mem_mb)
+                max_mem_mb = max(max_mem_mb, mem_mb)
+
+            if max_mem_mb == 0:
+                return []
 
             return [
                 GPUMemInfo(
-                    mem_total_MB=mem_mb,
-                    mem_total_GB=mem_mb // 1024,
+                    mem_total_MB=max_mem_mb,
+                    mem_total_GB=max_mem_mb // 1024,
                     vendor="amd",
-                    gpu_gen=gpu_gen,
+                    gpu_gen=detected_gen,
                 )
-                for gpu_gen, mem_mb in gpu_mem.items()
             ]
         except RuntimeError as e:
             raise RuntimeError(f"Failed to get AMD GPU memory: {str(e)}")
@@ -926,8 +926,11 @@ def get_mem_per_node_MB(self) -> list[MemInfo]:
                         partition=partition.strip("* "),
                     )
                 )
+            if not results:
+                raise RuntimeError(
+                    f"No mem information found in: {result.stdout}"
+                )
             return results
-            raise RuntimeError(f"No mem information found in: {result.stdout}")
         except (subprocess.SubprocessError, FileNotFoundError) as e:
             logging.error(f"Failed to get Slurm memory information: {str(e)}")
             raise RuntimeError(f"Failed to get Slurm memory information: {str(e)}")
diff --git a/tests/test_cluster_info.py b/tests/test_cluster_info.py
index 6d59b3e..babee87 100644
--- a/tests/test_cluster_info.py
+++ b/tests/test_cluster_info.py
@@ -28,12 +28,18 @@ class TestUnifiedInfo(unittest.TestCase):
     def test_get_cluster_name(self):
         unified_info = UnifiedInfo()
         unified_info.is_slurm_cluster = False
-        self.assertIn(unified_info.get_cluster_name(), ["local-node", "github"])
+        self.assertIn(
+            unified_info.get_cluster_name(),
+            ["local-node", "github", "macos", "mast"],
+        )
 
     def test_get_cluster_name_with_partition(self):
         unified_info = UnifiedInfo(partition="gpu_partition")
         unified_info.is_slurm_cluster = False
-        self.assertIn(unified_info.get_cluster_name(), ["local-node", "github"])
+        self.assertIn(
+            unified_info.get_cluster_name(),
+            ["local-node", "github", "macos", "mast"],
+        )
         self.assertEqual(unified_info.partition, "gpu_partition")
 
     def test_get_gpu_generation_and_count(self):
@@ -499,8 +505,9 @@ def test_has_amd_gpus_false_called_process_error(self, mock_run):
         mock_run.side_effect = subprocess.CalledProcessError(1, ["rocm-smi"])
         self.assertFalse(self.local_node_info.has_amd_gpus())
 
+    @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=True)
     @patch("clusterscope.cluster_info.run_cli")
-    def test_get_nvidia_gpu_info_success(self, mock_run_cli):
+    def test_get_nvidia_gpu_info_success(self, mock_run_cli, mock_has_nvidia):
         """Test successful NVIDIA GPU information retrieval."""
         mock_run_cli.return_value = "NVIDIA A100-SXM4-40GB, 2\nNVIDIA A100-SXM4-40GB, 2\nTesla V100-SXM2-16GB, 1"
 
@@ -511,8 +518,9 @@ def test_get_nvidia_gpu_info_success(self, mock_run_cli):
         ]
         self.assertEqual(result, expected)
 
+    @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=True)
     @patch("clusterscope.cluster_info.run_cli")
-    def test_get_nvidia_gpu_info_empty_lines(self, mock_run_cli):
+    def test_get_nvidia_gpu_info_empty_lines(self, mock_run_cli, mock_has_nvidia):
         """Test NVIDIA GPU info parsing with empty lines."""
         mock_run_cli.return_value = (
             "NVIDIA A100-SXM4-40GB, 1\n\n\nTesla V100-SXM2-16GB, 1\n"
@@ -525,8 +533,9 @@ def test_get_nvidia_gpu_info_empty_lines(self, mock_run_cli):
         ]
         self.assertEqual(result, expected)
 
+    @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True)
     @patch("clusterscope.cluster_info.run_cli")
-    def test_get_amd_gpu_info_mi300x(self, mock_run_cli):
+    def test_get_amd_gpu_info_mi300x(self, mock_run_cli, mock_has_amd):
         """Test AMD GPU information retrieval for MI300X."""
         mock_run_cli.return_value = """GPU[0]: AMD Instinct MI300X
 GPU[1]: AMD Instinct MI300X"""
@@ -535,8 +544,9 @@ def test_get_amd_gpu_info_mi300x(self, mock_run_cli):
         expected = [GPUInfo(gpu_gen="MI300X", gpu_count=2, vendor="amd")]
         self.assertEqual(result, expected)
 
+    @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True)
     @patch("clusterscope.cluster_info.run_cli")
-    def test_get_amd_gpu_info_mi300a(self, mock_run_cli):
+    def test_get_amd_gpu_info_mi300a(self, mock_run_cli, mock_has_amd):
         """Test AMD GPU information retrieval for MI300A."""
         mock_run_cli.return_value = "GPU[0]: AMD Instinct MI300A"
 
@@ -544,8 +554,9 @@ def test_get_amd_gpu_info_mi300a(self, mock_run_cli):
         expected = [GPUInfo(gpu_gen="MI300A", gpu_count=1, vendor="amd")]
         self.assertEqual(result, expected)
 
+    @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True)
     @patch("clusterscope.cluster_info.run_cli")
-    def test_get_amd_gpu_info_various_models(self, mock_run_cli):
+    def test_get_amd_gpu_info_various_models(self, mock_run_cli, mock_has_amd):
         """Test AMD GPU info parsing with various GPU models."""
         mock_run_cli.return_value = """GPU[0]: AMD Instinct MI250X
 GPU[1]: AMD Instinct MI210
@@ -579,8 +590,9 @@ def test_get_amd_gpu_info_various_models(self, mock_run_cli):
         for gpu in expected:
             self.assertIn(gpu, result)
 
+    @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True)
     @patch("clusterscope.cluster_info.run_cli")
-    def test_get_amd_gpu_info_generic_fallback(self, mock_run_cli):
+    def test_get_amd_gpu_info_generic_fallback(self, mock_run_cli, mock_has_amd):
         """Test AMD GPU info parsing with generic fallback for unknown models."""
         mock_run_cli.return_value = "GPU[0]: AMD Radeon RX 6800 XT"
 
@@ -589,8 +601,9 @@ def test_get_amd_gpu_info_generic_fallback(self, mock_run_cli):
         expected = [GPUInfo(gpu_gen="6800", gpu_count=1, vendor="amd", partition=None)]
         self.assertEqual(result, expected)
 
+    @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True)
     @patch("clusterscope.cluster_info.run_cli")
-    def test_get_amd_gpu_info_no_gpu_lines(self, mock_run_cli):
+    def test_get_amd_gpu_info_no_gpu_lines(self, mock_run_cli, mock_has_amd):
         """Test AMD GPU info parsing with no GPU lines."""
         mock_run_cli.return_value = "Some other output\nNo GPU information here"
 
@@ -1200,6 +1213,36 @@ def test_get_gpu_mem_MB_nvidia_only(
         self.assertEqual(len(result), 1)
         self.assertEqual(result[0].gpu_gen, "A100")
 
+    @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True)
+    @patch("clusterscope.cluster_info.run_cli")
+    def test_get_amd_gpu_mem_MB(self, mock_run_cli, mock_has_amd):
+        mock_run_cli.side_effect = [
+            # First call: --showproductname
+            "GPU[0]          : Card model:          AMD Instinct MI300X\n"
+            "GPU[1]          : Card model:          AMD Instinct MI300X\n",
+            # Second call: --showmeminfo vram --json
+            '{"card0": {"VRAM Total Memory (B)": 206158430208}, '
+            '"card1": {"VRAM Total Memory (B)": 206158430208}}',
+        ]
+        result = self.local_node_info._get_amd_gpu_mem_MB()
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0].gpu_gen, "MI300X")
+        self.assertEqual(result[0].mem_total_MB, 196608)
+        self.assertEqual(result[0].mem_total_GB, 192)
+        self.assertEqual(result[0].vendor, "amd")
+
+    @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=True)
+    @patch("clusterscope.cluster_info.run_cli")
+    def test_get_amd_gpu_mem_MB_zero_vram_returns_empty(
+        self, mock_run_cli, mock_has_amd
+    ):
+        mock_run_cli.side_effect = [
+            "GPU[0]          : Card model:          Unknown GPU\n",
+            '{"card0": {"VRAM Total Memory (B)": 0}}',
+        ]
+        result = self.local_node_info._get_amd_gpu_mem_MB()
+        self.assertEqual(result, [])
+
     @patch.object(LocalNodeInfo, "has_nvidia_gpus", return_value=False)
     @patch.object(LocalNodeInfo, "has_amd_gpus", return_value=False)
     def test_get_gpu_mem_MB_no_gpus(self, mock_has_amd, mock_has_nvidia):

From be6fcdbbfe6b29cad2780cd550c0dddc4732186b Mon Sep 17 00:00:00 2001
From: Gustavo Lima <gustcol@gmail.com>
Date: Wed, 25 Feb 2026 02:38:40 +0100
Subject: [PATCH 3/3] Update contributor name

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a6ede95..4fd76a1 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ clusterscope is actively maintained by [Lucca Bertoncini](https://github.com/luc
 
 ## Contributors
 
-[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), [Gustavo Colombini](https://github.com/gustcol), <Feel free to contribute and add your name>
+[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), [Gustavo Lima](https://github.com/gustcol), <Feel free to contribute and add your name>
 
 ### License