facebookresearch · gustcol · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026
@@ -34,6 +34,7 @@ Commands:
   aws        Check if running on AWS and show NCCL settings.
   check-gpu  Check if a specific GPU type exists.
   cpus       Show CPU counts per node.
+  gpu-mem    Show GPU memory per GPU model on the current node.
   gpus       Show GPU information.
   info       Show basic cluster information.
   job-gen    Generate job requirements for different job types.
@@ -55,7 +56,7 @@ clusterscope is actively maintained by [Lucca Bertoncini](https://github.com/luc
 
 ## Contributors
 
-[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), <Feel free to contribute and add your name>
+[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), [Gustavo Lima](https://github.com/gustcol), <Feel free to contribute and add your name>
 
 ### License
 

@@ -10,6 +10,7 @@
     cpus,
     get_job,
     get_tmp_dir,
+    gpu_mem,
     job_gen_task_slurm,
     local_node_gpu_generation_and_count,
     mem,
@@ -21,6 +22,7 @@
     "slurm_version",
     "cpus",
     "mem",
+    "gpu_mem",
     "local_node_gpu_generation_and_count",
     "get_job",
     "job_gen_task_slurm",

@@ -103,6 +103,19 @@ def mem(partition: str):
         click.echo(f"{mem.mem_total_MB}, {mem.mem_total_GB}, {mem.partition}")
 
 
+@cli.command(name="gpu-mem")
+def gpu_mem():
+    """Show GPU memory per GPU model on the current node."""
+    unified_info = UnifiedInfo()
+    gpu_mem_info = unified_info.get_gpu_mem_MB()
+    if not gpu_mem_info:
+        click.echo("No GPU memory information found")
+        return
+    click.echo("GPU Gen, GPU Vendor, Mem total MB, Mem total GB:")
+    for gm in gpu_mem_info:
+        click.echo(f"{gm.gpu_gen}, {gm.vendor}, {gm.mem_total_MB}, {gm.mem_total_GB}")
+
+
 @cli.command()
 @click.option(
     "--partition",
@@ -178,6 +191,8 @@ def check_gpu(gpu_type: str, partition: str):
 
     GPU_TYPE: GPU type to check for (e.g., A100, MI300X)
     """
+    if partition is not None:
+        validate_partition_exists(partition=partition, exit_on_error=True)
     unified_info = UnifiedInfo(partition=partition)
     has_gpu = unified_info.has_gpu_type(gpu_type)
     if has_gpu:

@@ -113,6 +113,16 @@ class GPUInfo(NamedTuple):
     partition: Optional[str] = None
 
 
+class GPUMemInfo(NamedTuple):
+    """Represents GPU memory information for a device."""
+
+    mem_total_MB: int
+    mem_total_GB: int
+    vendor: str
+    gpu_gen: str
+    partition: Optional[str] = None
+
+
 class MemInfo(NamedTuple):
     """Represents memory information for a host."""
 
@@ -228,6 +238,19 @@ def get_mem_per_node_MB(self) -> list[MemInfo] | MemInfo:
             return self.slurm_cluster_info.get_mem_per_node_MB()
         return self.local_node_info.get_mem_MB()
 
+    def get_gpu_mem_MB(self) -> list[GPUMemInfo]:
+        """Get GPU memory for each GPU model available on the current node.
+
+        GPU memory is a hardware property not exposed by sinfo, so this always
+        queries the local node via nvidia-smi or rocm-smi.
+
+        Returns:
+            list[GPUMemInfo]: GPU memory information per GPU model.
+        """
+        if self.has_nvidia_gpus or self.has_amd_gpus:
+            return self.local_node_info.get_gpu_mem_MB()
+        return []
+
     def get_gpu_generation_and_count(self) -> list[GPUInfo]:
         """Get the number of GPUs on the slurm cluster node.
 
@@ -258,18 +281,16 @@ def get_total_gpus_per_node(self) -> int:
         """Get the total number of GPUs available per node.
 
         Returns:
-            int: Total number of GPUs per node. Returns 8 as default if no GPUs are detected.
+            int: Total number of GPUs per node. Returns 0 for CPU-only partitions.
         """
         gpus = self.get_gpu_generation_and_count()
         if not gpus:
-            # Default to 8 if no GPUs detected (common configuration)
-            return 8
+            return 0
 
         # Use maximum GPU count across node types in the partition.
         # This handles heterogeneous partitions where some nodes may have
         # fewer GPUs than others (e.g., due to hardware issues).
-        max_gpus = max(g.gpu_count for g in gpus)
-        return max(max_gpus, 1)  # Ensure at least 1 to avoid division by zero
+        return max(g.gpu_count for g in gpus)
 
     def get_task_resource_requirements(
         self,
@@ -304,7 +325,6 @@ def get_task_resource_requirements(
         if tasks_per_node < 1:
             raise ValueError("tasks_per_node must be at least 1")
 
-        self.partition = partition
         cpus_per_node = self.get_cpus_per_node()
         total_cpus_per_node = (
             cpus_per_node[0] if isinstance(cpus_per_node, list) else cpus_per_node
@@ -326,6 +346,12 @@ def get_task_resource_requirements(
         elif gpus_per_task is not None:
             total_gpus_per_node = self.get_total_gpus_per_node()
 
+            if total_gpus_per_node == 0:
+                raise ValueError(
+                    f"Partition '{partition}' has no GPUs. "
+                    "Use cpus_per_task for CPU-only partitions."
+                )
+
             cpu_cores_per_gpu = total_cpus_per_node.cpu_count / total_gpus_per_node
             total_required_cpu_cores_per_task = math.floor(
                 cpu_cores_per_gpu * gpus_per_task
@@ -508,17 +534,8 @@ def get_mem_MB(self, timeout: int = 60) -> MemInfo:
 
     def get_nvidia_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
         """Get NVIDIA GPU information using nvidia-smi."""
-        # Check if NVIDIA GPUs are available
         if not self.has_nvidia_gpus():
-            try:
-                # Try to run nvidia-smi command
-                result = run_cli(
-                    ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"],
-                    text=True,
-                    timeout=timeout,
-                )
-            except RuntimeError:
-                raise RuntimeError("No NVIDIA GPUs found")
+            raise RuntimeError("No NVIDIA GPUs found")
         try:
             result = run_cli(
                 ["nvidia-smi", "--query-gpu=gpu_name,count", "--format=csv,noheader"],
@@ -570,19 +587,49 @@ def get_nvidia_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
         except RuntimeError as e:
             raise RuntimeError(f"Failed to get NVIDIA GPU information: {str(e)}")
 
+    def _get_nvidia_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
+        """Get NVIDIA GPU memory using nvidia-smi."""
+        try:
+            result = run_cli(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=gpu_name,memory.total",
+                    "--format=csv,noheader,nounits",
+                ],
+                text=True,
+                timeout=timeout,
+            )
+
+            gpu_mem: Dict[str, int] = defaultdict(int)
+            for line in result.strip().split("\n"):
+                if not line or ", " not in line:
+                    continue
+                gpu_name, mem_mb_str = line.rsplit(", ", 1)
+                gpu_name_upper = gpu_name.strip().upper()
+                gpu_gen = gpu_name_upper
+                for gpu_key, gpu_pattern in NVIDIA_GPU_TYPES.items():
+                    if gpu_pattern in gpu_name_upper:
+                        gpu_gen = gpu_key
+                        break
+                mem_mb = int(mem_mb_str.strip())
+                gpu_mem[gpu_gen] = max(gpu_mem[gpu_gen], mem_mb)
+
+            return [
+                GPUMemInfo(
+                    mem_total_MB=mem_mb,
+                    mem_total_GB=mem_mb // 1024,
+                    vendor="nvidia",
+                    gpu_gen=gpu_gen,
+                )
+                for gpu_gen, mem_mb in gpu_mem.items()
+            ]
+        except RuntimeError as e:
+            raise RuntimeError(f"Failed to get NVIDIA GPU memory: {str(e)}")
+
     def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
         """Get AMD GPU information using rocm-smi."""
-        # Check if AMD GPUs are available
         if not self.has_amd_gpus():
-            try:
-                # Try to run rocm-smi command
-                result = run_cli(
-                    ["rocm-smi", "--showproductname"],
-                    text=True,
-                    timeout=timeout,
-                )
-            except RuntimeError:
-                raise RuntimeError("No AMD GPUs found")
+            raise RuntimeError("No AMD GPUs found")
         try:
             result = run_cli(
                 ["rocm-smi", "--showproductname"], text=True, timeout=timeout
@@ -602,6 +649,7 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
                         gpu_name_upper = gpu_name.upper()
 
                         # Check for known AMD GPU types
+                        gpu_gen = ""
                         found_gpu = False
                         for gpu_key, gpu_pattern in AMD_GPU_TYPES.items():
                             if gpu_pattern in gpu_name_upper:
@@ -622,7 +670,7 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
                                     found_gpu = True
                                     break
 
-                        if not found_gpu and gpu_gen is None:
+                        if not found_gpu and not gpu_gen:
                             gpu_gen = gpu_name_upper
 
                         gpu_info[gpu_gen] += 1
@@ -633,6 +681,56 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
         except RuntimeError as e:
             raise RuntimeError(f"Failed to get AMD GPU information: {str(e)}")
 
+    def _get_amd_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
+        """Get AMD GPU memory using rocm-smi.
+
+        Queries --showproductname for GPU generation and --showmeminfo for VRAM.
+        """
+        try:
+            # Identify GPU generation from product name
+            name_result = run_cli(
+                ["rocm-smi", "--showproductname"], text=True, timeout=timeout
+            )
+            detected_gen = "AMD"
+            for line in name_result.strip().split("\n"):
+                if "GPU" in line and ":" in line:
+                    parts = line.split(":")
+                    if len(parts) >= 2:
+                        gpu_name_upper = parts[-1].strip().upper()
+                        for gpu_key, gpu_pattern in AMD_GPU_TYPES.items():
+                            if gpu_pattern in gpu_name_upper:
+                                detected_gen = gpu_key
+                                break
+                    if detected_gen != "AMD":
+                        break
+
+            # Get VRAM info
+            vram_result = run_cli(
+                ["rocm-smi", "--showmeminfo", "vram", "--json"],
+                text=True,
+                timeout=timeout,
+            )
+            data = json.loads(vram_result)
+            max_mem_mb = 0
+            for card_data in data.values():
+                vram_total = card_data.get("VRAM Total Memory (B)", 0)
+                mem_mb = int(vram_total) // (1024 * 1024)
+                max_mem_mb = max(max_mem_mb, mem_mb)
+
+            if max_mem_mb == 0:
+                return []
+
+            return [
+                GPUMemInfo(
+                    mem_total_MB=max_mem_mb,
+                    mem_total_GB=max_mem_mb // 1024,
+                    vendor="amd",
+                    gpu_gen=detected_gen,
+                )
+            ]
+        except RuntimeError as e:
+            raise RuntimeError(f"Failed to get AMD GPU memory: {str(e)}")
+
     def get_gpu_generation_and_count(self, timeout: int = 60) -> list[GPUInfo]:
         """Get GPU information for all available GPUs on the local node.
 
@@ -666,6 +764,31 @@ def get_gpu_generation_and_count(self, timeout: int = 60) -> list[GPUInfo]:
 
         return gpu_info
 
+    def get_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
+        """Get GPU memory for all available GPUs on the local node.
+
+        Returns:
+            list[GPUMemInfo]: GPU memory information per GPU model.
+        """
+        gpu_mem_info = []
+
+        if self.has_nvidia_gpus():
+            try:
+                gpu_mem_info.extend(self._get_nvidia_gpu_mem_MB(timeout))
+            except RuntimeError as e:
+                logging.warning(f"Failed to get NVIDIA GPU memory: {e}")
+
+        if self.has_amd_gpus():
+            try:
+                gpu_mem_info.extend(self._get_amd_gpu_mem_MB(timeout))
+            except RuntimeError as e:
+                logging.warning(f"Failed to get AMD GPU memory: {e}")
+
+        if not gpu_mem_info:
+            logging.warning("No GPU memory information found")
+
+        return gpu_mem_info
+
     def has_gpu_type(self, gpu_type: str) -> bool:
         """Check if a specific GPU type is available on the local node.
 
@@ -803,8 +926,11 @@ def get_mem_per_node_MB(self) -> list[MemInfo]:
                         partition=partition.strip("* "),
                     )
                 )
+            if not results:
+                raise RuntimeError(
+                    f"No mem information found in: {result.stdout}"
+                )
             return results
-            raise RuntimeError(f"No mem information found in: {result.stdout}")
         except (subprocess.SubprocessError, FileNotFoundError) as e:
             logging.error(f"Failed to get Slurm memory information: {str(e)}")
             raise RuntimeError(f"Failed to get Slurm memory information: {str(e)}")

@@ -8,6 +8,7 @@
 from clusterscope.cluster_info import (
     CPUInfo,
     GPUInfo,
+    GPUMemInfo,
     LocalNodeInfo,
     MemInfo,
     UnifiedInfo,
@@ -101,6 +102,18 @@ def mem(
     return mem_info_list
 
 
+def gpu_mem() -> list[GPUMemInfo]:
+    """Get GPU memory for each GPU model on the current node.
+
+    Queries the local node via nvidia-smi (NVIDIA) or rocm-smi (AMD).
+    Returns an empty list on CPU-only nodes.
+
+    Returns:
+        list[GPUMemInfo]: GPU memory per GPU model.
+    """
+    return get_unified_info().get_gpu_mem_MB()
+
+
 def get_tmp_dir():
     tmp = get_unified_info().get_tmp_dir()
     return tmp