Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ Commands:
aws Check if running on AWS and show NCCL settings.
check-gpu Check if a specific GPU type exists.
cpus Show CPU counts per node.
gpu-mem Show GPU memory per GPU model on the current node.
gpus Show GPU information.
info Show basic cluster information.
job-gen Generate job requirements for different job types.
Expand All @@ -55,7 +56,7 @@ clusterscope is actively maintained by [Lucca Bertoncini](https://github.com/luc

## Contributors

[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), <Feel free to contribute and add your name>
[Lucca Bertoncini](https://github.com/luccabb), [Kalyan Saladi](https://github.com/skalyan), [Nikhil Gupta](https://github.com/gunchu), [Misko Dzamba](https://github.com/misko), [Gustavo Lima](https://github.com/gustcol), <Feel free to contribute and add your name>

### License

Expand Down
2 changes: 2 additions & 0 deletions clusterscope/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
cpus,
get_job,
get_tmp_dir,
gpu_mem,
job_gen_task_slurm,
local_node_gpu_generation_and_count,
mem,
Expand All @@ -21,6 +22,7 @@
"slurm_version",
"cpus",
"mem",
"gpu_mem",
"local_node_gpu_generation_and_count",
"get_job",
"job_gen_task_slurm",
Expand Down
15 changes: 15 additions & 0 deletions clusterscope/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,19 @@ def mem(partition: str):
click.echo(f"{mem.mem_total_MB}, {mem.mem_total_GB}, {mem.partition}")


@cli.command(name="gpu-mem")
def gpu_mem():
"""Show GPU memory per GPU model on the current node."""
unified_info = UnifiedInfo()
gpu_mem_info = unified_info.get_gpu_mem_MB()
if not gpu_mem_info:
click.echo("No GPU memory information found")
return
click.echo("GPU Gen, GPU Vendor, Mem total MB, Mem total GB:")
for gm in gpu_mem_info:
click.echo(f"{gm.gpu_gen}, {gm.vendor}, {gm.mem_total_MB}, {gm.mem_total_GB}")


@cli.command()
@click.option(
"--partition",
Expand Down Expand Up @@ -178,6 +191,8 @@ def check_gpu(gpu_type: str, partition: str):

GPU_TYPE: GPU type to check for (e.g., A100, MI300X)
"""
if partition is not None:
validate_partition_exists(partition=partition, exit_on_error=True)
unified_info = UnifiedInfo(partition=partition)
has_gpu = unified_info.has_gpu_type(gpu_type)
if has_gpu:
Expand Down
182 changes: 154 additions & 28 deletions clusterscope/cluster_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,16 @@ class GPUInfo(NamedTuple):
partition: Optional[str] = None


class GPUMemInfo(NamedTuple):
"""Represents GPU memory information for a device."""

mem_total_MB: int
mem_total_GB: int
vendor: str
gpu_gen: str
partition: Optional[str] = None


class MemInfo(NamedTuple):
"""Represents memory information for a host."""

Expand Down Expand Up @@ -228,6 +238,19 @@ def get_mem_per_node_MB(self) -> list[MemInfo] | MemInfo:
return self.slurm_cluster_info.get_mem_per_node_MB()
return self.local_node_info.get_mem_MB()

def get_gpu_mem_MB(self) -> list[GPUMemInfo]:
"""Get GPU memory for each GPU model available on the current node.

GPU memory is a hardware property not exposed by sinfo, so this always
queries the local node via nvidia-smi or rocm-smi.

Returns:
list[GPUMemInfo]: GPU memory information per GPU model.
"""
if self.has_nvidia_gpus or self.has_amd_gpus:
return self.local_node_info.get_gpu_mem_MB()
return []

def get_gpu_generation_and_count(self) -> list[GPUInfo]:
"""Get the number of GPUs on the slurm cluster node.

Expand Down Expand Up @@ -258,18 +281,16 @@ def get_total_gpus_per_node(self) -> int:
"""Get the total number of GPUs available per node.

Returns:
int: Total number of GPUs per node. Returns 8 as default if no GPUs are detected.
int: Total number of GPUs per node. Returns 0 for CPU-only partitions.
"""
gpus = self.get_gpu_generation_and_count()
if not gpus:
# Default to 8 if no GPUs detected (common configuration)
return 8
return 0

# Use maximum GPU count across node types in the partition.
# This handles heterogeneous partitions where some nodes may have
# fewer GPUs than others (e.g., due to hardware issues).
max_gpus = max(g.gpu_count for g in gpus)
return max(max_gpus, 1) # Ensure at least 1 to avoid division by zero
return max(g.gpu_count for g in gpus)

def get_task_resource_requirements(
self,
Expand Down Expand Up @@ -304,7 +325,6 @@ def get_task_resource_requirements(
if tasks_per_node < 1:
raise ValueError("tasks_per_node must be at least 1")

self.partition = partition
cpus_per_node = self.get_cpus_per_node()
total_cpus_per_node = (
cpus_per_node[0] if isinstance(cpus_per_node, list) else cpus_per_node
Expand All @@ -326,6 +346,12 @@ def get_task_resource_requirements(
elif gpus_per_task is not None:
total_gpus_per_node = self.get_total_gpus_per_node()

if total_gpus_per_node == 0:
raise ValueError(
f"Partition '{partition}' has no GPUs. "
"Use cpus_per_task for CPU-only partitions."
)

cpu_cores_per_gpu = total_cpus_per_node.cpu_count / total_gpus_per_node
total_required_cpu_cores_per_task = math.floor(
cpu_cores_per_gpu * gpus_per_task
Expand Down Expand Up @@ -508,17 +534,8 @@ def get_mem_MB(self, timeout: int = 60) -> MemInfo:

def get_nvidia_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
"""Get NVIDIA GPU information using nvidia-smi."""
# Check if NVIDIA GPUs are available
if not self.has_nvidia_gpus():
try:
# Try to run nvidia-smi command
result = run_cli(
["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"],
text=True,
timeout=timeout,
)
except RuntimeError:
raise RuntimeError("No NVIDIA GPUs found")
raise RuntimeError("No NVIDIA GPUs found")
try:
result = run_cli(
["nvidia-smi", "--query-gpu=gpu_name,count", "--format=csv,noheader"],
Expand Down Expand Up @@ -570,19 +587,49 @@ def get_nvidia_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
except RuntimeError as e:
raise RuntimeError(f"Failed to get NVIDIA GPU information: {str(e)}")

def _get_nvidia_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
"""Get NVIDIA GPU memory using nvidia-smi."""
try:
result = run_cli(
[
"nvidia-smi",
"--query-gpu=gpu_name,memory.total",
"--format=csv,noheader,nounits",
],
text=True,
timeout=timeout,
)

gpu_mem: Dict[str, int] = defaultdict(int)
for line in result.strip().split("\n"):
if not line or ", " not in line:
continue
gpu_name, mem_mb_str = line.rsplit(", ", 1)
gpu_name_upper = gpu_name.strip().upper()
gpu_gen = gpu_name_upper
for gpu_key, gpu_pattern in NVIDIA_GPU_TYPES.items():
if gpu_pattern in gpu_name_upper:
gpu_gen = gpu_key
break
mem_mb = int(mem_mb_str.strip())
gpu_mem[gpu_gen] = max(gpu_mem[gpu_gen], mem_mb)

return [
GPUMemInfo(
mem_total_MB=mem_mb,
mem_total_GB=mem_mb // 1024,
vendor="nvidia",
gpu_gen=gpu_gen,
)
for gpu_gen, mem_mb in gpu_mem.items()
]
except RuntimeError as e:
raise RuntimeError(f"Failed to get NVIDIA GPU memory: {str(e)}")

def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
"""Get AMD GPU information using rocm-smi."""
# Check if AMD GPUs are available
if not self.has_amd_gpus():
try:
# Try to run rocm-smi command
result = run_cli(
["rocm-smi", "--showproductname"],
text=True,
timeout=timeout,
)
except RuntimeError:
raise RuntimeError("No AMD GPUs found")
raise RuntimeError("No AMD GPUs found")
try:
result = run_cli(
["rocm-smi", "--showproductname"], text=True, timeout=timeout
Expand All @@ -602,6 +649,7 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
gpu_name_upper = gpu_name.upper()

# Check for known AMD GPU types
gpu_gen = ""
found_gpu = False
for gpu_key, gpu_pattern in AMD_GPU_TYPES.items():
if gpu_pattern in gpu_name_upper:
Expand All @@ -622,7 +670,7 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
found_gpu = True
break

if not found_gpu and gpu_gen is None:
if not found_gpu and not gpu_gen:
gpu_gen = gpu_name_upper

gpu_info[gpu_gen] += 1
Expand All @@ -633,6 +681,56 @@ def get_amd_gpu_info(self, timeout: int = 60) -> list[GPUInfo]:
except RuntimeError as e:
raise RuntimeError(f"Failed to get AMD GPU information: {str(e)}")

def _get_amd_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
"""Get AMD GPU memory using rocm-smi.

Queries --showproductname for GPU generation and --showmeminfo for VRAM.
"""
try:
# Identify GPU generation from product name
name_result = run_cli(
["rocm-smi", "--showproductname"], text=True, timeout=timeout
)
detected_gen = "AMD"
for line in name_result.strip().split("\n"):
if "GPU" in line and ":" in line:
parts = line.split(":")
if len(parts) >= 2:
gpu_name_upper = parts[-1].strip().upper()
for gpu_key, gpu_pattern in AMD_GPU_TYPES.items():
if gpu_pattern in gpu_name_upper:
detected_gen = gpu_key
break
if detected_gen != "AMD":
break

# Get VRAM info
vram_result = run_cli(
["rocm-smi", "--showmeminfo", "vram", "--json"],
text=True,
timeout=timeout,
)
data = json.loads(vram_result)
max_mem_mb = 0
for card_data in data.values():
vram_total = card_data.get("VRAM Total Memory (B)", 0)
mem_mb = int(vram_total) // (1024 * 1024)
max_mem_mb = max(max_mem_mb, mem_mb)

if max_mem_mb == 0:
return []

return [
GPUMemInfo(
mem_total_MB=max_mem_mb,
mem_total_GB=max_mem_mb // 1024,
vendor="amd",
gpu_gen=detected_gen,
)
]
except RuntimeError as e:
raise RuntimeError(f"Failed to get AMD GPU memory: {str(e)}")

def get_gpu_generation_and_count(self, timeout: int = 60) -> list[GPUInfo]:
"""Get GPU information for all available GPUs on the local node.

Expand Down Expand Up @@ -666,6 +764,31 @@ def get_gpu_generation_and_count(self, timeout: int = 60) -> list[GPUInfo]:

return gpu_info

def get_gpu_mem_MB(self, timeout: int = 60) -> list[GPUMemInfo]:
"""Get GPU memory for all available GPUs on the local node.

Returns:
list[GPUMemInfo]: GPU memory information per GPU model.
"""
gpu_mem_info = []

if self.has_nvidia_gpus():
try:
gpu_mem_info.extend(self._get_nvidia_gpu_mem_MB(timeout))
except RuntimeError as e:
logging.warning(f"Failed to get NVIDIA GPU memory: {e}")

if self.has_amd_gpus():
try:
gpu_mem_info.extend(self._get_amd_gpu_mem_MB(timeout))
except RuntimeError as e:
logging.warning(f"Failed to get AMD GPU memory: {e}")

if not gpu_mem_info:
logging.warning("No GPU memory information found")

return gpu_mem_info

def has_gpu_type(self, gpu_type: str) -> bool:
"""Check if a specific GPU type is available on the local node.

Expand Down Expand Up @@ -803,8 +926,11 @@ def get_mem_per_node_MB(self) -> list[MemInfo]:
partition=partition.strip("* "),
)
)
if not results:
raise RuntimeError(
f"No mem information found in: {result.stdout}"
)
return results
raise RuntimeError(f"No mem information found in: {result.stdout}")
except (subprocess.SubprocessError, FileNotFoundError) as e:
logging.error(f"Failed to get Slurm memory information: {str(e)}")
raise RuntimeError(f"Failed to get Slurm memory information: {str(e)}")
Expand Down
13 changes: 13 additions & 0 deletions clusterscope/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from clusterscope.cluster_info import (
CPUInfo,
GPUInfo,
GPUMemInfo,
LocalNodeInfo,
MemInfo,
UnifiedInfo,
Expand Down Expand Up @@ -101,6 +102,18 @@ def mem(
return mem_info_list


def gpu_mem() -> list[GPUMemInfo]:
"""Get GPU memory for each GPU model on the current node.

Queries the local node via nvidia-smi (NVIDIA) or rocm-smi (AMD).
Returns an empty list on CPU-only nodes.

Returns:
list[GPUMemInfo]: GPU memory per GPU model.
"""
return get_unified_info().get_gpu_mem_MB()


def get_tmp_dir():
tmp = get_unified_info().get_tmp_dir()
return tmp
Expand Down
Loading