facebookresearch · coketaste · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026
@@ -282,3 +282,5 @@ go.work.sum
 # Meta-internal CI
 skycastle/
 scrut/
+
+*.err
@@ -33,7 +33,8 @@ Each component has its own README with detailed guides:
 
 ## Possible Expansions
 
-- Integration with more GPU types (AMD, Intel, Custom Accelerators)
+- **AMD/ROCm GPU support (implemented)**: Use `gcm rocm_monitor` for AMD GPU metrics and `health_checks check_amd_smi` for AMD GPU health checks. Requires [amd-smi](https://rocm.docs.amd.com/projects/amdsmi/) or rocm-smi on `PATH`. In prolog/epilog, the device list is taken from `SLURM_JOB_GPUS`, `CUDA_VISIBLE_DEVICES` (NVIDIA), or `ROCR_VISIBLE_DEVICES` (AMD).
+- Integration with more GPU types (Intel, Custom Accelerators)
 - Support for additional schedulers beyond Slurm
 - [Additional Slurm related Monitoring](gcm/docs/adding_new_collector.md)
 - [Support for new exporters](gcm/docs/adding_new_exporter.md)

@@ -51,7 +51,9 @@ def load_config(self) -> Dict[str, Any]:
                     f"Error reading toml file. {{{class_name}.config_path}} does not contain valid TOML. Error: {{e}}"
                 )
                 raise tomli.TOMLDecodeError(
-                    f"{{{class_name}.config_path}} does not contain valid TOML.",
+                    msg=f"{{{class_name}.config_path}} does not contain valid TOML. {{e.msg}}",
+                    doc=e.doc,
+                    pos=e.pos,
                 ) from e
         else:
             raise ValueError(

@@ -69,6 +69,7 @@ $ health_checks --features-config=$features_path --config=$config_path check-dcg
 - [check-syslogs](#check-syslogs)
 - [cuda-memtest](#cuda-memtest)
 - [check-nccl](#check-nccl)
+- [check-rccl](#check-rccl)
 - [check-hca](#check-hca)
 - [check-storage](#check-storage)
 - [check-ipmitool](#check-ipmitool)
@@ -211,6 +212,28 @@ $ health_checks check-nccl fair_cluster prolog -p all_reduce --pairwise --hostli
 $ health_checks check-nccl fair_cluster prolog -p all_reduce --pairwise-quick --hostlist=$SLURM_JOB_NODELIST --nccl-tdir /shared/home/abinesh/nccl-tests/build/ --critical-threshold 100 --sink=do_nothing
 ```
 
+# check-rccl <div id='check-rccl'/>
+Run RCCL (ROCm Communication Collectives Library) tests on AMD GPU nodes. Analogous to check-nccl for NVIDIA nodes.
+1. Run single-node RCCL tests (e.g. using a ROCm/rccl-tests build)
+2. Run pairwise RCCL tests
+
+File: `gcm/health_checks/checks/check_rccl.py`
+
+Example of execution:
+```shell
+# For a list of the available options
+$ health_checks check-rccl --help
+
+# Single node all_reduce_perf RCCL test
+$ health_checks check-rccl fair_cluster prolog -p all_reduce --rccl-tdir /opt/rccl-tests/build/ --critical-threshold 18 --sink=do_nothing
+
+# Pairwise all_reduce_perf RCCL test (hostlist required)
+$ health_checks check-rccl fair_cluster prolog -p all_reduce --pairwise --hostlist=node-[1-4] --rccl-tdir /opt/rccl-tests/build/ --critical-threshold 100 --sink=do_nothing
+
+# Quick pairwise: each node is covered once. $SLURM_JOB_NODELIST can be used when running inside a Slurm job.
+$ health_checks check-rccl fair_cluster prolog -p all_reduce --pairwise-quick --hostlist=$SLURM_JOB_NODELIST --rccl-tdir /opt/rccl-tests/build/ --critical-threshold 100 --sink=do_nothing
+```
+
 # check-hca <div id='check-hca'/>
 Check if HCAs are present and count matches the expectation.
 

@@ -1,6 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 from gcm.health_checks.checks.check_airstore import check_airstore
+from gcm.health_checks.checks.check_amd_smi import check_amd_smi
 from gcm.health_checks.checks.check_authentication import check_authentication
 from gcm.health_checks.checks.check_blockdev import check_blockdev
 from gcm.health_checks.checks.check_dcgmi import check_dcgmi
@@ -9,6 +10,7 @@
 from gcm.health_checks.checks.check_ibstat import check_ib
 from gcm.health_checks.checks.check_ipmitool import check_ipmitool
 from gcm.health_checks.checks.check_nccl import check_nccl
+from gcm.health_checks.checks.check_rccl import check_rccl
 from gcm.health_checks.checks.check_node import check_node
 from gcm.health_checks.checks.check_nvidia_smi import check_nvidia_smi
 from gcm.health_checks.checks.check_pci import check_pci
@@ -25,11 +27,13 @@
 __all__ = [
     "check_ssh_certs",
     "check_airstore",
+    "check_amd_smi",
     "check_telemetry",
     "check_dcgmi",
     "check_hca",
     "check_nvidia_smi",
     "check_nccl",
+    "check_rccl",
     "check_syslogs",
     "check_process",
     "cuda",
-Original file line number
+Diff line change
@@ Expand Up / @@ -282,3 +282,5 @@ go.work.sum @@
     # Meta-internal CI
     skycastle/
     scrut/
+    *.err