diff --git a/README.md b/README.md
index fe1890d..fe8b475 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,16 @@ Default: `true`
 
 Type: `bool`
 
+### hpc_install_nvidia_imex
+
+Whether to install NVIDIA IMEX (`nvidia-imex`) and enable `nvidia-imex.service`.
+
+Note: This role installs and enables the `nvidia-imex` service but does not start it immediately. The install and enable tasks run only on hosts whose `system_vendor` fact is `Microsoft Corporation`. The service is configured to launch at boot only on compatible multi-node NVLink switch-fabric systems, such as NVIDIA GB200 or GB300 (NVL72) racks.
+
+Default: `true`
+
+Type: `bool`
+
 ### hpc_install_rdma
 
 Whether to install the NVIDIA RDMA package.
diff --git a/defaults/main.yml b/defaults/main.yml
index 89f2d8a..07e9108 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -20,6 +20,7 @@ hpc_install_cuda_driver: true
 hpc_install_cuda_toolkit: true
 hpc_install_hpc_nvidia_nccl: true
 hpc_install_nvidia_fabric_manager: true
+hpc_install_nvidia_imex: true
 hpc_install_rdma: true
 hpc_enable_azure_persistent_rdma_naming: true
 hpc_install_system_openmpi: true
diff --git a/tasks/main.yml b/tasks/main.yml
index fc30694..47c5ad9 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -527,6 +527,25 @@
         name: nvidia-fabricmanager
         enabled: true
 
+# Runs only when hpc_install_nvidia_imex is set AND the system vendor fact is
+# "Microsoft Corporation". The unit is enabled for boot but not started here.
+- name: Install/enable NVIDIA IMEX (NVLink multi-node runtime)
+  when:
+    - hpc_install_nvidia_imex
+    - ansible_facts["system_vendor"] == "Microsoft Corporation"
+  block:
+    - name: Install NVIDIA IMEX
+      package:
+        name: "{{ __hpc_nvidia_imex_package }}"
+        state: present
+        use: "{{ (__hpc_server_is_ostree | d(false)) |
+          ternary('ansible.posix.rhel_rpm_ostree', omit) }}"
+
+    - name: Enable NVIDIA IMEX service
+      systemd:
+        name: nvidia-imex.service
+        enabled: true
+
 - name: Install RDMA packages
   when: hpc_install_rdma
   block:
diff --git a/vars/main.yml b/vars/main.yml
index 5743108..860b529 100644
--- a/vars/main.yml
+++ b/vars/main.yml
@@ -31,6 +31,7 @@ __hpc_cuda_driver_packages:
   - cuda-drivers
 __hpc_nvidia_fabric_manager_packages:
   - nvidia-fabric-manager
+__hpc_nvidia_imex_package: nvidia-imex
 __hpc_nvidia_container_toolkit_packages:
   - nvidia-container-toolkit
 __hpc_rdma_packages: