From c1e6acaf32a31b1543aa8e6b6179708ddb310e28 Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Fri, 6 Feb 2026 11:38:57 -0800 Subject: [PATCH 01/23] Enabling ci build for py-torch 2.9, 2.10 on rocm --- .../builtin/packages/hwloc/package.py | 2 +- .../PR152569-Update-spack-includes-2.5.patch | 5 +-- .../PR152569-Update-spack-includes-2.7.patch | 14 ++++++-- .../builtin/packages/py_torch/package.py | 36 ++++++++++++++----- stacks/ml-linux-x86_64-rocm/spack.yaml | 27 +++++++------- 5 files changed, 56 insertions(+), 28 deletions(-) diff --git a/repos/spack_repo/builtin/packages/hwloc/package.py b/repos/spack_repo/builtin/packages/hwloc/package.py index 65ad3db3056..9ab963dfdff 100644 --- a/repos/spack_repo/builtin/packages/hwloc/package.py +++ b/repos/spack_repo/builtin/packages/hwloc/package.py @@ -135,7 +135,7 @@ class Hwloc(AutotoolsPackage, CudaPackage, ROCmPackage): depends_on("mpi", when="+netloc") with when("+rocm"): - depends_on("rocm-smi-lib") + depends_on("rocm-smi-lib@7.0:") depends_on("rocm-opencl", when="+opencl") # Avoid a circular dependency since the openmp # variant of llvm-amdgpu depends on hwloc. diff --git a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch index 2e7a80bcbe8..2c35aafac2f 100644 --- a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch +++ b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch @@ -25,10 +25,10 @@ index 9be7f37..39d0f24 100644 endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1c0d3a2..e0de4b1 100644 +index 1c0d3a2..83f9f9d 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake -@@ -167,6 +167,10 @@ if(HIP_FOUND) +@@ -167,6 +167,11 @@ if(HIP_FOUND) find_package_and_print_version(hipsolver REQUIRED) find_package_and_print_version(hiprtc REQUIRED) @@ -36,6 +36,7 @@ index 1c0d3a2..e0de4b1 100644 + list(APPEND ROCM_INCLUDE ${rocprim_INCLUDE_DIR}) + list(APPEND ROCM_INCLUDE ${hipcub_INCLUDE_DIR}) + list(APPEND ROCM_INCLUDE ${rocRAND_INCLUDE_DIR}) ++ list(APPEND ROCM_INCLUDE $ENV{AOTRITON_INSTALLED_PREFIX}/include) find_library(PYTORCH_HIP_LIBRARIES amdhip64 HINTS ${ROCM_PATH}/lib) # TODO: miopen_LIBRARIES should return fullpath to the library file, diff --git a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch index 4392e00d76a..173aabc12aa 100644 --- a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch +++ b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch @@ -1,5 +1,5 @@ diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index d2d23b7..620a89f 100644 +index d2d23b7ab65..620a89f65cb 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1379,13 +1379,6 @@ if(USE_ROCM) @@ -26,7 +26,7 @@ index d2d23b7..620a89f 100644 endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 58c74dd..d3e1ad4 100644 +index 58c74ddda35..54f96871372 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -26,12 +26,6 @@ else() @@ -78,7 +78,15 @@ index 58c74dd..d3e1ad4 100644 find_package_and_print_version(amd_comgr REQUIRED) find_package_and_print_version(rocrand REQUIRED) find_package_and_print_version(hiprand REQUIRED) -@@ -171,7 +168,11 @@ if(HIP_FOUND) +@@ -157,6 +154,7 @@ if(HIP_FOUND) + find_package_and_print_version(hipcub REQUIRED) + find_package_and_print_version(rocthrust REQUIRED) + find_package_and_print_version(hipsolver REQUIRED) ++ list(APPEND ROCM_INCLUDE_DIRS $ENV{AOTRITON_INSTALLED_PREFIX}/include) + # workaround cmake 4 build issue + if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0") + message(WARNING "Work around hiprtc cmake failure for cmake >= 4") +@@ -171,7 +169,11 @@ if(HIP_FOUND) if(UNIX) find_package_and_print_version(rccl) find_package_and_print_version(hsa-runtime64 REQUIRED) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index 844a73f0e9e..c2f224b1b59 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -123,6 +123,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): conflicts("+gloo+rocm") conflicts("+rocm", when="@2.3", msg="Rocm doesn't support py-torch 2.3 release") conflicts("+rocm", when="@2.4", msg="Rocm doesn't support py-torch 2.4 release") + conflicts("+rocm", when="@2.8", msg="Rocm doesn't support py-torch 2.8 release") conflicts("+tensorpipe", when="+rocm ^hip@:5.1", msg="TensorPipe not supported until ROCm 5.2") conflicts("+breakpad", when="target=ppc64:") conflicts("+breakpad", when="target=ppc64le:") @@ -305,7 +306,8 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("valgrind", when="+valgrind") with when("+rocm"): depends_on("hsa-rocr-dev") - depends_on("hip") + depends_on("hip@7.0:", when="@2.9:") + depends_on("hip@:6.4", when="@:2.7") depends_on("rccl", when="+nccl") depends_on("rocprim") depends_on("hipcub") @@ -320,11 +322,20 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocfft") depends_on("rocblas") depends_on("miopen-hip") + for target in ROCmPackage.amdgpu_targets: + depends_on(f"composable-kernel amdgpu_target={target}", when=f"amdgpu_target={target}") + # This constraint applies to ANY hipblaslt in the dependency tree + # including the one used by miopen-hip + depends_on(f"hipblaslt amdgpu_target={target}", when=f"amdgpu_target={target}") + # Ensure hipblaslt version for 2.9+ + depends_on( + f"hipblaslt@7.0: amdgpu_target={target}", when=f"@2.9: amdgpu_target={target}" + ) depends_on("rocminfo") - depends_on("aotriton@0.8.1b", when="@2.5:2.6") - depends_on("aotriton@0.9.1b", when="@2.7:") - depends_on("composable-kernel@:6.3.2", when="@2.5") - depends_on("composable-kernel@6.3.2:", when="@2.6:") + depends_on("hipsparselt@7.0:", when="@2.9:") + depends_on("aotriton@0.8b", when="@2.5:2.6") + depends_on("aotriton@0.9.2b", when="@2.7") + depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") depends_on("ucx", when="+ucc") @@ -568,6 +579,14 @@ def patch(self): "torch_global_deps PROPERTIES LINKER_LANGUAGE CXX", "caffe2/CMakeLists.txt", ) + if self.spec.satisfies("@2.5:+rocm"): + filter_file( + "find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)", + "find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)\n" + "set(ROCTRACER_INCLUDE_DIR $ENV{ROCTRACER_INCLUDE_DIR})", + "cmake/public/LoadHIP.cmake", + string=True, + ) if self.spec.satisfies("@2.1:2.7+rocm"): filter_file( "${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h", @@ -757,9 +776,10 @@ def enable_or_disable(variant, keyword="USE", var=None): env.set("BLAS", "FLAME") env.set("WITH_BLAS", "FLAME") elif self.spec["blas"].name == "intel-oneapi-mkl": - env.set("BLAS", "MKL") - env.set("WITH_BLAS", "mkl") - env.set("INTEL_MKL_DIR", self.spec["mkl"].prefix.mkl.latest) + if "+mkldnn" in self.spec: + env.set("BLAS", "MKL") + env.set("WITH_BLAS", "mkl") + env.set("INTEL_MKL_DIR", self.spec["mkl"].prefix.mkl.latest) elif self.spec["blas"].name == "openblas": env.set("BLAS", "OpenBLAS") env.set("WITH_BLAS", "open") diff --git a/stacks/ml-linux-x86_64-rocm/spack.yaml b/stacks/ml-linux-x86_64-rocm/spack.yaml index b0533168015..7d387f53431 100644 --- a/stacks/ml-linux-x86_64-rocm/spack.yaml +++ b/stacks/ml-linux-x86_64-rocm/spack.yaml @@ -48,23 +48,22 @@ spack: # - py-keras backend=torch # PyTorch - # Does not yet support Spack-installed ROCm - # - py-botorch - # - py-gpytorch - # - py-kornia - # - py-lightning - # - py-pytorch-lightning - # - py-segmentation-models-pytorch - # - py-timm - # - py-torch - # - py-torch-geometric + - py-botorch + - py-gpytorch + - py-kornia + - py-lightning + - py-pytorch-lightning + - py-segmentation-models-pytorch + - py-timm + - py-torch + - py-torch-geometric # - py-torch-nvidia-apex # - py-torchaudio - # - py-torchdata - # - py-torchgeo - # - py-torchmetrics + - py-torchdata + - py-torchgeo + - py-torchmetrics # - py-torchvision - # - py-vector-quantize-pytorch + - py-vector-quantize-pytorch # scikit-learn - py-scikit-learn From 22b06cb507640b203c8cc886ca394d05a31ad8b9 Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Mon, 9 Feb 2026 01:04:42 -0800 Subject: [PATCH 02/23] Removing mkldnn check and manual variant --- .../builtin/packages/py_torch/package.py | 20 +++++++------------ stacks/ml-linux-x86_64-rocm/spack.yaml | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index c2f224b1b59..d360242270a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -322,15 +322,10 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocfft") depends_on("rocblas") depends_on("miopen-hip") - for target in ROCmPackage.amdgpu_targets: - depends_on(f"composable-kernel amdgpu_target={target}", when=f"amdgpu_target={target}") - # This constraint applies to ANY hipblaslt in the dependency tree - # including the one used by miopen-hip - depends_on(f"hipblaslt amdgpu_target={target}", when=f"amdgpu_target={target}") - # Ensure hipblaslt version for 2.9+ - depends_on( - f"hipblaslt@7.0: amdgpu_target={target}", when=f"@2.9: amdgpu_target={target}" - ) + depends_on("composable-kernel") + depends_on("hipblaslt") + # Ensure hipblaslt version for 2.9+ + depends_on("hipblaslt@7.0:", when="@2.9:") depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") @@ -776,10 +771,9 @@ def enable_or_disable(variant, keyword="USE", var=None): env.set("BLAS", "FLAME") env.set("WITH_BLAS", "FLAME") elif self.spec["blas"].name == "intel-oneapi-mkl": - if "+mkldnn" in self.spec: - env.set("BLAS", "MKL") - env.set("WITH_BLAS", "mkl") - env.set("INTEL_MKL_DIR", self.spec["mkl"].prefix.mkl.latest) + env.set("BLAS", "MKL") + env.set("WITH_BLAS", "mkl") + env.set("INTEL_MKL_DIR", self.spec["mkl"].prefix.mkl.latest) elif self.spec["blas"].name == "openblas": env.set("BLAS", "OpenBLAS") env.set("WITH_BLAS", "open") diff --git a/stacks/ml-linux-x86_64-rocm/spack.yaml b/stacks/ml-linux-x86_64-rocm/spack.yaml index be94ec835c6..700c3bfa66a 100644 --- a/stacks/ml-linux-x86_64-rocm/spack.yaml +++ b/stacks/ml-linux-x86_64-rocm/spack.yaml @@ -43,7 +43,7 @@ spack: # Keras - py-keras backend=tensorflow # - py-keras backend=jax - # - py-keras backend=torch + - py-keras backend=torch # PyTorch - py-botorch From c397270c45ba1d45398ee87abb605a54cd01d20b Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Tue, 10 Feb 2026 22:54:17 -0800 Subject: [PATCH 03/23] version check correction for aotriton --- repos/spack_repo/builtin/packages/aotriton/package.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/repos/spack_repo/builtin/packages/aotriton/package.py b/repos/spack_repo/builtin/packages/aotriton/package.py index 673c678a94b..c23aaf7c94a 100644 --- a/repos/spack_repo/builtin/packages/aotriton/package.py +++ b/repos/spack_repo/builtin/packages/aotriton/package.py @@ -84,7 +84,7 @@ def patch(self): string=True, ) - if self.spec.satisfies("@:0.9"): + if self.spec.satisfies("@:0.9b"): filter_file( r"LLVM_INCLUDE_DIRS", f"{self.spec['aotriton-llvm'].prefix}/include", @@ -103,7 +103,7 @@ def patch(self): "third_party/triton/python/setup.py", string=True, ) - if self.spec.satisfies("@0.10:"): + if self.spec.satisfies("@0.10b:"): filter_file( r"LLVM_INCLUDE_DIRS", f"{self.spec['aotriton-llvm'].prefix}/include", From 58629f9140f04ba4c23f6ff16a5448cf28c37db5 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Wed, 11 Feb 2026 22:47:28 -0500 Subject: [PATCH 04/23] Increase timout for ck --- .ci/gitlab/configs/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/gitlab/configs/ci.yaml b/.ci/gitlab/configs/ci.yaml index 9dc89ba702b..6e6e4c3c3b5 100644 --- a/.ci/gitlab/configs/ci.yaml +++ b/.ci/gitlab/configs/ci.yaml @@ -23,7 +23,7 @@ ci: script:: - - if [ -n "$SPACK_EXTRA_MIRROR" ]; then spack mirror add local "${SPACK_EXTRA_MIRROR}/${SPACK_CI_STACK_NAME}"; fi - spack config blame mirrors - - - spack --color=always --backtrace ci rebuild -j ${SPACK_BUILD_JOBS} --tests --timeout 300 > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) + - - spack --color=always --backtrace ci rebuild -j ${SPACK_BUILD_JOBS} --tests --timeout 1200 > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) after_script: - - cat /proc/loadavg || true - cat /proc/meminfo | grep 'MemTotal\|MemFree' || true From a5e4a0ab6e21797be9566a18058078318235c4c2 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Thu, 12 Feb 2026 15:02:46 -0500 Subject: [PATCH 05/23] 24h for long ROCm/ML rebuilds; GitLab project/runner max must allow this --- .ci/gitlab/configs/linux/ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.ci/gitlab/configs/linux/ci.yaml b/.ci/gitlab/configs/linux/ci.yaml index 92a1cd14d9c..7edfea15803 100644 --- a/.ci/gitlab/configs/linux/ci.yaml +++ b/.ci/gitlab/configs/linux/ci.yaml @@ -13,6 +13,8 @@ ci: - wrf build-job: tags: [ "spack", "huge" ] + # 24h for long ROCm/ML rebuilds; GitLab project/runner max must allow this + timeout: 1440 minutes variables: CI_JOB_SIZE: huge SPACK_BUILD_JOBS: "12" From 12b27100c1ef65dc1d4df987fd140d6e6b03cffc Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Thu, 12 Feb 2026 15:07:00 -0500 Subject: [PATCH 06/23] Reverting commit 58629f9140f04ba4c23f6ff16a5448cf28c37db5 timout for ck --- .ci/gitlab/configs/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/gitlab/configs/ci.yaml b/.ci/gitlab/configs/ci.yaml index 6e6e4c3c3b5..9dc89ba702b 100644 --- a/.ci/gitlab/configs/ci.yaml +++ b/.ci/gitlab/configs/ci.yaml @@ -23,7 +23,7 @@ ci: script:: - - if [ -n "$SPACK_EXTRA_MIRROR" ]; then spack mirror add local "${SPACK_EXTRA_MIRROR}/${SPACK_CI_STACK_NAME}"; fi - spack config blame mirrors - - - spack --color=always --backtrace ci rebuild -j ${SPACK_BUILD_JOBS} --tests --timeout 1200 > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) + - - spack --color=always --backtrace ci rebuild -j ${SPACK_BUILD_JOBS} --tests --timeout 300 > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) after_script: - - cat /proc/loadavg || true - cat /proc/meminfo | grep 'MemTotal\|MemFree' || true From 441d218eef484184150d77a0991bad683187c148 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Tue, 17 Feb 2026 12:42:58 -0500 Subject: [PATCH 07/23] py-torchvision requires rocm math lib paths indirectly when py-torch is built with rocm --- .../builtin/packages/py_torchvision/package.py | 12 ++++++++++++ stacks/ml-linux-x86_64-rocm/spack.yaml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 4afaa9895c7..073a4580c34 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -197,6 +197,18 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: query = self.spec[dep.name] include.extend(query.headers.directories) library.extend(query.libs.directories) + # PyTorch headers include rocthrust, rocprim, hipsparse, hipblas, hipblas-common, + # hipblaslt and hipsolver headers; when building with ROCm we need these headers + # in the include path (py-torch depends on these headers, but it is not a direct + # link dep of torchvision). + if "^py-torch+rocm" in self.spec: + include.extend(self.spec["rocthrust"].headers.directories) + include.extend(self.spec["rocprim"].headers.directories) + include.extend(self.spec["hipsparse"].headers.directories) + include.extend(self.spec["hipblas"].headers.directories) + include.extend(self.spec["hipblas-common"].headers.directories) + include.extend(self.spec["hipblaslt"].headers.directories) + include.extend(self.spec["hipsolver"].headers.directories) # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper diff --git a/stacks/ml-linux-x86_64-rocm/spack.yaml b/stacks/ml-linux-x86_64-rocm/spack.yaml index 700c3bfa66a..de5cbe445e6 100644 --- a/stacks/ml-linux-x86_64-rocm/spack.yaml +++ b/stacks/ml-linux-x86_64-rocm/spack.yaml @@ -60,7 +60,7 @@ spack: - py-torchdata - py-torchgeo - py-torchmetrics - # - py-torchvision + - py-torchvision - py-vector-quantize-pytorch # scikit-learn From 02e36f6cc75f4b09b8e20f00fed6ab24b5017fac Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Wed, 18 Feb 2026 12:10:23 -0500 Subject: [PATCH 08/23] Only add paths for packages that are in the spec to avoid KeyError --- .../packages/py_torchvision/package.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 073a4580c34..f253fdde728 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -197,18 +197,24 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: query = self.spec[dep.name] include.extend(query.headers.directories) library.extend(query.libs.directories) + # PyTorch headers include rocthrust, rocprim, hipsparse, hipblas, hipblas-common, - # hipblaslt and hipsolver headers; when building with ROCm we need these headers - # in the include path (py-torch depends on these headers, but it is not a direct - # link dep of torchvision). + # hipblaslt and hipsolver headers; when building with ROCm we need these in the + # include path (py-torch depends on them, but they are not direct link deps of + # torchvision). Only add paths for packages that are in the spec to avoid KeyError. if "^py-torch+rocm" in self.spec: - include.extend(self.spec["rocthrust"].headers.directories) - include.extend(self.spec["rocprim"].headers.directories) - include.extend(self.spec["hipsparse"].headers.directories) - include.extend(self.spec["hipblas"].headers.directories) - include.extend(self.spec["hipblas-common"].headers.directories) - include.extend(self.spec["hipblaslt"].headers.directories) - include.extend(self.spec["hipsolver"].headers.directories) + rocm_include_pkgs = [ + "rocthrust", + "rocprim", + "hipsparse", + "hipblas", + "hipblas-common", + "hipblaslt", + "hipsolver", + ] + for pkg in rocm_include_pkgs: + if pkg in self.spec: + include.extend(self.spec[pkg].headers.directories) # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper From bc1a5f081a8c4d887a042c9de9f45b4f9a814907 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Thu, 19 Feb 2026 02:17:09 -0500 Subject: [PATCH 09/23] libtorch_hip.so needs aotriton and hip libs at runtime --- .../spack_repo/builtin/packages/py_torchvision/package.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index f253fdde728..533b6a6cbcd 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -216,6 +216,14 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: if pkg in self.spec: include.extend(self.spec[pkg].headers.directories) + # At build time, torchvision's setup imports torch; libtorch_hip.so then + # needs aotriton and hip libs at runtime. Add their lib dirs so the loader + # can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused). + for pkg in ["aotriton", "hip"]: + if pkg in self.spec: + for lib_dir in self.spec[pkg].libs.directories: + env.prepend_path("LD_LIBRARY_PATH", lib_dir) + # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper # and the actual compiler, so this is needed to get parts of the build working. From 16ac8a8e3c77eb53c0db9506c02a79b06555f1a4 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Fri, 20 Feb 2026 13:20:06 -0500 Subject: [PATCH 10/23] Correcting the library path with prefix --- repos/spack_repo/builtin/packages/py_torchvision/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 533b6a6cbcd..9370f87e0a0 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -221,7 +221,7 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: # can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused). for pkg in ["aotriton", "hip"]: if pkg in self.spec: - for lib_dir in self.spec[pkg].libs.directories: + for lib_dir in self.spec[pkg].prefix.lib: env.prepend_path("LD_LIBRARY_PATH", lib_dir) # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but From ee6e80e90e2cecd379abd9470283368d6e8d1463 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Sat, 21 Feb 2026 11:18:35 -0500 Subject: [PATCH 11/23] Add prefix lib dirs when they exist so the loader can find .so files --- .../packages/py_torchvision/package.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 9370f87e0a0..011ab95c08a 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: (Apache-2.0 OR MIT) +import os from spack_repo.builtin.build_systems.python import PythonPackage @@ -220,11 +221,21 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: # needs aotriton and hip libs at runtime. Add their lib dirs so the loader # can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused). for pkg in ["aotriton", "hip"]: - if pkg in self.spec: - for lib_dir in self.spec[pkg].prefix.lib: + if pkg not in self.spec: + continue + try: + for lib_dir in self.spec[pkg].libs.directories: env.prepend_path("LD_LIBRARY_PATH", lib_dir) - - # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but + except NoLibrariesError: + # Package may not declare 'libraries' (e.g. aotriton), so Spack + # cannot recursively locate libs. Add prefix lib dirs when they + # exist so the loader can find .so files (lib, lib64, or both). + for sub in ("lib", "lib64"): + lib_dir = os.path.join(self.spec[pkg].prefix, sub) + if os.path.isdir(lib_dir): + env.prepend_path("LD_LIBRARY_PATH", lib_dir) + + # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper # and the actual compiler, so this is needed to get parts of the build working. # See https://github.com/pytorch/vision/issues/2591 From a3fb110f9daa581e4764e102819f4514d80c1084 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Sat, 21 Feb 2026 16:42:22 -0500 Subject: [PATCH 12/23] style error fix --- repos/spack_repo/builtin/packages/py_torchvision/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 011ab95c08a..8fe26cf7343 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -235,7 +235,7 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: if os.path.isdir(lib_dir): env.prepend_path("LD_LIBRARY_PATH", lib_dir) - # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but + # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper # and the actual compiler, so this is needed to get parts of the build working. # See https://github.com/pytorch/vision/issues/2591 From 6087d68827de791a11de53d94821889e0c00b971 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Mon, 23 Feb 2026 11:35:30 -0500 Subject: [PATCH 13/23] import NoLibrariesError --- repos/spack_repo/builtin/packages/py_torchvision/package.py | 1 + 1 file changed, 1 insertion(+) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 8fe26cf7343..dff76f276c9 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -6,6 +6,7 @@ from spack_repo.builtin.build_systems.python import PythonPackage +from spack.error import NoLibrariesError from spack.package import * From be4d3c3ba33d4ebebe2b65f7758a5d2e26f09b93 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Tue, 24 Feb 2026 12:23:43 -0500 Subject: [PATCH 14/23] Changing dependency to rebuild aotriton --- repos/spack_repo/builtin/packages/py_torch/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index d360242270a..7211109c95a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -329,7 +329,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") - depends_on("aotriton@0.9.2b", when="@2.7") + depends_on("aotriton@0.9.1b", when="@2.7") depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") From a71c9aa61ab219b1723490c24bb53a34c2352c77 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Tue, 24 Feb 2026 12:39:56 -0500 Subject: [PATCH 15/23] reverting the recent change --- repos/spack_repo/builtin/packages/py_torch/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index 7211109c95a..d360242270a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -329,7 +329,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") - depends_on("aotriton@0.9.1b", when="@2.7") + depends_on("aotriton@0.9.2b", when="@2.7") depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") From 92ea3ac5ecfdcfbfdfc56a439d1d2dcdea2d56be Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Tue, 24 Feb 2026 12:54:36 -0500 Subject: [PATCH 16/23] Changing dependency to rebuild aotriton --- repos/spack_repo/builtin/packages/py_torch/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index d360242270a..7211109c95a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -329,7 +329,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") - depends_on("aotriton@0.9.2b", when="@2.7") + depends_on("aotriton@0.9.1b", when="@2.7") depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") From 10aa531d51367ccf2ee6ed7ba7164037a21b2a6e Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Tue, 24 Feb 2026 22:11:04 -0500 Subject: [PATCH 17/23] Revert "Changing dependency to rebuild aotriton" This reverts commit 92ea3ac5ecfdcfbfdfc56a439d1d2dcdea2d56be. --- repos/spack_repo/builtin/packages/py_torch/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index 7211109c95a..d360242270a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -329,7 +329,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") - depends_on("aotriton@0.9.1b", when="@2.7") + depends_on("aotriton@0.9.2b", when="@2.7") depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") From d6e94b3910a70eea5816e3138beaf19c191e5a61 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Tue, 24 Feb 2026 22:11:52 -0500 Subject: [PATCH 18/23] Revert "reverting the recent change" This reverts commit a71c9aa61ab219b1723490c24bb53a34c2352c77. --- repos/spack_repo/builtin/packages/py_torch/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index d360242270a..7211109c95a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -329,7 +329,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") - depends_on("aotriton@0.9.2b", when="@2.7") + depends_on("aotriton@0.9.1b", when="@2.7") depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") From c038ae43d36f9a3234521299908c552158ff265a Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Tue, 24 Feb 2026 22:12:27 -0500 Subject: [PATCH 19/23] Revert "Changing dependency to rebuild aotriton" This reverts commit be4d3c3ba33d4ebebe2b65f7758a5d2e26f09b93. --- repos/spack_repo/builtin/packages/py_torch/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index 7211109c95a..d360242270a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -329,7 +329,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") - depends_on("aotriton@0.9.1b", when="@2.7") + depends_on("aotriton@0.9.2b", when="@2.7") depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") From 9d5dac8000a0c761a19d4031b8ed1b034c35d10c Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Wed, 25 Feb 2026 21:52:13 -0800 Subject: [PATCH 20/23] Changing dependency again to rebuild aotriton --- repos/spack_repo/builtin/packages/py_torch/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index d360242270a..7211109c95a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -329,7 +329,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") - depends_on("aotriton@0.9.2b", when="@2.7") + depends_on("aotriton@0.9.1b", when="@2.7") depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") From 8520f9718dca22a3ee843ebebdf68f30440bcc64 Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Thu, 26 Feb 2026 09:46:00 -0800 Subject: [PATCH 21/23] Revert "Changing dependency again to rebuild aotriton" This reverts commit 9d5dac8000a0c761a19d4031b8ed1b034c35d10c. --- repos/spack_repo/builtin/packages/py_torch/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index 7211109c95a..d360242270a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -329,7 +329,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") - depends_on("aotriton@0.9.1b", when="@2.7") + depends_on("aotriton@0.9.2b", when="@2.7") depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") From 8ed5223a94fc8a2ec2ae82f034e64743dae73db5 Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Thu, 26 Feb 2026 10:04:53 -0800 Subject: [PATCH 22/23] Limiting ci build to py-torch as remaining are not yet ready --- .../builtin/packages/hwloc/package.py | 2 +- .../packages/py_torchvision/package.py | 38 ------------------- stacks/ml-linux-x86_64-rocm/spack.yaml | 27 ++++++------- 3 files changed, 15 insertions(+), 52 deletions(-) diff --git a/repos/spack_repo/builtin/packages/hwloc/package.py b/repos/spack_repo/builtin/packages/hwloc/package.py index 9ab963dfdff..65ad3db3056 100644 --- a/repos/spack_repo/builtin/packages/hwloc/package.py +++ b/repos/spack_repo/builtin/packages/hwloc/package.py @@ -135,7 +135,7 @@ class Hwloc(AutotoolsPackage, CudaPackage, ROCmPackage): depends_on("mpi", when="+netloc") with when("+rocm"): - depends_on("rocm-smi-lib@7.0:") + depends_on("rocm-smi-lib") depends_on("rocm-opencl", when="+opencl") # Avoid a circular dependency since the openmp # variant of llvm-amdgpu depends on hwloc. diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index dff76f276c9..4afaa9895c7 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -2,11 +2,9 @@ # # SPDX-License-Identifier: (Apache-2.0 OR MIT) -import os from spack_repo.builtin.build_systems.python import PythonPackage -from spack.error import NoLibrariesError from spack.package import * @@ -200,42 +198,6 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: include.extend(query.headers.directories) library.extend(query.libs.directories) - # PyTorch headers include rocthrust, rocprim, hipsparse, hipblas, hipblas-common, - # hipblaslt and hipsolver headers; when building with ROCm we need these in the - # include path (py-torch depends on them, but they are not direct link deps of - # torchvision). Only add paths for packages that are in the spec to avoid KeyError. - if "^py-torch+rocm" in self.spec: - rocm_include_pkgs = [ - "rocthrust", - "rocprim", - "hipsparse", - "hipblas", - "hipblas-common", - "hipblaslt", - "hipsolver", - ] - for pkg in rocm_include_pkgs: - if pkg in self.spec: - include.extend(self.spec[pkg].headers.directories) - - # At build time, torchvision's setup imports torch; libtorch_hip.so then - # needs aotriton and hip libs at runtime. Add their lib dirs so the loader - # can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused). - for pkg in ["aotriton", "hip"]: - if pkg not in self.spec: - continue - try: - for lib_dir in self.spec[pkg].libs.directories: - env.prepend_path("LD_LIBRARY_PATH", lib_dir) - except NoLibrariesError: - # Package may not declare 'libraries' (e.g. aotriton), so Spack - # cannot recursively locate libs. Add prefix lib dirs when they - # exist so the loader can find .so files (lib, lib64, or both). - for sub in ("lib", "lib64"): - lib_dir = os.path.join(self.spec[pkg].prefix, sub) - if os.path.isdir(lib_dir): - env.prepend_path("LD_LIBRARY_PATH", lib_dir) - # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper # and the actual compiler, so this is needed to get parts of the build working. diff --git a/stacks/ml-linux-x86_64-rocm/spack.yaml b/stacks/ml-linux-x86_64-rocm/spack.yaml index 0a705ef6204..00e617e42da 100644 --- a/stacks/ml-linux-x86_64-rocm/spack.yaml +++ b/stacks/ml-linux-x86_64-rocm/spack.yaml @@ -46,22 +46,23 @@ spack: - py-keras backend=torch # PyTorch - - py-botorch - - py-gpytorch - - py-kornia - - py-lightning - - py-pytorch-lightning - - py-segmentation-models-pytorch - - py-timm + # Does not yet support Spack-installed ROCm + # - py-botorch + # - py-gpytorch + # - py-kornia + # - py-lightning + # - py-pytorch-lightning + # - py-segmentation-models-pytorch + # - py-timm - py-torch - - py-torch-geometric + # - py-torch-geometric # - py-torch-nvidia-apex # - py-torchaudio - - py-torchdata - - py-torchgeo - - py-torchmetrics - - py-torchvision - - py-vector-quantize-pytorch + # - py-torchdata + # - py-torchgeo + # - py-torchmetrics + # - py-torchvision + # - py-vector-quantize-pytorch # scikit-learn - py-scikit-learn From 324f5875e58d839705c4ce3b8e90b7498bdc62b9 Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Thu, 26 Feb 2026 14:35:06 -0800 Subject: [PATCH 23/23] py-llvmlite@0.46.0 needs llvm@20 --- repos/spack_repo/builtin/packages/hwloc/package.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/hwloc/package.py b/repos/spack_repo/builtin/packages/hwloc/package.py index 65ad3db3056..38d04f3c3e1 100644 --- a/repos/spack_repo/builtin/packages/hwloc/package.py +++ b/repos/spack_repo/builtin/packages/hwloc/package.py @@ -135,7 +135,8 @@ class Hwloc(AutotoolsPackage, CudaPackage, ROCmPackage): depends_on("mpi", when="+netloc") with when("+rocm"): - depends_on("rocm-smi-lib") + depends_on("rocm-smi-lib@:6.4", when="@:2.11.1") + depends_on("rocm-smi-lib@7.0:", when="@2.12.2:") depends_on("rocm-opencl", when="+opencl") # Avoid a circular dependency since the openmp # variant of llvm-amdgpu depends on hwloc.