FluidNumerics · fluidnumerics-joe · Feb 3, 2026 · Nov 14, 2025 · Nov 14, 2025 · Nov 14, 2025
diff --git a/.github/workflows/build-rocm.yml b/.github/workflows/build-rocm.yml
@@ -0,0 +1,93 @@
+# Builds the ROCm base image once per GPU architecture in the matrix and,
+# for every event other than a pull request, pushes the result to Docker Hub.
+name: Build ROCm Docker images
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - envs/x86/rocm/**
+      - .github/workflows/build-rocm.yml
+  pull_request:
+    paths:
+      - envs/x86/rocm/**
+      - .github/workflows/build-rocm.yml
+  workflow_dispatch:
+
+env:
+  REGISTRY: docker.io
+  IMAGE_NAME: higherordermethods/selfish
+
+jobs:
+  build:
+    name: Build ${{ matrix.gpu_arch }} image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    strategy:
+      # Let the remaining architectures finish even if one of them fails.
+      fail-fast: false
+      matrix:
+        gpu_arch: [gfx906, gfx90a, gfx942]
+        gpu_backend_version: ["6.4.3"]
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      # Pull requests only build; they never log in or push.
+      - name: Log in to Docker Hub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Set up Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Generate image metadata
+        id: meta
+        # Matrix values reach the script as environment variables instead of
+        # being expanded into the script text before the shell parses it.
+        env:
+          GPU_ARCH: ${{ matrix.gpu_arch }}
+          GPU_BACKEND_VERSION: ${{ matrix.gpu_backend_version }}
+        run: |
+          # Convert GPU_BACKEND_VERSION 6.4.3 -> rocm643
+          VERSION_NO_DOTS=$(echo "${GPU_BACKEND_VERSION}" | tr -d '.')
+          GPU_BACKEND="rocm${VERSION_NO_DOTS}"
+
+          # Build tag components
+          CPU_PLATFORM="x86"
+
+          # Generate tags following: <version>-<cpu_platform>-<gpu_backend>-<gpu_arch>
+          # IMAGE_NAME comes from the workflow-level env; GITHUB_SHA is set by the runner.
+          {
+            echo "tags<<EOF"
+            echo "${IMAGE_NAME}:latest-${CPU_PLATFORM}-${GPU_BACKEND}-${GPU_ARCH}"
+            echo "${IMAGE_NAME}:${GITHUB_SHA}-${CPU_PLATFORM}-${GPU_BACKEND}-${GPU_ARCH}"
+            echo "EOF"
+            echo "gpu_backend=${GPU_BACKEND}"
+          } >> "$GITHUB_OUTPUT"
+
+      # The key includes the ROCm version as well as the GPU architecture so
+      # that builds for different ROCm releases never share a layer cache.
+      - name: Cache Docker layers
+        uses: actions/cache@v4
+        with:
+          path: /tmp/.buildx-cache
+          key: ${{ runner.os }}-buildx-${{ matrix.gpu_arch }}-rocm${{ matrix.gpu_backend_version }}-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-buildx-${{ matrix.gpu_arch }}-rocm${{ matrix.gpu_backend_version }}-
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: envs/x86/rocm/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          build-args: |
+            GPU_ARCH=${{ matrix.gpu_arch }}
+            GPU_BACKEND_VERSION=${{ matrix.gpu_backend_version }}
+          cache-from: type=local,src=/tmp/.buildx-cache
+          cache-to: type=local,dest=/tmp/.buildx-cache-new,mode=max
+          labels: |
+            com.fluidnumerics.rocm.target=${{ matrix.gpu_arch }}
+            com.fluidnumerics.rocm.version=${{ matrix.gpu_backend_version }}
+            org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
+            org.opencontainers.image.revision=${{ github.sha }}
+
+      # Replace the restored cache directory with the one this build wrote,
+      # so the cached path holds only the output of the latest build.
+      - name: Move cache
+        run: |
+          rm -rf /tmp/.buildx-cache
+          mv /tmp/.buildx-cache-new /tmp/.buildx-cache
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,22 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+Source files live under `envs/<cpu>/<gpu>/`, where each leaf directory owns a `spack.yaml` manifest and (optionally) a generated `Dockerfile`. Keep CPU targets (`x86`, …) and accelerator targets (`gfx90a`, `sm72`, `none`) granular so images stay purpose-built, and limit the root `README.md` to high-level context.
+
+## Build, Test, and Development Commands
+- `spack spec -e envs/x86/rocm/spack.yaml` — concretizes the manifest locally; run this before opening a PR so dependency drift is caught early.
+- `spack containerize envs/x86/rocm/spack.yaml > envs/x86/rocm/Dockerfile` — regenerates the Dockerfile after manifest edits (avoid hand-tuning output).
+- `docker build -f envs/x86/rocm/Dockerfile --build-arg GPU_ARCH=gfx90a -t selfish:gfx90a .` — builds the shareable runtime image for the chosen GPU architecture; tag images `<cpu>-<gpu>` for clarity.
+- `docker run --rm selfish:gfx90a spack find hdf5` — smoke-tests that the expected view was installed inside the image.
+
+## Coding Style & Naming Conventions
+Spack YAML uses 2-space indentation, lowercase keys, and quoted constraint strings (`"target=x86_64_v3"`). Group `specs` alphabetically, keep `packages` overrides sorted by scope, and rely on multiline `RUN` blocks with trailing `\` alignment plus brief comments for non-obvious workarounds. Name new environments after the hardware tuple (`x86/gfx942`, `x86/none`) so downstream scripts can glob predictably.
+
+## Testing Guidelines
+For each environment change, run `spack spec` followed by `spack install --fail-fast` inside a disposable builder container to verify concretization. Container builds must pass `docker build` locally before review; capture the last ~20 lines for the PR description. When adding MPI/HDF5 variants, run `docker run --rm <tag> mpichversion` (or another representative binary) to prove runtime availability. There is no coverage gate, but every new spec should ship with at least one build log, and GitHub Actions now double-checks the ROCm builds (`gfx906`, `gfx90a`, `gfx942`) and publishes them to `higherordermethods/selfish`.
+
+## Commit & Pull Request Guidelines
+Existing history uses short, imperative subject lines (“Initial commit”); follow the same format and include the touched environment in parentheses when practical, e.g., `Add feq-parse 2.2.2 to gfx90a`. One logical change per commit keeps bisects clean. PRs should describe the motivation, list updated directories, attach the relevant `spack spec` or `docker build` excerpt, and link any upstream SELF issues. Paste terminal snippets when reviewing GPU-specific behavior.
+
+## Security & Configuration Tips
+Pin base images (`rockylinux:9`) and Spack refs in manifests, and run `dnf update -y` at build time to pick up CVEs. Never embed registry credentials or cluster hostnames in `spack.yaml`; rely on build-time secrets where required. Before publishing, scan the resulting image with `docker scout cves selfish:gfx90a` (or equivalent) to catch dependency vulnerabilities.
diff --git a/README.md b/README.md
@@ -7,10 +7,89 @@ While SELF does support bare-metal builds and those are regularly tested, the co
 
 The core SELF team at Fluid Numerics has adopted enroot+pyxis with Slurm for our deployment model due to positive experience with this approach.
 
+See [Repository Guidelines](CLAUDE.md) for contributor expectations, build commands, and review checklists.
+
 
 More docs coming soon
 
 
 ## Organization
 
-The `envs/` subdirectory defines all of the base environments that are aimed at providing base images with all the dependencies required for developing SELF. The subdirectory structure is as `envs/{cpu_platform}/{gpu_platform}`. When `{gpu_platform}=none`, that environment is an environment for working with non-gpu accelerated implementations of SELF.
+The `envs/` subdirectory defines all of the base environments that are aimed at providing base images with all the dependencies required for developing SELF. The subdirectory structure follows `envs/{cpu_platform}/{gpu_backend}`. When `{gpu_backend}=none`, that environment is for working with non-GPU-accelerated implementations of SELF.
+
+## Container Images
+
+SELFish provides pre-built container images with all dependencies for GPU-accelerated spectral element computations. Images are tagged using a **version-architecture** naming scheme to support multiple GPU targets.
+
+### Image Tagging Scheme
+
+Images follow the pattern: `higherordermethods/selfish:<version>-<cpu_platform>-<gpu_backend>-<gpu_arch>`
+
+- **`<version>`**: Semantic version (e.g., `v1.2.3`) or release channel (`latest`, `dev`)
+- **`<cpu_platform>`** : Target cpu architecture (e.g. `x86`, `arm` )
+- **`<gpu_backend>`** : GPU backend provider with version (e.g. `rocm643`, `cuda112`)
+- **`<gpu_arch>`**: Target GPU architecture (e.g., `gfx90a`, `gfx906`, `gfx942`)
+
+#### Examples:
+```bash
+# Stable release for MI210/MI250 (gfx90a)
+docker pull higherordermethods/selfish:v1.2.3-x86-rocm643-gfx90a
+
+# Latest stable for Radeon Instinct MI100 (gfx908)
+docker pull higherordermethods/selfish:latest-x86-rocm643-gfx908
+
+# Development build for MI300A (gfx942)
+docker pull higherordermethods/selfish:dev-x86-rocm643-gfx942
+```
+
+### Supported GPU Architectures
+
+| Architecture | GPU Models | Tag Suffix |
+|--------------|------------|------------|
+| gfx90a | MI210, MI250, MI250X | `-gfx90a` |
+| gfx908 | MI100 | `-gfx908` |
+| gfx906 | MI50, MI60, Radeon VII | `-gfx906` |
+| gfx942 | MI300A, MI300X | `-gfx942` |
+| sm_72 | V100 | `-sm72` |
+
+### Determining Your GPU Architecture
+
+If you're unsure which image to use, check your GPU architecture.
+
+For AMD GPUs,
+```bash
+# Using rocminfo
+rocminfo | grep "Name:" | grep "gfx"
+
+# Using rocm-smi
+rocm-smi --showproductname
+```
+
+### Using with Slurm
+
+Specify the architecture-specific image in your job script:
+```bash
+#!/bin/bash
+#SBATCH --gpus=1
+#SBATCH --container-image=higherordermethods/selfish:v1.2.3-x86-rocm643-gfx90a
+
+./run_simulation.sh
+```
+
+### Version Pinning Recommendations
+
+- **Production**: Pin to specific versions (e.g., `v1.2.3-x86-rocm643-gfx90a`) for reproducibility
+- **Development**: Use `latest-<cpu_platform>-<gpu_backend>-<gpu_arch>` for convenience (auto-updates with new releases)
+- **Testing CI**: Use `dev-<cpu_platform>-<gpu_backend>-<gpu_arch>` to test against bleeding-edge builds
+
+### Image Metadata
+
+All images include OCI labels for programmatic inspection:
+```bash
+docker inspect higherordermethods/selfish:v1.2.3-x86-rocm643-gfx90a | grep -A5 Labels
+```
+
+Key labels:
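+- `com.fluidnumerics.rocm.target`: GPU architecture target
+- `com.fluidnumerics.rocm.version`: ROCm version the image was built against
+- `com.fluidnumerics.selfish.version`: SELFish version
+- `org.opencontainers.image.version`: Container image version
+- `org.opencontainers.image.source`: Source repository URL
+- `org.opencontainers.image.revision`: Git commit the image was built from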
diff --git a/envs/x86/gfx90a/Dockerfile → envs/x86/rocm/Dockerfile b/envs/x86/gfx90a/Dockerfile → envs/x86/rocm/Dockerfile
@@ -1,5 +1,8 @@
 FROM docker.io/rockylinux:9 AS bootstrap
 
+ARG GPU_ARCH=gfx90a
+ARG GPU_BACKEND_VERSION=6.4.3
+
 ENV SPACK_ROOT=/opt/spack \
     CURRENTLY_BUILDING_DOCKER_IMAGE=1 \
     container=docker
@@ -9,6 +12,7 @@ RUN dnf update -y \
  && dnf update -y \
  && dnf --enablerepo epel install -y \
         bzip2 \
+        cmake \
         curl-minimal \
         file \
         findutils \
@@ -33,8 +37,18 @@ RUN dnf update -y \
  && rm -rf /var/cache/dnf \
  && dnf clean all
 
+# Install the ROCm/HIP toolchain from the repository described by rocm.repo.
+COPY ./envs/x86/rocm/rocm.repo /etc/yum.repos.d/rocm.repo
+# Replace the @GPU_BACKEND_VERSION@ placeholder in the repo file with the requested ROCm version.
+RUN sed -i "s/@GPU_BACKEND_VERSION@/${GPU_BACKEND_VERSION}/g" /etc/yum.repos.d/rocm.repo
+
+# Clear dnf's cached metadata, update installed packages, then install the ROCm packages.
+RUN dnf clean all && \
+    dnf update -y && \
+    dnf install -y rocm-hip-sdk rocm-llvm rocm-smi-lib rocminfo
+
+# Fail the build here if the HIP version header is missing from /opt/rocm-<version>.
+RUN ls -l /opt/rocm-${GPU_BACKEND_VERSION}/include/hip/hip_version.h
+
+# Shallow-fetch a single Spack ref into $SPACK_ROOT and check out the fetched commit detached.
+# NOTE(review): `develop` is presumably a moving branch, so rebuilds may pick up a different Spack commit than the previously pinned tag — confirm this is intended.
 RUN mkdir $SPACK_ROOT && cd $SPACK_ROOT && \
-    git init --quiet && git remote add origin https://github.com/spack/spack.git && git fetch --depth=1 origin v1.0.2 && git checkout --detach FETCH_HEAD && \
+    git init --quiet && git remote add origin https://github.com/spack/spack.git && git fetch --depth=1 origin develop && git checkout --detach FETCH_HEAD && \
     mkdir -p $SPACK_ROOT/opt/spack
 
 RUN ln -s $SPACK_ROOT/share/spack/docker/entrypoint.bash \
@@ -79,23 +93,56 @@ set -o noclobber \
 &&  (echo spack: \
 &&   echo '  specs:' \
 &&   echo '  - feq-parse@2.2.2' \
-&&   echo '  - mpich@4.2.3 +rocm' \
+&&   echo '  - openmpi@5.0.8 +rocm' \
 &&   echo '  - hdf5@1.14.5 +fortran +mpi' \
 &&   echo '  packages:' \
 &&   echo '    all:' \
 &&   echo '      require:' \
 &&   echo '      - target=x86_64_v3' \
 &&   echo '      prefer:' \
-&&   echo '      - amdgpu_target=gfx942' \
+&&   echo "      - amdgpu_target=$GPU_ARCH" \
+&&   echo '    cmake:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo '        - spec: "cmake@3.26.5"' \
+&&   echo '          prefix: "/usr"' \
+&&   echo '    rocm-smi-lib:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo "        - spec: \"rocm-smi-lib@${GPU_BACKEND_VERSION}\"" \
+&&   echo "          prefix: \"/opt/rocm-${GPU_BACKEND_VERSION}\"" \
+&&   echo '    hip:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo "        - spec: \"hip@${GPU_BACKEND_VERSION}\"" \
+&&   echo "          prefix: \"/opt/rocm-${GPU_BACKEND_VERSION}\"" \
+&&   echo '    hsa-rocr-dev:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo "        - spec: \"hsa-rocr-dev@${GPU_BACKEND_VERSION}\"" \
+&&   echo "          prefix: \"/opt/rocm-${GPU_BACKEND_VERSION}\"" \
+&&   echo '    llvm-amdgpu:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo "        - spec: \"llvm-amdgpu@${GPU_BACKEND_VERSION}\"" \
+&&   echo "          prefix: \"/opt/rocm-${GPU_BACKEND_VERSION}\"" \
 &&   echo '' \
 &&   echo '  concretizer:' \
 &&   echo '    unify: true' \
 &&   echo '  config:' \
-&&   echo '    install_tree: /opt/software' \
+&&   echo '    install_tree:' \
+&&   echo '      root: /opt/software' \
 &&   echo '  view: /opt/views/view') > /opt/spack-environment/spack.yaml
 
+# Apply feq-parse patch to add "c" build dependency
+COPY ./envs/x86/rocm/feq-parse.patch /tmp/feq-parse.patch
+# Take the last column of `spack repo list` as the package repo path, strip the
+# `repos/spack_repo/builtin` component from it, and apply the patch from that
+# directory with -p1.
+# NOTE(review): the ${VAR/pattern} substitution is a bash-style expansion and the
+# awk output is assumed to be a single path — confirm the RUN shell is bash and
+# that only one repo is configured.
+RUN SPACK_PKGS_ROOT=$(spack repo list | awk '{print $NF}') &&\
+    SPACK_BUILTIN_PKGS_ROOT=${SPACK_PKGS_ROOT/repos\/spack_repo\/builtin} &&\
+    patch -p1 -d $SPACK_BUILTIN_PKGS_ROOT < /tmp/feq-parse.patch
+
 # Install the software, remove unnecessary deps
+# `spack repo list` runs before the install so the configured repos appear in the build log.
-RUN cd /opt/spack-environment && spack env activate . && spack install --fail-fast && spack gc -y
+RUN cd /opt/spack-environment && spack env activate . && spack repo list && spack install --fail-fast && spack gc -y
 
 # Strip all the binaries
 RUN find -L /opt/views/view/* -type f -exec readlink -f '{}' \; | \
@@ -115,6 +162,14 @@ FROM docker.io/rockylinux:9
 COPY --from=builder /opt/spack-environment /opt/spack-environment
 COPY --from=builder /opt/software /opt/software
 
+# Install the ROCm HIP runtime packages in the final image.
+COPY ./envs/x86/rocm/rocm.repo /etc/yum.repos.d/rocm.repo
+# Declared in this stage so ${GPU_BACKEND_VERSION} is defined for the RUN below.
+ARG GPU_BACKEND_VERSION=6.4.3
+# Fill in the repo file's version placeholder, install the runtime, and remove
+# the dnf caches in the same layer so they are not shipped in the image.
+RUN sed -i "s/@GPU_BACKEND_VERSION@/${GPU_BACKEND_VERSION}/g" /etc/yum.repos.d/rocm.repo && \
+    dnf clean all && \
+    dnf update -y && \
+    dnf install -y rocm-hip-libraries rocm-hip-runtime && \
+    rm -rf /var/cache/dnf && \
+    dnf clean all
+
 # paths.view is a symlink, so copy the parent to avoid dereferencing and duplicating it
 COPY --from=builder /opt/views /opt/views
 

diff --git a/envs/x86/rocm/feq-parse.patch b/envs/x86/rocm/feq-parse.patch
@@ -0,0 +1,12 @@
+diff --git a/repos/spack_repo/builtin/packages/feq_parse/package.py b/repos/spack_repo/builtin/packages/feq_parse/package.py
+index e4b960b7..bc0916b9 100644
+--- a/repos/spack_repo/builtin/packages/feq_parse/package.py
++++ b/repos/spack_repo/builtin/packages/feq_parse/package.py
+@@ -29,6 +29,7 @@ class FeqParse(CMakePackage):
+     version("1.0.2", sha256="1cd1db7562908ea16fc65dc5268b654405d0b3d9dcfe11f409949c431b48a3e8")
+
+     depends_on("fortran", type="build")  # generated
++    depends_on("c", type="build")  # generated
+
+     depends_on("cmake@3.0.2:", type="build")
+
diff --git a/envs/x86/rocm/rocm.repo b/envs/x86/rocm/rocm.repo
@@ -0,0 +1,7 @@
+# dnf repository definition for ROCm packages.
+# @GPU_BACKEND_VERSION@ is a placeholder; the Dockerfile replaces it with the
+# requested ROCm version via sed after copying this file into /etc/yum.repos.d.
+[rocm]
+name=ROCm @GPU_BACKEND_VERSION@ repository
+baseurl=https://repo.radeon.com/rocm/el9/@GPU_BACKEND_VERSION@/main
+enabled=1
+priority=50
+gpgcheck=1
+gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
diff --git a/envs/x86/gfx90a/spack.yaml → envs/x86/rocm/spack.yaml b/envs/x86/gfx90a/spack.yaml → envs/x86/rocm/spack.yaml