diff --git a/.github/workflows/build-rocm.yml b/.github/workflows/build-rocm.yml
new file mode 100644
index 0000000..25ddde8
--- /dev/null
+++ b/.github/workflows/build-rocm.yml
@@ -0,0 +1,93 @@
+name: Build ROCm Docker images
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - envs/x86/rocm/**
+      - .github/workflows/build-rocm.yml
+  pull_request:
+    paths:
+      - envs/x86/rocm/**
+      - .github/workflows/build-rocm.yml
+  workflow_dispatch:
+
+env:
+  REGISTRY: docker.io  # NOTE(review): not referenced by any step below
+  IMAGE_NAME: higherordermethods/selfish
+
+jobs:
+  build:
+    name: Build ${{ matrix.gpu_arch }} image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    strategy:
+      fail-fast: false  # one failing architecture must not cancel the others
+      matrix:
+        gpu_arch: [gfx906, gfx90a, gfx942]
+        gpu_backend_version: ["6.4.3"]
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Log in to Docker Hub
+        if: github.event_name != 'pull_request'  # PR runs build only, never push
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Set up Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Generate image metadata
+        id: meta
+        run: |
+          # Convert GPU_BACKEND_VERSION 6.4.3 -> rocm643
+          VERSION_NO_DOTS=$(echo "${{ matrix.gpu_backend_version }}" | tr -d '.')
+          GPU_BACKEND="rocm${VERSION_NO_DOTS}"
+
+          # Build tag components
+          CPU_PLATFORM="x86"
+          GPU_ARCH="${{ matrix.gpu_arch }}"
+
+          # Multiline output "tags", one {version}-{cpu_platform}-{gpu_backend}-{gpu_arch} tag per line
+          echo "tags<<EOF" >> "$GITHUB_OUTPUT"
+          echo "${{ env.IMAGE_NAME }}:latest-${CPU_PLATFORM}-${GPU_BACKEND}-${GPU_ARCH}" >> "$GITHUB_OUTPUT"
+          echo "${{ env.IMAGE_NAME }}:${{ github.sha }}-${CPU_PLATFORM}-${GPU_BACKEND}-${GPU_ARCH}" >> "$GITHUB_OUTPUT"
+          echo "EOF" >> "$GITHUB_OUTPUT"
+
+          echo "gpu_backend=${GPU_BACKEND}" >> "$GITHUB_OUTPUT"
+
+      - name: Cache Docker layers
+        uses: actions/cache@v4
+        with:
+          path: /tmp/.buildx-cache
+          key: ${{ runner.os }}-buildx-${{ matrix.gpu_arch }}-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-buildx-${{ matrix.gpu_arch }}-
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: envs/x86/rocm/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          build-args: |
+            GPU_ARCH=${{ matrix.gpu_arch }}
+            GPU_BACKEND_VERSION=${{ matrix.gpu_backend_version }}
+          cache-from: type=local,src=/tmp/.buildx-cache
+          cache-to: type=local,dest=/tmp/.buildx-cache-new,mode=max
+          labels: |
+            com.fluidnumerics.rocm.target=${{ matrix.gpu_arch }}
+            com.fluidnumerics.rocm.version=${{ matrix.gpu_backend_version }}
+            org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
+            org.opencontainers.image.revision=${{ github.sha }}
+
+      - name: Move cache
+        run: |
+          rm -rf /tmp/.buildx-cache
+          mv /tmp/.buildx-cache-new /tmp/.buildx-cache
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..73620c0
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,22 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+Source files live under `envs/{cpu}/{accelerator}/`, where each leaf directory owns a `spack.yaml` manifest and (optionally) a generated `Dockerfile`. Keep CPU targets (`x86`, …) and accelerator targets (`gfx90a`, `sm72`, `none`) granular so images stay purpose-built, and limit the root `README.md` to high-level context.
+
+## Build, Test, and Development Commands
+- `spack spec -e envs/x86/rocm/spack.yaml` — concretizes the manifest locally; run this before opening a PR so dependency drift is caught early.
+- `spack containerize envs/x86/rocm/spack.yaml > envs/x86/rocm/Dockerfile` — regenerates the Dockerfile after manifest edits (avoid hand-tuning output).
+- `docker build -f envs/x86/rocm/Dockerfile -t selfish:gfx90a .` — builds the shareable runtime image; tag images `{cpu}-{accelerator}` for clarity.
+- `docker run --rm selfish:gfx90a spack find hdf5` — smoke-tests that the expected view was installed inside the image.
+
+## Coding Style & Naming Conventions
+Spack YAML uses 2-space indentation, lowercase keys, and quoted constraint strings (`"target=x86_64_v3"`). Group `specs` alphabetically, keep `packages` overrides sorted by scope, and rely on multiline `RUN` blocks with trailing `\` alignment plus brief comments for non-obvious workarounds. Name new environments after the hardware tuple (`x86/gfx942`, `x86/none`) so downstream scripts can glob predictably.
+
+## Testing Guidelines
+For each environment change, run `spack spec` followed by `spack install --fail-fast` inside a disposable builder container to verify concretization. Container builds must pass `docker build` locally before review; capture the last ~20 lines for the PR description. When adding MPI/HDF5 variants, run `docker run --rm {image} mpichversion` (or another representative binary) to prove runtime availability. There is no coverage gate, but every new spec should ship with at least one build log, and GitHub Actions now double-checks the ROCm builds (gfx906, gfx90a, gfx942) and publishes them to `higherordermethods/selfish`.
+
+## Commit & Pull Request Guidelines
+Existing history uses short, imperative subject lines (“Initial commit”); follow the same format and include the touched environment in parentheses when practical, e.g., `Add feq-parse 2.2.2 to gfx90a`. One logical change per commit keeps bisects clean. PRs should describe the motivation, list updated directories, attach the relevant `spack spec` or `docker build` excerpt, and link any upstream SELF issues. Paste terminal snippets when reviewing GPU-specific behavior.
+
+## Security & Configuration Tips
+Pin base images (`rockylinux:9`) and Spack refs in manifests, and run `dnf update -y` at build time to pick up CVEs. Never embed registry credentials or cluster hostnames in `spack.yaml`; rely on build-time secrets where required. Before publishing, scan the resulting image with `docker scout cves selfish:gfx90a` (or equivalent) to catch dependency vulnerabilities.
diff --git a/README.md b/README.md
index d4c97f7..7aa295b 100644
--- a/README.md
+++ b/README.md
@@ -7,10 +7,89 @@ While SELF does support bare-metal builds and those are regularly tested, the co
 
 The core SELF team at Fluid Numerics has adopted enroot+pyxis with Slurm for our deployment model due to positive experience with this approach.
 
+See [Repository Guidelines](CLAUDE.md) for contributor expectations, build commands, and review checklists.
+
 More docs coming soon
 
 ## Organization
 
-The `envs/` subdirectory defines all of the base environments that are aimed at providing base images with all the dependencies required for developing SELF. The subdirectory structure is as `envs/{cpu_platform}/{gpu_platform}`. When `{gpu_platform}=none`, that environment is an environment for working with non-gpu accelerated implementations of SELF.
+The `envs/` subdirectory defines all of the base environments that are aimed at providing base images with all the dependencies required for developing SELF. The subdirectory structure is `envs/{cpu_platform}/{gpu_backend}`. When `{gpu_backend}=none`, the environment is for working with non-GPU-accelerated implementations of SELF.
+
+## Container Images
+
+SELFish provides pre-built container images with all dependencies for GPU-accelerated spectral element computations. Images are tagged using a **version-architecture** naming scheme to support multiple GPU targets.
+
+### Image Tagging Scheme
+
+Images follow the pattern: `higherordermethods/selfish:{version}-{cpu_platform}-{gpu_backend}-{gpu_arch}`
+
+- **`{version}`**: Semantic version (e.g., `v1.2.3`) or release channel (`latest`, `dev`)
+- **`{cpu_platform}`**: Target CPU architecture (e.g., `x86`, `arm`)
+- **`{gpu_backend}`**: GPU backend provider with version (e.g., `rocm643`, `cuda112`)
+- **`{gpu_arch}`**: Target GPU architecture (e.g., `gfx90a`, `gfx906`, `gfx942`)
+
+#### Examples:
+```bash
+# Stable release for MI210/MI250 (gfx90a)
+docker pull higherordermethods/selfish:v1.2.3-x86-rocm643-gfx90a
+
+# Latest stable for Radeon Instinct MI100 (gfx908)
+docker pull higherordermethods/selfish:latest-x86-rocm643-gfx908
+
+# Development build for MI300A (gfx942)
+docker pull higherordermethods/selfish:dev-x86-rocm643-gfx942
+```
+
+### Supported GPU Architectures
+
+| Architecture | GPU Models | Tag Suffix |
+|--------------|------------|------------|
+| gfx90a | MI210, MI250, MI250X | `-gfx90a` |
+| gfx908 | MI100 | `-gfx908` |
+| gfx906 | MI50, MI60, Radeon VII | `-gfx906` |
+| gfx942 | MI300A, MI300X | `-gfx942` |
+| sm_72 | V100 | `-sm72` |
+
+### Determining Your GPU Architecture
+
+If you're unsure which image to use, check your GPU architecture.
+
+For AMD GPUs:
+```bash
+# Using rocminfo
+rocminfo | grep "Name:" | grep "gfx"
+
+# Using rocm-smi
+rocm-smi --showproductname
+```
+
+### Using with Slurm
+
+Specify the architecture-specific image in your job script:
+```bash
+#!/bin/bash
+#SBATCH --gpus=1
+#SBATCH --container-image=higherordermethods/selfish:v1.2.3-x86-rocm643-gfx90a
+
+./run_simulation.sh
+```
+
+### Version Pinning Recommendations
+
+- **Production**: Pin to specific versions (e.g., `v1.2.3-x86-rocm643-gfx90a`) for reproducibility
+- **Development**: Use `latest-{cpu_platform}-{gpu_backend}-{gpu_arch}` for convenience (auto-updates with new releases)
+- **Testing CI**: Use `dev-{cpu_platform}-{gpu_backend}-{gpu_arch}` to test against bleeding-edge builds
+
+### Image Metadata
+
+All images include OCI labels for programmatic inspection:
+```bash
+docker inspect higherordermethods/selfish:v1.2.3-x86-rocm643-gfx90a | grep -A5 Labels
+```
+
+Key labels:
+- `com.fluidnumerics.rocm.target`: GPU architecture target
+- `com.fluidnumerics.selfish.version`: SELFish version
+- `org.opencontainers.image.version`: Container image version
 
 
diff --git a/envs/x86/gfx90a/Dockerfile b/envs/x86/rocm/Dockerfile
similarity index 58%
rename from envs/x86/gfx90a/Dockerfile
rename to 
envs/x86/rocm/Dockerfile
index fe8347c..7f5ce6e 100644
--- a/envs/x86/gfx90a/Dockerfile
+++ b/envs/x86/rocm/Dockerfile
@@ -1,5 +1,8 @@
 FROM docker.io/rockylinux:9 AS bootstrap
 
+ARG GPU_ARCH=gfx90a
+ARG GPU_BACKEND_VERSION=6.4.3
+
 ENV SPACK_ROOT=/opt/spack \
     CURRENTLY_BUILDING_DOCKER_IMAGE=1 \
     container=docker
@@ -9,6 +12,7 @@ RUN dnf update -y \
  && dnf update -y \
  && dnf --enablerepo epel install -y \
         bzip2 \
+        cmake \
         curl-minimal \
         file \
         findutils \
@@ -33,8 +37,19 @@ RUN dnf update -y \
  && rm -rf /var/cache/dnf \
  && dnf clean all
 
+# Install the ROCm/HIP SDK from AMD's repo; the version is substituted into rocm.repo
+COPY ./envs/x86/rocm/rocm.repo /etc/yum.repos.d/rocm.repo
+RUN sed -i "s/@GPU_BACKEND_VERSION@/${GPU_BACKEND_VERSION}/g" /etc/yum.repos.d/rocm.repo
+
+RUN dnf clean all && \
+    dnf update -y && \
+    dnf install -y rocm-hip-sdk rocm-llvm rocm-smi-lib rocminfo
+
+RUN ls -l /opt/rocm-${GPU_BACKEND_VERSION}/include/hip/hip_version.h
+
+# NOTE(review): `develop` is a moving ref; pin a tag or commit for reproducible builds
 RUN mkdir $SPACK_ROOT && cd $SPACK_ROOT && \
-    git init --quiet && git remote add origin https://github.com/spack/spack.git && git fetch --depth=1 origin v1.0.2 && git checkout --detach FETCH_HEAD && \
+    git init --quiet && git remote add origin https://github.com/spack/spack.git && git fetch --depth=1 origin develop && git checkout --detach FETCH_HEAD && \
     mkdir -p $SPACK_ROOT/opt/spack
 
 RUN ln -s $SPACK_ROOT/share/spack/docker/entrypoint.bash \
@@ -79,23 +94,56 @@ set -o noclobber \
 &&  (echo spack: \
 &&   echo '  specs:' \
 &&   echo '  - feq-parse@2.2.2' \
-&&   echo '  - mpich@4.2.3 +rocm' \
+&&   echo '  - openmpi@5.0.8 +rocm' \
 &&   echo '  - hdf5@1.14.5 +fortran +mpi' \
 &&   echo '  packages:' \
 &&   echo '    all:' \
 &&   echo '      require:' \
 &&   echo '      - target=x86_64_v3' \
 &&   echo '      prefer:' \
-&&   echo '      - amdgpu_target=gfx942' \
+&&   echo "      - amdgpu_target=$GPU_ARCH" \
+&&   echo '    cmake:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo '      - spec: "cmake@3.26.5"' \
+&&   echo '        prefix: "/usr"' \
+&&   echo '    rocm-smi-lib:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo "      - spec: \"rocm-smi-lib@${GPU_BACKEND_VERSION}\"" \
+&&   echo "        prefix: \"/opt/rocm-${GPU_BACKEND_VERSION}\"" \
+&&   echo '    hip:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo "      - spec: \"hip@${GPU_BACKEND_VERSION}\"" \
+&&   echo "        prefix: \"/opt/rocm-${GPU_BACKEND_VERSION}\"" \
+&&   echo '    hsa-rocr-dev:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo "      - spec: \"hsa-rocr-dev@${GPU_BACKEND_VERSION}\"" \
+&&   echo "        prefix: \"/opt/rocm-${GPU_BACKEND_VERSION}\"" \
+&&   echo '    llvm-amdgpu:' \
+&&   echo '      buildable: false' \
+&&   echo '      externals:' \
+&&   echo "      - spec: \"llvm-amdgpu@${GPU_BACKEND_VERSION}\"" \
+&&   echo "        prefix: \"/opt/rocm-${GPU_BACKEND_VERSION}\"" \
 &&   echo '' \
 &&   echo '  concretizer:' \
 &&   echo '    unify: true' \
 &&   echo '  config:' \
-&&   echo '    install_tree: /opt/software' \
+&&   echo '    install_tree:' \
+&&   echo '      root: /opt/software' \
 &&   echo '  view: /opt/views/view') > /opt/spack-environment/spack.yaml
 
+# Apply feq-parse patch to add "c" build dependency
+COPY ./envs/x86/rocm/feq-parse.patch /tmp/feq-parse.patch
+# Drop the repos/spack_repo/builtin suffix from the repo path so `patch -p1` resolves the patch's paths
+RUN SPACK_PKGS_ROOT=$(spack repo list | awk '{print $NF}') &&\
+    SPACK_BUILTIN_PKGS_ROOT=${SPACK_PKGS_ROOT/repos\/spack_repo\/builtin} &&\
+    patch -p1 -d $SPACK_BUILTIN_PKGS_ROOT < /tmp/feq-parse.patch
+
 # Install the software, remove unnecessary deps
-RUN cd /opt/spack-environment && spack env activate . && spack install --fail-fast && spack gc -y
+RUN cd /opt/spack-environment && spack env activate . && spack repo list && spack install --fail-fast && spack gc -y
 
 # Strip all the binaries
 RUN find -L /opt/views/view/* -type f -exec readlink -f '{}' \; | \
@@ -115,6 +163,16 @@ FROM docker.io/rockylinux:9
 COPY --from=builder /opt/spack-environment /opt/spack-environment
 COPY --from=builder /opt/software /opt/software
 
+# Install the HIP runtime libraries, then drop the dnf cache so it is not shipped in the final image
+COPY ./envs/x86/rocm/rocm.repo /etc/yum.repos.d/rocm.repo
+ARG GPU_BACKEND_VERSION=6.4.3
+RUN sed -i "s/@GPU_BACKEND_VERSION@/${GPU_BACKEND_VERSION}/g" /etc/yum.repos.d/rocm.repo && \
+    dnf clean all && \
+    dnf update -y && \
+    dnf install -y rocm-hip-libraries rocm-hip-runtime && \
+    dnf clean all && \
+    rm -rf /var/cache/dnf
+
 # paths.view is a symlink, so copy the parent to avoid dereferencing and duplicating it
 COPY --from=builder /opt/views /opt/views
 
diff --git a/envs/x86/rocm/feq-parse.patch b/envs/x86/rocm/feq-parse.patch
new file mode 100644
index 0000000..399f387
--- /dev/null
+++ b/envs/x86/rocm/feq-parse.patch
@@ -0,0 +1,12 @@
+diff --git a/repos/spack_repo/builtin/packages/feq_parse/package.py b/repos/spack_repo/builtin/packages/feq_parse/package.py
+index e4b960b7..bc0916b9 100644
+--- a/repos/spack_repo/builtin/packages/feq_parse/package.py
++++ b/repos/spack_repo/builtin/packages/feq_parse/package.py
+@@ -29,6 +29,7 @@ class FeqParse(CMakePackage):
+     version("1.0.2", sha256="1cd1db7562908ea16fc65dc5268b654405d0b3d9dcfe11f409949c431b48a3e8")
+ 
+     depends_on("fortran", type="build")  # generated
++    depends_on("c", type="build")  # generated
+ 
+     depends_on("cmake@3.0.2:", type="build")
+ 
diff --git a/envs/x86/rocm/rocm.repo b/envs/x86/rocm/rocm.repo
new file mode 100644
index 0000000..2df98d1
--- /dev/null
+++ b/envs/x86/rocm/rocm.repo
@@ -0,0 +1,7 @@
+[rocm]
+name=ROCm @GPU_BACKEND_VERSION@ repository
+baseurl=https://repo.radeon.com/rocm/el9/@GPU_BACKEND_VERSION@/main
+enabled=1
+priority=50
+gpgcheck=1
+gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
diff --git a/envs/x86/gfx90a/spack.yaml b/envs/x86/rocm/spack.yaml
similarity index 100%
rename from envs/x86/gfx90a/spack.yaml
rename to envs/x86/rocm/spack.yaml