From 6048202663e8030a903b9ca726ad781bfd93042d Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Wed, 4 Mar 2026 20:00:31 +0800 Subject: [PATCH 01/25] [plugin] add plugin dockerfile Signed-off-by: zejunchen-zejun --- docker/Dockerfile_For_OOT | 138 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 docker/Dockerfile_For_OOT diff --git a/docker/Dockerfile_For_OOT b/docker/Dockerfile_For_OOT new file mode 100644 index 000000000..b1967e299 --- /dev/null +++ b/docker/Dockerfile_For_OOT @@ -0,0 +1,138 @@ +ARG BASE_IMAGE="rocm/vllm-dev:base" +ARG GPU_ARCH="gfx942;gfx950" + +FROM ${BASE_IMAGE} + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +ARG GPU_ARCH +ENV GPU_ARCH_LIST=${GPU_ARCH} +ENV PYTORCH_ROCM_ARCH=${GPU_ARCH} +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_LINK_MODE=copy +ENV TOKENIZERS_PARALLELISM=false +ENV SAFETENSORS_FAST_GPU=1 +ENV HIP_FORCE_DEV_KERNARG=1 + +# ---- ATOM build args ---- +ARG ATOM_REPO="https://github.com/ROCm/ATOM.git" +ARG ATOM_COMMIT="HEAD" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" +ARG AITER_COMMIT="HEAD" +ARG MORI_COMMIT="b0dce4beebeb1f26c784eee17d5fd9785ee9447f" +ARG PREBUILD_KERNELS=1 +ARG MAX_JOBS=64 + +# ---- vLLM build args ---- +ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" +ARG VLLM_COMMIT="main" + +# ---- common dependencies (union of atom + vllm rocm requirements) ---- +RUN apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + git \ + curl \ + wget \ + ca-certificates \ + apt-transport-https \ + sqlite3 \ + libsqlite3-dev \ + libfmt-dev \ + libmsgpack-dev \ + libsuitesparse-dev \ + cython3 \ + ibverbs-utils \ + openmpi-bin \ + libopenmpi-dev \ + libpci-dev \ + cmake \ + libdw1 \ + locales \ + && rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip install --upgrade pip \ + && python3 -m pip install lm-eval[api] cmake + +# vLLM docker uses uv heavily; keep installer consistent with upstream rocm flow +RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh + +WORKDIR /app + +# --------------------------- +# Install MORI +# --------------------------- +RUN git clone https://github.com/ROCm/mori.git /app/mori \ + && cd /app/mori \ + && git checkout ${MORI_COMMIT} \ + && python3 -m pip install -r requirements-build.txt \ + && git submodule update --init --recursive \ + && python3 setup.py install + +# --------------------------- +# Update RCCL +# --------------------------- +ARG RCCL_REPO="https://github.com/ROCm/rccl.git" +ARG RCCL_BRANCH="29e1567b95e28823b0beb1a988adc587bfab5b4f" +ARG RCCL_SRC_DIR="/app/rccl" +RUN git clone ${RCCL_REPO} ${RCCL_SRC_DIR} \ + && cd ${RCCL_SRC_DIR} \ + && git checkout ${RCCL_BRANCH} \ + && ./install.sh -p --amdgpu_targets=${GPU_ARCH_LIST} \ + && DEBIAN_FRONTEND=noninteractive dpkg -i --force-all ${RCCL_SRC_DIR}/build/release/*.deb \ + && rm -rf ${RCCL_SRC_DIR}/build + +# --------------------------- +# Update Triton +# --------------------------- +RUN python3 -m pip uninstall -y triton || true \ + && git clone --depth=1 --branch release/internal/3.5.x https://github.com/ROCm/triton.git /app/triton \ + && cd /app/triton \ + && python3 -m pip install -r python/requirements.txt \ + && python3 -m pip install filecheck \ + && MAX_JOBS=${MAX_JOBS} python3 -m pip --retries=10 --default-timeout=60 install . + +# --------------------------- +# Install AITER +# --------------------------- +RUN python3 -m pip uninstall -y aiter || true \ + && git clone ${AITER_REPO} /app/aiter \ + && cd /app/aiter \ + && git checkout ${AITER_COMMIT} \ + && python3 -m pip install -r requirements.txt \ + && git submodule sync \ + && git submodule update --init --recursive \ + && MAX_JOBS=${MAX_JOBS} PREBUILD_KERNELS=${PREBUILD_KERNELS} GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop + +# --------------------------- +# Build and install vLLM (commit selectable) +# --------------------------- +RUN git clone ${VLLM_REPO} /app/vllm \ + && cd /app/vllm \ + && git fetch --all --tags \ + && git checkout ${VLLM_COMMIT} \ + && python3 -m pip install -r requirements/rocm.txt \ + && python3 setup.py clean --all \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && python3 -m pip uninstall -y vllm || true \ + && python3 -m pip install dist/*.whl + +# --------------------------- +# Install ATOM +# --------------------------- +RUN python3 -m pip uninstall -y atom || true \ + && git clone ${ATOM_REPO} /app/ATOM \ + && cd /app/ATOM \ + && git checkout ${ATOM_COMMIT} \ + && python3 -m pip install -e . + +# A common profiler workaround from vllm ROCm image +RUN echo "ROCTRACER_MAX_EVENTS=10000000" > /app/libkineto.conf +ENV KINETO_CONFIG="/app/libkineto.conf" + +# Print key versions for quick smoke verification +RUN python3 -m pip show vllm || true \ + && python3 -m pip show atom || true \ + && python3 -m pip show amd-aiter || true \ + && python3 -m pip show mori || true + +CMD ["/bin/bash"] From cc6a3e27a155b308399887987d4e7ec4ee83f28e Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Wed, 4 Mar 2026 20:04:52 +0800 Subject: [PATCH 02/25] add build sh Signed-off-by: zejunchen-zejun --- docker/build_oot_docker.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 docker/build_oot_docker.sh diff --git a/docker/build_oot_docker.sh b/docker/build_oot_docker.sh new file mode 100644 index 000000000..f7bff6fbb --- /dev/null +++ b/docker/build_oot_docker.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_For_OOT" +VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895" +IMAGE_TAG="${IMAGE_TAG:-atom-vllm-oot:f5d17400}" + +echo "Building OOT image with:" +echo " Dockerfile : ${DOCKERFILE_PATH}" +echo " Image tag : ${IMAGE_TAG}" +echo " VLLM commit: ${VLLM_COMMIT}" + +docker build \ + -f "${DOCKERFILE_PATH}" \ + -t "${IMAGE_TAG}" \ + --build-arg "VLLM_COMMIT=${VLLM_COMMIT}" \ + "$@" \ + "${REPO_ROOT}" + From e64b92053a4ba078ce680eed998e633aec5b7e0f Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Wed, 4 Mar 2026 20:16:30 +0800 Subject: [PATCH 03/25] add Signed-off-by: zejunchen-zejun --- docker/build_oot_docker.sh | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/docker/build_oot_docker.sh b/docker/build_oot_docker.sh index f7bff6fbb..791e3412b 100644 --- a/docker/build_oot_docker.sh +++ b/docker/build_oot_docker.sh @@ -5,17 +5,32 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_For_OOT" +VLLM_BASE_DOCKERFILE="${REPO_ROOT}/vllm/docker/Dockerfile.rocm_base" VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895" -IMAGE_TAG="${IMAGE_TAG:-atom-vllm-oot:f5d17400}" +IMAGE_TAG="${IMAGE_TAG:-atom-vllm-oot}" +VLLM_BASE_IMAGE="${VLLM_BASE_IMAGE:-rocm/vllm-dev:base}" +BUILD_VLLM_BASE="${BUILD_VLLM_BASE:-1}" -echo "Building OOT image with:" +if [[ "${BUILD_VLLM_BASE}" == "1" ]]; then + echo "Step 1/2: build vLLM ROCm base image" + echo " Dockerfile : ${VLLM_BASE_DOCKERFILE}" + echo " Image tag : ${VLLM_BASE_IMAGE}" + DOCKER_BUILDKIT=1 docker build \ + -f "${VLLM_BASE_DOCKERFILE}" \ + -t "${VLLM_BASE_IMAGE}" \ + "${REPO_ROOT}/vllm" +fi + +echo "Step 2/2: build ATOM OOT image" echo " Dockerfile : ${DOCKERFILE_PATH}" echo " Image tag : ${IMAGE_TAG}" +echo " Base image : ${VLLM_BASE_IMAGE}" echo " VLLM commit: ${VLLM_COMMIT}" -docker build \ +DOCKER_BUILDKIT=1 docker build \ -f "${DOCKERFILE_PATH}" \ -t "${IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VLLM_BASE_IMAGE}" \ --build-arg "VLLM_COMMIT=${VLLM_COMMIT}" \ "$@" \ "${REPO_ROOT}" From eab9017a8e030f51fb614e260c8d83133915e2ec Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Wed, 4 Mar 2026 20:19:25 +0800 Subject: [PATCH 04/25] add Signed-off-by: zejunchen-zejun --- docker/build_oot_docker.sh | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/docker/build_oot_docker.sh b/docker/build_oot_docker.sh index 791e3412b..3c90c6916 100644 --- a/docker/build_oot_docker.sh +++ b/docker/build_oot_docker.sh @@ -5,20 +5,35 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_For_OOT" -VLLM_BASE_DOCKERFILE="${REPO_ROOT}/vllm/docker/Dockerfile.rocm_base" +VLLM_REPO="${VLLM_REPO:-https://github.com/vllm-project/vllm.git}" +VLLM_DOCKER_REF="${VLLM_DOCKER_REF:-main}" VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895" IMAGE_TAG="${IMAGE_TAG:-atom-vllm-oot}" VLLM_BASE_IMAGE="${VLLM_BASE_IMAGE:-rocm/vllm-dev:base}" BUILD_VLLM_BASE="${BUILD_VLLM_BASE:-1}" if [[ "${BUILD_VLLM_BASE}" == "1" ]]; then - echo "Step 1/2: build vLLM ROCm base image" - echo " Dockerfile : ${VLLM_BASE_DOCKERFILE}" - echo " Image tag : ${VLLM_BASE_IMAGE}" + VLLM_TMP_DIR="$(mktemp -d)" + trap 'rm -rf "${VLLM_TMP_DIR}"' EXIT + + echo "Step 1/2: clone vLLM and build ROCm base image" + echo " vLLM repo : ${VLLM_REPO}" + echo " vLLM ref : ${VLLM_DOCKER_REF}" + echo " Image tag : ${VLLM_BASE_IMAGE}" + + git clone "${VLLM_REPO}" "${VLLM_TMP_DIR}" + git -C "${VLLM_TMP_DIR}" checkout "${VLLM_DOCKER_REF}" + + VLLM_BASE_DOCKERFILE="${VLLM_TMP_DIR}/docker/Dockerfile.rocm_base" + if [[ ! -f "${VLLM_BASE_DOCKERFILE}" ]]; then + echo "ERROR: cannot find ${VLLM_BASE_DOCKERFILE}" + exit 1 + fi + DOCKER_BUILDKIT=1 docker build \ -f "${VLLM_BASE_DOCKERFILE}" \ -t "${VLLM_BASE_IMAGE}" \ - "${REPO_ROOT}/vllm" + "${VLLM_TMP_DIR}" fi echo "Step 2/2: build ATOM OOT image" From 64610493a7d2092e79125f5a0c5fc035e188bcd9 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Wed, 4 Mar 2026 21:00:44 +0800 Subject: [PATCH 05/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_For_OOT | 80 +++----------------------------------- docker/build_oot_docker.sh | 18 ++++++--- 2 files changed, 18 insertions(+), 80 deletions(-) diff --git a/docker/Dockerfile_For_OOT b/docker/Dockerfile_For_OOT index b1967e299..f6eb62e49 100644 --- a/docker/Dockerfile_For_OOT +++ b/docker/Dockerfile_For_OOT @@ -1,4 +1,4 @@ -ARG BASE_IMAGE="rocm/vllm-dev:base" +ARG BASE_IMAGE="rocm/atom-dev:nightly_202603040155" ARG GPU_ARCH="gfx942;gfx950" FROM ${BASE_IMAGE} @@ -15,20 +15,11 @@ ENV TOKENIZERS_PARALLELISM=false ENV SAFETENSORS_FAST_GPU=1 ENV HIP_FORCE_DEV_KERNARG=1 -# ---- ATOM build args ---- -ARG ATOM_REPO="https://github.com/ROCm/ATOM.git" -ARG ATOM_COMMIT="HEAD" -ARG AITER_REPO="https://github.com/ROCm/aiter.git" -ARG AITER_COMMIT="HEAD" -ARG MORI_COMMIT="b0dce4beebeb1f26c784eee17d5fd9785ee9447f" -ARG PREBUILD_KERNELS=1 -ARG MAX_JOBS=64 - # ---- vLLM build args ---- ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" ARG VLLM_COMMIT="main" -# ---- common dependencies (union of atom + vllm rocm requirements) ---- +# ---- dependencies needed by vLLM build ---- RUN apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y \ git \ curl \ @@ -40,14 +31,7 @@ RUN apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y \ libfmt-dev \ libmsgpack-dev \ libsuitesparse-dev \ - cython3 \ - ibverbs-utils \ - openmpi-bin \ - libopenmpi-dev \ - libpci-dev \ cmake \ - libdw1 \ - locales \ && rm -rf /var/lib/apt/lists/* RUN python3 -m pip install --upgrade pip \ @@ -58,73 +42,21 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/ WORKDIR /app -# --------------------------- -# Install MORI -# --------------------------- -RUN git clone https://github.com/ROCm/mori.git /app/mori \ - && cd /app/mori \ - && git checkout ${MORI_COMMIT} \ - && python3 -m pip install -r requirements-build.txt \ - && git submodule update --init --recursive \ - && python3 setup.py install - -# --------------------------- -# Update RCCL -# --------------------------- -ARG RCCL_REPO="https://github.com/ROCm/rccl.git" -ARG RCCL_BRANCH="29e1567b95e28823b0beb1a988adc587bfab5b4f" -ARG RCCL_SRC_DIR="/app/rccl" -RUN git clone ${RCCL_REPO} ${RCCL_SRC_DIR} \ - && cd ${RCCL_SRC_DIR} \ - && git checkout ${RCCL_BRANCH} \ - && ./install.sh -p --amdgpu_targets=${GPU_ARCH_LIST} \ - && DEBIAN_FRONTEND=noninteractive dpkg -i --force-all ${RCCL_SRC_DIR}/build/release/*.deb \ - && rm -rf ${RCCL_SRC_DIR}/build - -# --------------------------- -# Update Triton -# --------------------------- -RUN python3 -m pip uninstall -y triton || true \ - && git clone --depth=1 --branch release/internal/3.5.x https://github.com/ROCm/triton.git /app/triton \ - && cd /app/triton \ - && python3 -m pip install -r python/requirements.txt \ - && python3 -m pip install filecheck \ - && MAX_JOBS=${MAX_JOBS} python3 -m pip --retries=10 --default-timeout=60 install . - -# --------------------------- -# Install AITER -# --------------------------- -RUN python3 -m pip uninstall -y aiter || true \ - && git clone ${AITER_REPO} /app/aiter \ - && cd /app/aiter \ - && git checkout ${AITER_COMMIT} \ - && python3 -m pip install -r requirements.txt \ - && git submodule sync \ - && git submodule update --init --recursive \ - && MAX_JOBS=${MAX_JOBS} PREBUILD_KERNELS=${PREBUILD_KERNELS} GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop - # --------------------------- # Build and install vLLM (commit selectable) +# Reuse triton and aiter from atom base image. # --------------------------- RUN git clone ${VLLM_REPO} /app/vllm \ && cd /app/vllm \ && git fetch --all --tags \ && git checkout ${VLLM_COMMIT} \ - && python3 -m pip install -r requirements/rocm.txt \ + && sed -E '/(^|[<>=[:space:]])(triton|aiter|amd-aiter)([<>=[:space:]]|$)/Id' requirements/rocm.txt > /tmp/rocm_no_triton_aiter.txt \ + && python3 -m pip install -r /tmp/rocm_no_triton_aiter.txt \ && python3 setup.py clean --all \ && python3 setup.py bdist_wheel --dist-dir=dist \ && python3 -m pip uninstall -y vllm || true \ && python3 -m pip install dist/*.whl -# --------------------------- -# Install ATOM -# --------------------------- -RUN python3 -m pip uninstall -y atom || true \ - && git clone ${ATOM_REPO} /app/ATOM \ - && cd /app/ATOM \ - && git checkout ${ATOM_COMMIT} \ - && python3 -m pip install -e . - # A common profiler workaround from vllm ROCm image RUN echo "ROCTRACER_MAX_EVENTS=10000000" > /app/libkineto.conf ENV KINETO_CONFIG="/app/libkineto.conf" @@ -133,6 +65,6 @@ ENV KINETO_CONFIG="/app/libkineto.conf" RUN python3 -m pip show vllm || true \ && python3 -m pip show atom || true \ && python3 -m pip show amd-aiter || true \ - && python3 -m pip show mori || true + && python3 -m pip show triton || true CMD ["/bin/bash"] diff --git a/docker/build_oot_docker.sh b/docker/build_oot_docker.sh index 3c90c6916..ee68c6fb5 100644 --- a/docker/build_oot_docker.sh +++ b/docker/build_oot_docker.sh @@ -6,17 +6,19 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_For_OOT" VLLM_REPO="${VLLM_REPO:-https://github.com/vllm-project/vllm.git}" -VLLM_DOCKER_REF="${VLLM_DOCKER_REF:-main}" VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895" +VLLM_DOCKER_REF="${VLLM_DOCKER_REF:-${VLLM_COMMIT}}" IMAGE_TAG="${IMAGE_TAG:-atom-vllm-oot}" +ATOM_BASE_IMAGE="${ATOM_BASE_IMAGE:-rocm/atom-dev:nightly_202603040155}" +BASE_IMAGE="${BASE_IMAGE:-${ATOM_BASE_IMAGE}}" VLLM_BASE_IMAGE="${VLLM_BASE_IMAGE:-rocm/vllm-dev:base}" -BUILD_VLLM_BASE="${BUILD_VLLM_BASE:-1}" +BUILD_VLLM_BASE="${BUILD_VLLM_BASE:-0}" if [[ "${BUILD_VLLM_BASE}" == "1" ]]; then VLLM_TMP_DIR="$(mktemp -d)" trap 'rm -rf "${VLLM_TMP_DIR}"' EXIT - echo "Step 1/2: clone vLLM and build ROCm base image" + echo "Step 1/2: build vLLM ROCm base image" echo " vLLM repo : ${VLLM_REPO}" echo " vLLM ref : ${VLLM_DOCKER_REF}" echo " Image tag : ${VLLM_BASE_IMAGE}" @@ -34,18 +36,22 @@ if [[ "${BUILD_VLLM_BASE}" == "1" ]]; then -f "${VLLM_BASE_DOCKERFILE}" \ -t "${VLLM_BASE_IMAGE}" \ "${VLLM_TMP_DIR}" + + BASE_IMAGE="${VLLM_BASE_IMAGE}" +else + echo "Step 1/2: skip vLLM base build (using atom base image)" fi -echo "Step 2/2: build ATOM OOT image" +echo "Step 2/2: build remaining vLLM + ATOM OOT image" echo " Dockerfile : ${DOCKERFILE_PATH}" echo " Image tag : ${IMAGE_TAG}" -echo " Base image : ${VLLM_BASE_IMAGE}" +echo " Base image : ${BASE_IMAGE}" echo " VLLM commit: ${VLLM_COMMIT}" DOCKER_BUILDKIT=1 docker build \ -f "${DOCKERFILE_PATH}" \ -t "${IMAGE_TAG}" \ - --build-arg "BASE_IMAGE=${VLLM_BASE_IMAGE}" \ + --build-arg "BASE_IMAGE=${BASE_IMAGE}" \ --build-arg "VLLM_COMMIT=${VLLM_COMMIT}" \ "$@" \ "${REPO_ROOT}" From 70084daa1e9127518a6efaa84c8bb51d283ad0a5 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 09:06:41 +0800 Subject: [PATCH 06/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_For_OOT | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/docker/Dockerfile_For_OOT b/docker/Dockerfile_For_OOT index f6eb62e49..9283ce3b7 100644 --- a/docker/Dockerfile_For_OOT +++ b/docker/Dockerfile_For_OOT @@ -8,9 +8,6 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG GPU_ARCH ENV GPU_ARCH_LIST=${GPU_ARCH} ENV PYTORCH_ROCM_ARCH=${GPU_ARCH} -ENV UV_HTTP_TIMEOUT=500 -ENV UV_INDEX_STRATEGY="unsafe-best-match" -ENV UV_LINK_MODE=copy ENV TOKENIZERS_PARALLELISM=false ENV SAFETENSORS_FAST_GPU=1 ENV HIP_FORCE_DEV_KERNARG=1 @@ -19,26 +16,8 @@ ENV HIP_FORCE_DEV_KERNARG=1 ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" ARG VLLM_COMMIT="main" -# ---- dependencies needed by vLLM build ---- -RUN apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y \ - git \ - curl \ - wget \ - ca-certificates \ - apt-transport-https \ - sqlite3 \ - libsqlite3-dev \ - libfmt-dev \ - libmsgpack-dev \ - libsuitesparse-dev \ - cmake \ - && rm -rf /var/lib/apt/lists/* - RUN python3 -m pip install --upgrade pip \ - && python3 -m pip install lm-eval[api] cmake - -# vLLM docker uses uv heavily; keep installer consistent with upstream rocm flow -RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh + && python3 -m pip install cmake WORKDIR /app From 49b16fd316640a9ed68d015de0aae1239d376cbe Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 09:12:18 +0800 Subject: [PATCH 07/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_For_OOT | 6 +++--- docker/build_oot_docker.sh | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile_For_OOT b/docker/Dockerfile_For_OOT index 9283ce3b7..79b688600 100644 --- a/docker/Dockerfile_For_OOT +++ b/docker/Dockerfile_For_OOT @@ -29,11 +29,11 @@ RUN git clone ${VLLM_REPO} /app/vllm \ && cd /app/vllm \ && git fetch --all --tags \ && git checkout ${VLLM_COMMIT} \ - && sed -E '/(^|[<>=[:space:]])(triton|aiter|amd-aiter)([<>=[:space:]]|$)/Id' requirements/rocm.txt > /tmp/rocm_no_triton_aiter.txt \ - && python3 -m pip install -r /tmp/rocm_no_triton_aiter.txt \ + && sed -E '/(^|[<>=[:space:]])(triton|aiter|amd-aiter)([<>=[:space:]]|$)/Id' requirements/rocm.txt > requirements/rocm_no_triton_aiter.txt \ + && python3 -m pip install -r requirements/rocm_no_triton_aiter.txt \ && python3 setup.py clean --all \ && python3 setup.py bdist_wheel --dist-dir=dist \ - && python3 -m pip uninstall -y vllm || true \ + && (python3 -m pip uninstall -y vllm || true) \ && python3 -m pip install dist/*.whl # A common profiler workaround from vllm ROCm image diff --git a/docker/build_oot_docker.sh b/docker/build_oot_docker.sh index ee68c6fb5..eb2a1d2e0 100644 --- a/docker/build_oot_docker.sh +++ b/docker/build_oot_docker.sh @@ -14,6 +14,13 @@ BASE_IMAGE="${BASE_IMAGE:-${ATOM_BASE_IMAGE}}" VLLM_BASE_IMAGE="${VLLM_BASE_IMAGE:-rocm/vllm-dev:base}" BUILD_VLLM_BASE="${BUILD_VLLM_BASE:-0}" +echo "========================================" +echo "OOT Docker build config" +echo " Base image : ${BASE_IMAGE}" +echo " VLLM commit : ${VLLM_COMMIT}" +echo " Final image name : ${IMAGE_TAG}" +echo "========================================" + if [[ "${BUILD_VLLM_BASE}" == "1" ]]; then VLLM_TMP_DIR="$(mktemp -d)" trap 'rm -rf "${VLLM_TMP_DIR}"' EXIT @@ -56,3 +63,8 @@ DOCKER_BUILDKIT=1 docker build \ "$@" \ "${REPO_ROOT}" +echo "Build finished." +echo " Final image name : ${IMAGE_TAG}" +echo " Base image used : ${BASE_IMAGE}" +echo " VLLM commit used : ${VLLM_COMMIT}" + From 1230c139a7562df68c8c4f9e2d8a0ef1f707a103 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 09:41:28 +0800 Subject: [PATCH 08/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_For_OOT | 101 +++++++++++++++++++++++++++++++++---- docker/build_oot_docker.sh | 14 +++-- 2 files changed, 100 insertions(+), 15 deletions(-) diff --git a/docker/Dockerfile_For_OOT b/docker/Dockerfile_For_OOT index 79b688600..5b964583e 100644 --- a/docker/Dockerfile_For_OOT +++ b/docker/Dockerfile_For_OOT @@ -1,4 +1,4 @@ -ARG BASE_IMAGE="rocm/atom-dev:nightly_202603040155" +ARG BASE_IMAGE="rocm/vllm-dev:base" ARG GPU_ARCH="gfx942;gfx950" FROM ${BASE_IMAGE} @@ -8,6 +8,7 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG GPU_ARCH ENV GPU_ARCH_LIST=${GPU_ARCH} ENV PYTORCH_ROCM_ARCH=${GPU_ARCH} +ENV VLLM_TARGET_DEVICE=rocm ENV TOKENIZERS_PARALLELISM=false ENV SAFETENSORS_FAST_GPU=1 ENV HIP_FORCE_DEV_KERNARG=1 @@ -16,32 +17,112 @@ ENV HIP_FORCE_DEV_KERNARG=1 ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" ARG VLLM_COMMIT="main" -RUN python3 -m pip install --upgrade pip \ - && python3 -m pip install cmake +# ---- ATOM build args ---- +ARG ATOM_REPO="https://github.com/ROCm/ATOM.git" +ARG ATOM_COMMIT="HEAD" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" +ARG AITER_COMMIT="HEAD" +ARG MORI_COMMIT="b0dce4beebeb1f26c784eee17d5fd9785ee9447f" +ARG PREBUILD_KERNELS=1 +ARG MAX_JOBS=64 WORKDIR /app +RUN apt-get update && apt --fix-broken install -y \ + && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + git \ + cython3 \ + ibverbs-utils \ + openmpi-bin \ + libopenmpi-dev \ + libpci-dev \ + cmake \ + libdw1 \ + locales \ + && rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip install --upgrade pip \ + && python3 -m pip install lm-eval[api] + # --------------------------- -# Build and install vLLM (commit selectable) -# Reuse triton and aiter from atom base image. +# Build and install vLLM (ROCm path) +# Matches upstream Dockerfile.rocm flow. # --------------------------- RUN git clone ${VLLM_REPO} /app/vllm \ && cd /app/vllm \ && git fetch --all --tags \ && git checkout ${VLLM_COMMIT} \ - && sed -E '/(^|[<>=[:space:]])(triton|aiter|amd-aiter)([<>=[:space:]]|$)/Id' requirements/rocm.txt > requirements/rocm_no_triton_aiter.txt \ - && python3 -m pip install -r requirements/rocm_no_triton_aiter.txt \ + && python3 -m pip install -r requirements/rocm.txt \ && python3 setup.py clean --all \ && python3 setup.py bdist_wheel --dist-dir=dist \ && (python3 -m pip uninstall -y vllm || true) \ && python3 -m pip install dist/*.whl -# A common profiler workaround from vllm ROCm image +# Lock torch version after vLLM install so ATOM stage cannot change it. +RUN TORCH_VERSION="$(python3 -m pip show torch | sed -n 's/^Version: //p')" \ + && echo "torch==${TORCH_VERSION}" > /tmp/torch_pin.txt \ + && cat /tmp/torch_pin.txt + +# --------------------------- +# Install MORI +# --------------------------- +RUN git clone https://github.com/ROCm/mori.git /app/mori \ + && cd /app/mori \ + && git checkout ${MORI_COMMIT} \ + && python3 -m pip install -c /tmp/torch_pin.txt -r requirements-build.txt \ + && git submodule update --init --recursive \ + && python3 setup.py install + +# --------------------------- +# Update RCCL +# --------------------------- +ARG RCCL_REPO="https://github.com/ROCm/rccl.git" +ARG RCCL_BRANCH="29e1567b95e28823b0beb1a988adc587bfab5b4f" +ARG RCCL_SRC_DIR="/app/rccl" +RUN python3 -m pip install cmake \ + && git clone "${RCCL_REPO}" "${RCCL_SRC_DIR}" \ + && cd "${RCCL_SRC_DIR}" \ + && git checkout "${RCCL_BRANCH}" \ + && ./install.sh -p --amdgpu_targets=${GPU_ARCH_LIST} \ + && DEBIAN_FRONTEND=noninteractive dpkg -i --force-all ${RCCL_SRC_DIR}/build/release/*.deb \ + && rm -rf ${RCCL_SRC_DIR}/build + +# --------------------------- +# Update Triton +# --------------------------- +RUN (python3 -m pip uninstall -y triton || true) \ + && git clone --depth=1 --branch release/internal/3.5.x https://github.com/ROCm/triton.git /app/triton-test \ + && cd /app/triton-test \ + && python3 -m pip install -c /tmp/torch_pin.txt -r python/requirements.txt \ + && python3 -m pip install -c /tmp/torch_pin.txt filecheck \ + && MAX_JOBS=${MAX_JOBS} python3 -m pip --retries=10 --default-timeout=60 install . + +# --------------------------- +# Install AITER +# --------------------------- +RUN (python3 -m pip uninstall -y aiter || true) \ + && git clone ${AITER_REPO} /app/aiter-test \ + && cd /app/aiter-test \ + && git checkout ${AITER_COMMIT} \ + && python3 -m pip install -c /tmp/torch_pin.txt -r requirements.txt \ + && git submodule sync \ + && git submodule update --init --recursive \ + && MAX_JOBS=${MAX_JOBS} PREBUILD_KERNELS=${PREBUILD_KERNELS} GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop + +# --------------------------- +# Install ATOM (do not change torch from vLLM stage) +# --------------------------- +RUN (python3 -m pip uninstall -y atom || true) \ + && git clone ${ATOM_REPO} /app/ATOM \ + && cd /app/ATOM \ + && git checkout ${ATOM_COMMIT} \ + && python3 -m pip install -c /tmp/torch_pin.txt -e . + RUN echo "ROCTRACER_MAX_EVENTS=10000000" > /app/libkineto.conf ENV KINETO_CONFIG="/app/libkineto.conf" -# Print key versions for quick smoke verification -RUN python3 -m pip show vllm || true \ +RUN python3 -m pip show torch || true \ + && python3 -m pip show vllm || true \ && python3 -m pip show atom || true \ && python3 -m pip show amd-aiter || true \ && python3 -m pip show triton || true diff --git a/docker/build_oot_docker.sh b/docker/build_oot_docker.sh index eb2a1d2e0..3ce79b34f 100644 --- a/docker/build_oot_docker.sh +++ b/docker/build_oot_docker.sh @@ -9,16 +9,17 @@ VLLM_REPO="${VLLM_REPO:-https://github.com/vllm-project/vllm.git}" VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895" VLLM_DOCKER_REF="${VLLM_DOCKER_REF:-${VLLM_COMMIT}}" IMAGE_TAG="${IMAGE_TAG:-atom-vllm-oot}" -ATOM_BASE_IMAGE="${ATOM_BASE_IMAGE:-rocm/atom-dev:nightly_202603040155}" -BASE_IMAGE="${BASE_IMAGE:-${ATOM_BASE_IMAGE}}" +VLLM_ROCM_BASE_IMAGE="${VLLM_ROCM_BASE_IMAGE:-rocm/dev-ubuntu-22.04:7.0-complete}" VLLM_BASE_IMAGE="${VLLM_BASE_IMAGE:-rocm/vllm-dev:base}" -BUILD_VLLM_BASE="${BUILD_VLLM_BASE:-0}" +BASE_IMAGE="${BASE_IMAGE:-${VLLM_BASE_IMAGE}}" +BUILD_VLLM_BASE="${BUILD_VLLM_BASE:-1}" echo "========================================" echo "OOT Docker build config" echo " Base image : ${BASE_IMAGE}" echo " VLLM commit : ${VLLM_COMMIT}" echo " Final image name : ${IMAGE_TAG}" +echo " Build vLLM base : ${BUILD_VLLM_BASE}" echo "========================================" if [[ "${BUILD_VLLM_BASE}" == "1" ]]; then @@ -28,6 +29,7 @@ if [[ "${BUILD_VLLM_BASE}" == "1" ]]; then echo "Step 1/2: build vLLM ROCm base image" echo " vLLM repo : ${VLLM_REPO}" echo " vLLM ref : ${VLLM_DOCKER_REF}" + echo " Base image : ${VLLM_ROCM_BASE_IMAGE}" echo " Image tag : ${VLLM_BASE_IMAGE}" git clone "${VLLM_REPO}" "${VLLM_TMP_DIR}" @@ -42,14 +44,15 @@ if [[ "${BUILD_VLLM_BASE}" == "1" ]]; then DOCKER_BUILDKIT=1 docker build \ -f "${VLLM_BASE_DOCKERFILE}" \ -t "${VLLM_BASE_IMAGE}" \ + --build-arg "BASE_IMAGE=${VLLM_ROCM_BASE_IMAGE}" \ "${VLLM_TMP_DIR}" BASE_IMAGE="${VLLM_BASE_IMAGE}" else - echo "Step 1/2: skip vLLM base build (using atom base image)" + echo "Step 1/2: skip vLLM base build (use existing ${VLLM_BASE_IMAGE})" fi -echo "Step 2/2: build remaining vLLM + ATOM OOT image" +echo "Step 2/2: build vLLM + ATOM OOT image" echo " Dockerfile : ${DOCKERFILE_PATH}" echo " Image tag : ${IMAGE_TAG}" echo " Base image : ${BASE_IMAGE}" @@ -59,6 +62,7 @@ DOCKER_BUILDKIT=1 docker build \ -f "${DOCKERFILE_PATH}" \ -t "${IMAGE_TAG}" \ --build-arg "BASE_IMAGE=${BASE_IMAGE}" \ + --build-arg "VLLM_REPO=${VLLM_REPO}" \ --build-arg "VLLM_COMMIT=${VLLM_COMMIT}" \ "$@" \ "${REPO_ROOT}" From 6d2ccfa61b4197380d9e75ee61a727b27b47df0e Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 10:42:28 +0800 Subject: [PATCH 09/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 88 +++++++++++++++++++++++++++++++++ docker/build_vllm_atom_oot.sh | 34 +++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 docker/Dockerfile_vllm_atom_oot create mode 100644 docker/build_vllm_atom_oot.sh diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot new file mode 100644 index 000000000..2e1a9b0b3 --- /dev/null +++ b/docker/Dockerfile_vllm_atom_oot @@ -0,0 +1,88 @@ +ARG BASE_IMAGE="rocm/vllm-dev:nightly_main_20260118" +ARG GPU_ARCH="gfx942;gfx950" + +FROM ${BASE_IMAGE} + +ARG GPU_ARCH +ENV GPU_ARCH_LIST=${GPU_ARCH} +ENV PYTORCH_ROCM_ARCH=${GPU_ARCH} + +ARG ATOM_REPO="https://github.com/ROCm/ATOM.git" +ARG ATOM_COMMIT="HEAD" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" +ARG AITER_COMMIT="HEAD" +ARG MORI_COMMIT="b0dce4beebeb1f26c784eee17d5fd9785ee9447f" +ARG BUILD_MORI=0 +ARG PREBUILD_KERNELS=1 +ARG MAX_JOBS=64 + +RUN pip install --upgrade pip +RUN pip install lm-eval[api] +RUN pip show lm-eval || true + +# Install MORI (Modular RDMA Interface) +RUN apt-get update && apt --fix-broken install -y +RUN apt-get update && apt-get install -y \ + git \ + cython3 \ + ibverbs-utils \ + openmpi-bin \ + libopenmpi-dev \ + libpci-dev \ + cmake \ + libdw1 \ + locales + +RUN if [ "${BUILD_MORI}" = "1" ]; then \ + git clone https://github.com/ROCm/mori.git /app/mori && \ + cd /app/mori && \ + git checkout ${MORI_COMMIT} && \ + pip install -r requirements-build.txt && \ + git submodule update --init --recursive && \ + python setup.py install && \ + pip show mori || true; \ + else \ + echo "Skip MORI build (BUILD_MORI=${BUILD_MORI})"; \ + fi + +# Update RCCL +ARG RCCL_REPO="https://github.com/ROCm/rccl.git" +ARG RCCL_BRANCH="29e1567b95e28823b0beb1a988adc587bfab5b4f" +ARG RCCL_SRC_DIR="/app/rccl/" +RUN pip install cmake +RUN git clone "${RCCL_REPO}" ${RCCL_SRC_DIR} \ + && cd ${RCCL_SRC_DIR} \ + && git checkout "${RCCL_BRANCH}" \ + && ./install.sh -p --amdgpu_targets=${GPU_ARCH_LIST} +RUN DEBIAN_FRONTEND=noninteractive dpkg -i --force-all ${RCCL_SRC_DIR}/build/release/*.deb && rm -rf ${RCCL_SRC_DIR}/build + +# Update Triton +RUN pip show triton || true +RUN pip uninstall -y triton || true +RUN git clone --depth=1 --branch release/internal/3.5.x https://github.com/ROCm/triton.git /triton-test && \ + cd /triton-test && \ + pip install -r python/requirements.txt && \ + pip install filecheck && \ + MAX_JOBS=${MAX_JOBS} pip --retries=10 --default-timeout=60 install . +RUN pip show triton || true + +# Install AITER +RUN mkdir -p /app +RUN pip uninstall -y aiter || true +RUN git clone ${AITER_REPO} /app/aiter-test && \ + cd /app/aiter-test && \ + pip install -r requirements.txt && \ + git checkout ${AITER_COMMIT} && \ + git submodule sync && git submodule update --init --recursive && \ + MAX_JOBS=${MAX_JOBS} PREBUILD_KERNELS=${PREBUILD_KERNELS} GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop +RUN pip show amd-aiter || true + +# Install ATOM +RUN pip uninstall -y atom || true +RUN git clone ${ATOM_REPO} /app/ATOM && \ + cd /app/ATOM && \ + git checkout ${ATOM_COMMIT} && \ + pip install -e . +RUN pip show atom || true + +CMD ["/bin/bash"] diff --git a/docker/build_vllm_atom_oot.sh b/docker/build_vllm_atom_oot.sh new file mode 100644 index 000000000..fddfe7ab1 --- /dev/null +++ b/docker/build_vllm_atom_oot.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_vllm_atom_oot" +IMAGE_TAG="${IMAGE_TAG:-atom-vllm-oot}" +BASE_IMAGE="${BASE_IMAGE:-rocm/vllm-dev:nightly_main_20260118}" +ATOM_COMMIT="${ATOM_COMMIT:-HEAD}" +AITER_COMMIT="${AITER_COMMIT:-HEAD}" +BUILD_MORI="${BUILD_MORI:-0}" + +echo "========================================" +echo "Build vLLM + ATOM OOT image" +echo " Dockerfile : ${DOCKERFILE_PATH}" +echo " Image name : ${IMAGE_TAG}" +echo " Base image : ${BASE_IMAGE}" +echo " ATOM commit: ${ATOM_COMMIT}" +echo " AITER commit: ${AITER_COMMIT}" +echo " Build MORI : ${BUILD_MORI}" +echo "========================================" + +DOCKER_BUILDKIT=1 docker build \ + -f "${DOCKERFILE_PATH}" \ + -t "${IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${BASE_IMAGE}" \ + --build-arg "ATOM_COMMIT=${ATOM_COMMIT}" \ + --build-arg "AITER_COMMIT=${AITER_COMMIT}" \ + --build-arg "BUILD_MORI=${BUILD_MORI}" \ + "$@" \ + "${REPO_ROOT}" + +echo "Build finished: ${IMAGE_TAG}" From 8e0f507370b5362fd9eabe75fc401f14e2347e3b Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 13:18:17 +0800 Subject: [PATCH 10/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_For_OOT | 130 -------------------------------- docker/Dockerfile_vllm_atom_oot | 2 +- docker/build_oot_docker.sh | 74 ------------------ 3 files changed, 1 insertion(+), 205 deletions(-) delete mode 100644 docker/Dockerfile_For_OOT delete mode 100644 docker/build_oot_docker.sh diff --git a/docker/Dockerfile_For_OOT b/docker/Dockerfile_For_OOT deleted file mode 100644 index 5b964583e..000000000 --- a/docker/Dockerfile_For_OOT +++ /dev/null @@ -1,130 +0,0 @@ -ARG BASE_IMAGE="rocm/vllm-dev:base" -ARG GPU_ARCH="gfx942;gfx950" - -FROM ${BASE_IMAGE} - -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -ARG GPU_ARCH -ENV GPU_ARCH_LIST=${GPU_ARCH} -ENV PYTORCH_ROCM_ARCH=${GPU_ARCH} -ENV VLLM_TARGET_DEVICE=rocm -ENV TOKENIZERS_PARALLELISM=false -ENV SAFETENSORS_FAST_GPU=1 -ENV HIP_FORCE_DEV_KERNARG=1 - -# ---- vLLM build args ---- -ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" -ARG VLLM_COMMIT="main" - -# ---- ATOM build args ---- -ARG ATOM_REPO="https://github.com/ROCm/ATOM.git" -ARG ATOM_COMMIT="HEAD" -ARG AITER_REPO="https://github.com/ROCm/aiter.git" -ARG AITER_COMMIT="HEAD" -ARG MORI_COMMIT="b0dce4beebeb1f26c784eee17d5fd9785ee9447f" -ARG PREBUILD_KERNELS=1 -ARG MAX_JOBS=64 - -WORKDIR /app - -RUN apt-get update && apt --fix-broken install -y \ - && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ - git \ - cython3 \ - ibverbs-utils \ - openmpi-bin \ - libopenmpi-dev \ - libpci-dev \ - cmake \ - libdw1 \ - locales \ - && rm -rf /var/lib/apt/lists/* - -RUN python3 -m pip install --upgrade pip \ - && python3 -m pip install lm-eval[api] - -# --------------------------- -# Build and install vLLM (ROCm path) -# Matches upstream Dockerfile.rocm flow. -# --------------------------- -RUN git clone ${VLLM_REPO} /app/vllm \ - && cd /app/vllm \ - && git fetch --all --tags \ - && git checkout ${VLLM_COMMIT} \ - && python3 -m pip install -r requirements/rocm.txt \ - && python3 setup.py clean --all \ - && python3 setup.py bdist_wheel --dist-dir=dist \ - && (python3 -m pip uninstall -y vllm || true) \ - && python3 -m pip install dist/*.whl - -# Lock torch version after vLLM install so ATOM stage cannot change it. -RUN TORCH_VERSION="$(python3 -m pip show torch | sed -n 's/^Version: //p')" \ - && echo "torch==${TORCH_VERSION}" > /tmp/torch_pin.txt \ - && cat /tmp/torch_pin.txt - -# --------------------------- -# Install MORI -# --------------------------- -RUN git clone https://github.com/ROCm/mori.git /app/mori \ - && cd /app/mori \ - && git checkout ${MORI_COMMIT} \ - && python3 -m pip install -c /tmp/torch_pin.txt -r requirements-build.txt \ - && git submodule update --init --recursive \ - && python3 setup.py install - -# --------------------------- -# Update RCCL -# --------------------------- -ARG RCCL_REPO="https://github.com/ROCm/rccl.git" -ARG RCCL_BRANCH="29e1567b95e28823b0beb1a988adc587bfab5b4f" -ARG RCCL_SRC_DIR="/app/rccl" -RUN python3 -m pip install cmake \ - && git clone "${RCCL_REPO}" "${RCCL_SRC_DIR}" \ - && cd "${RCCL_SRC_DIR}" \ - && git checkout "${RCCL_BRANCH}" \ - && ./install.sh -p --amdgpu_targets=${GPU_ARCH_LIST} \ - && DEBIAN_FRONTEND=noninteractive dpkg -i --force-all ${RCCL_SRC_DIR}/build/release/*.deb \ - && rm -rf ${RCCL_SRC_DIR}/build - -# --------------------------- -# Update Triton -# --------------------------- -RUN (python3 -m pip uninstall -y triton || true) \ - && git clone --depth=1 --branch release/internal/3.5.x https://github.com/ROCm/triton.git /app/triton-test \ - && cd /app/triton-test \ - && python3 -m pip install -c /tmp/torch_pin.txt -r python/requirements.txt \ - && python3 -m pip install -c /tmp/torch_pin.txt filecheck \ - && MAX_JOBS=${MAX_JOBS} python3 -m pip --retries=10 --default-timeout=60 install . - -# --------------------------- -# Install AITER -# --------------------------- -RUN (python3 -m pip uninstall -y aiter || true) \ - && git clone ${AITER_REPO} /app/aiter-test \ - && cd /app/aiter-test \ - && git checkout ${AITER_COMMIT} \ - && python3 -m pip install -c /tmp/torch_pin.txt -r requirements.txt \ - && git submodule sync \ - && git submodule update --init --recursive \ - && MAX_JOBS=${MAX_JOBS} PREBUILD_KERNELS=${PREBUILD_KERNELS} GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop - -# --------------------------- -# Install ATOM (do not change torch from vLLM stage) -# --------------------------- -RUN (python3 -m pip uninstall -y atom || true) \ - && git clone ${ATOM_REPO} /app/ATOM \ - && cd /app/ATOM \ - && git checkout ${ATOM_COMMIT} \ - && python3 -m pip install -c /tmp/torch_pin.txt -e . - -RUN echo "ROCTRACER_MAX_EVENTS=10000000" > /app/libkineto.conf -ENV KINETO_CONFIG="/app/libkineto.conf" - -RUN python3 -m pip show torch || true \ - && python3 -m pip show vllm || true \ - && python3 -m pip show atom || true \ - && python3 -m pip show amd-aiter || true \ - && python3 -m pip show triton || true - -CMD ["/bin/bash"] diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index 2e1a9b0b3..2641d80ed 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -13,7 +13,7 @@ ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_COMMIT="HEAD" ARG MORI_COMMIT="b0dce4beebeb1f26c784eee17d5fd9785ee9447f" ARG BUILD_MORI=0 -ARG PREBUILD_KERNELS=1 +ARG PREBUILD_KERNELS=0 ARG MAX_JOBS=64 RUN pip install --upgrade pip diff --git a/docker/build_oot_docker.sh b/docker/build_oot_docker.sh deleted file mode 100644 index 3ce79b34f..000000000 --- a/docker/build_oot_docker.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" - -DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_For_OOT" -VLLM_REPO="${VLLM_REPO:-https://github.com/vllm-project/vllm.git}" -VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895" -VLLM_DOCKER_REF="${VLLM_DOCKER_REF:-${VLLM_COMMIT}}" -IMAGE_TAG="${IMAGE_TAG:-atom-vllm-oot}" -VLLM_ROCM_BASE_IMAGE="${VLLM_ROCM_BASE_IMAGE:-rocm/dev-ubuntu-22.04:7.0-complete}" -VLLM_BASE_IMAGE="${VLLM_BASE_IMAGE:-rocm/vllm-dev:base}" -BASE_IMAGE="${BASE_IMAGE:-${VLLM_BASE_IMAGE}}" -BUILD_VLLM_BASE="${BUILD_VLLM_BASE:-1}" - -echo "========================================" -echo "OOT Docker build config" -echo " Base image : ${BASE_IMAGE}" -echo " VLLM commit : ${VLLM_COMMIT}" -echo " Final image name : ${IMAGE_TAG}" -echo " Build vLLM base : ${BUILD_VLLM_BASE}" -echo "========================================" - -if [[ "${BUILD_VLLM_BASE}" == "1" ]]; then - VLLM_TMP_DIR="$(mktemp -d)" - trap 'rm -rf "${VLLM_TMP_DIR}"' EXIT - - echo "Step 1/2: build vLLM ROCm base image" - echo " vLLM repo : ${VLLM_REPO}" - echo " vLLM ref : ${VLLM_DOCKER_REF}" - echo " Base image : ${VLLM_ROCM_BASE_IMAGE}" - echo " Image tag : ${VLLM_BASE_IMAGE}" - - git clone "${VLLM_REPO}" "${VLLM_TMP_DIR}" - git -C "${VLLM_TMP_DIR}" checkout "${VLLM_DOCKER_REF}" - - VLLM_BASE_DOCKERFILE="${VLLM_TMP_DIR}/docker/Dockerfile.rocm_base" - if [[ ! -f "${VLLM_BASE_DOCKERFILE}" ]]; then - echo "ERROR: cannot find ${VLLM_BASE_DOCKERFILE}" - exit 1 - fi - - DOCKER_BUILDKIT=1 docker build \ - -f "${VLLM_BASE_DOCKERFILE}" \ - -t "${VLLM_BASE_IMAGE}" \ - --build-arg "BASE_IMAGE=${VLLM_ROCM_BASE_IMAGE}" \ - "${VLLM_TMP_DIR}" - - BASE_IMAGE="${VLLM_BASE_IMAGE}" -else - echo "Step 1/2: skip vLLM base build (use existing ${VLLM_BASE_IMAGE})" -fi - -echo "Step 2/2: build vLLM + ATOM OOT image" -echo " Dockerfile : ${DOCKERFILE_PATH}" -echo " Image tag : ${IMAGE_TAG}" -echo " Base image : ${BASE_IMAGE}" -echo " VLLM commit: ${VLLM_COMMIT}" - -DOCKER_BUILDKIT=1 docker build \ - -f "${DOCKERFILE_PATH}" \ - -t "${IMAGE_TAG}" \ - --build-arg "BASE_IMAGE=${BASE_IMAGE}" \ - --build-arg "VLLM_REPO=${VLLM_REPO}" \ - --build-arg "VLLM_COMMIT=${VLLM_COMMIT}" \ - "$@" \ - "${REPO_ROOT}" - -echo "Build finished." -echo " Final image name : ${IMAGE_TAG}" -echo " Base image used : ${BASE_IMAGE}" -echo " VLLM commit used : ${VLLM_COMMIT}" - From 727d4cc4fb883a74ed8622de2af554d430af15fd Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 16:18:19 +0800 Subject: [PATCH 11/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 129 +++++++++++++------------------- docker/build_vllm_atom_oot.sh | 60 ++++++++++----- 2 files changed, 95 insertions(+), 94 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index 2641d80ed..64d04cb07 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -1,88 +1,65 @@ -ARG BASE_IMAGE="rocm/vllm-dev:nightly_main_20260118" -ARG GPU_ARCH="gfx942;gfx950" - +ARG BASE_IMAGE="rocm/atom-dev:nightly_202603040155" FROM ${BASE_IMAGE} -ARG GPU_ARCH -ENV GPU_ARCH_LIST=${GPU_ARCH} -ENV PYTORCH_ROCM_ARCH=${GPU_ARCH} - -ARG ATOM_REPO="https://github.com/ROCm/ATOM.git" -ARG ATOM_COMMIT="HEAD" -ARG AITER_REPO="https://github.com/ROCm/aiter.git" -ARG AITER_COMMIT="HEAD" -ARG MORI_COMMIT="b0dce4beebeb1f26c784eee17d5fd9785ee9447f" -ARG BUILD_MORI=0 -ARG PREBUILD_KERNELS=0 +ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" +ARG VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895" ARG MAX_JOBS=64 +ARG INSTALL_LM_EVAL=1 + +ENV DEBIAN_FRONTEND=noninteractive +ENV MAX_JOBS=${MAX_JOBS} +WORKDIR /app -RUN pip install --upgrade pip -RUN pip install lm-eval[api] -RUN pip show lm-eval || true +# Keep toolchain utilities lightweight and deterministic. +RUN echo "========== [1/7] Prepare build utilities ==========" \ + && apt-get update \ + && apt-get install -y --no-install-recommends git ca-certificates \ + && rm -rf /var/lib/apt/lists/* -# Install MORI (Modular RDMA Interface) -RUN apt-get update && apt --fix-broken install -y -RUN apt-get update && apt-get install -y \ - git \ - cython3 \ - ibverbs-utils \ - openmpi-bin \ - libopenmpi-dev \ - libpci-dev \ - cmake \ - libdw1 \ - locales +# Do not rebuild or overwrite torch/triton/aiter from atom base image. +RUN echo "========== [2/7] Verify atom base core packages ==========" \ + && python3 -m pip --version \ + && python3 -m pip show torch || true \ + && python3 -m pip show triton || true \ + && python3 -m pip show amd-aiter || true -RUN if [ "${BUILD_MORI}" = "1" ]; then \ - git clone https://github.com/ROCm/mori.git /app/mori && \ - cd /app/mori && \ - git checkout ${MORI_COMMIT} && \ - pip install -r requirements-build.txt && \ - git submodule update --init --recursive && \ - python setup.py install && \ - pip show mori || true; \ - else \ - echo "Skip MORI build (BUILD_MORI=${BUILD_MORI})"; \ - fi +RUN echo "========== [3/7] Clone vLLM and checkout target commit ==========" \ + && git clone "${VLLM_REPO}" /app/vllm \ + && cd /app/vllm \ + && git checkout "${VLLM_COMMIT}" \ + && git submodule update --init --recursive \ + && echo "vLLM commit:" \ + && git rev-parse HEAD -# Update RCCL -ARG RCCL_REPO="https://github.com/ROCm/rccl.git" -ARG RCCL_BRANCH="29e1567b95e28823b0beb1a988adc587bfab5b4f" -ARG RCCL_SRC_DIR="/app/rccl/" -RUN pip install cmake -RUN git clone "${RCCL_REPO}" ${RCCL_SRC_DIR} \ - && cd ${RCCL_SRC_DIR} \ - && git checkout "${RCCL_BRANCH}" \ - && ./install.sh -p --amdgpu_targets=${GPU_ARCH_LIST} -RUN DEBIAN_FRONTEND=noninteractive dpkg -i --force-all ${RCCL_SRC_DIR}/build/release/*.deb && rm -rf ${RCCL_SRC_DIR}/build +# Install vLLM Python dependencies only (torch/triton/aiter remain from base image). +RUN echo "========== [4/7] Install vLLM ROCm Python requirements ==========" \ + && cd /app/vllm \ + && python3 -m pip install --upgrade pip \ + && python3 -m pip install -r requirements/rocm.txt -# Update Triton -RUN pip show triton || true -RUN pip uninstall -y triton || true -RUN git clone --depth=1 --branch release/internal/3.5.x https://github.com/ROCm/triton.git /triton-test && \ - cd /triton-test && \ - pip install -r python/requirements.txt && \ - pip install filecheck && \ - MAX_JOBS=${MAX_JOBS} pip --retries=10 --default-timeout=60 install . -RUN pip show triton || true +RUN echo "========== [5/7] Build vLLM wheel ==========" \ + && cd /app/vllm \ + && python3 setup.py clean --all \ + && MAX_JOBS="${MAX_JOBS}" python3 setup.py bdist_wheel --dist-dir=/tmp/vllm-wheels \ + && ls -lh /tmp/vllm-wheels -# Install AITER -RUN mkdir -p /app -RUN pip uninstall -y aiter || true -RUN git clone ${AITER_REPO} /app/aiter-test && \ - cd /app/aiter-test && \ - pip install -r requirements.txt && \ - git checkout ${AITER_COMMIT} && \ - git submodule sync && git submodule update --init --recursive && \ - MAX_JOBS=${MAX_JOBS} PREBUILD_KERNELS=${PREBUILD_KERNELS} GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop -RUN pip show amd-aiter || true +# Install vLLM wheel without dependency resolution to avoid overriding base torch/triton/aiter. +RUN echo "========== [6/7] Install vLLM wheel (no dependency override) ==========" \ + && python3 -m pip uninstall -y vllm || true \ + && python3 -m pip install --no-deps /tmp/vllm-wheels/*.whl \ + && python3 -m pip show vllm -# Install ATOM -RUN pip uninstall -y atom || true -RUN git clone ${ATOM_REPO} /app/ATOM && \ - cd /app/ATOM && \ - git checkout ${ATOM_COMMIT} && \ - pip install -e . -RUN pip show atom || true +RUN echo "========== [7/7] Optional utilities and final version print ==========" \ + && if [ "${INSTALL_LM_EVAL}" = "1" ]; then python3 -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \ + && python3 - <<'PY' \ +import importlib.metadata as m; \ +pkgs = ["vllm", "torch", "triton", "amd-aiter", "atom"]; \ +print("Final package versions:"); \ +for p in pkgs: \ + try: \ + print(f" {p}: {m.version(p)}"); \ + except Exception: \ + print(f" {p}: "); \ +PY CMD ["/bin/bash"] diff --git a/docker/build_vllm_atom_oot.sh b/docker/build_vllm_atom_oot.sh index fddfe7ab1..89bb1d51f 100644 --- a/docker/build_vllm_atom_oot.sh +++ b/docker/build_vllm_atom_oot.sh @@ -5,30 +5,54 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_vllm_atom_oot" -IMAGE_TAG="${IMAGE_TAG:-atom-vllm-oot}" -BASE_IMAGE="${BASE_IMAGE:-rocm/vllm-dev:nightly_main_20260118}" -ATOM_COMMIT="${ATOM_COMMIT:-HEAD}" -AITER_COMMIT="${AITER_COMMIT:-HEAD}" -BUILD_MORI="${BUILD_MORI:-0}" +IMAGE_TAG="${IMAGE_TAG:-rocm/atom-vllm-dev:nightly_202603040155}" +BASE_IMAGE="${BASE_IMAGE:-rocm/atom-dev:nightly_202603040155}" +VLLM_REPO="${VLLM_REPO:-https://github.com/vllm-project/vllm.git}" +VLLM_COMMIT="${VLLM_COMMIT:-f5d17400303149bbb480f6abfb6f7bb646c1d895}" +MAX_JOBS="${MAX_JOBS:-64}" +INSTALL_LM_EVAL="${INSTALL_LM_EVAL:-1}" +PULL_BASE_IMAGE="${PULL_BASE_IMAGE:-1}" -echo "========================================" -echo "Build vLLM + ATOM OOT image" -echo " Dockerfile : ${DOCKERFILE_PATH}" -echo " Image name : ${IMAGE_TAG}" -echo " Base image : ${BASE_IMAGE}" -echo " ATOM commit: ${ATOM_COMMIT}" -echo " AITER commit: ${AITER_COMMIT}" -echo " Build MORI : ${BUILD_MORI}" -echo "========================================" +print_banner() { + echo "============================================================" + echo "$1" + echo "============================================================" +} +print_banner "Build vLLM on top of ATOM base image" +echo "Dockerfile : ${DOCKERFILE_PATH}" +echo "Build context : ${REPO_ROOT}" +echo "Target image : ${IMAGE_TAG}" +echo "Base image : ${BASE_IMAGE}" +echo "vLLM repo : ${VLLM_REPO}" +echo "vLLM commit : ${VLLM_COMMIT}" +echo "MAX_JOBS : ${MAX_JOBS}" +echo "INSTALL_LM_EVAL : ${INSTALL_LM_EVAL}" +echo +echo "Build plan:" +echo " Step 1/3: (optional) pull base image" +echo " Step 2/3: build image from Dockerfile_vllm_atom_oot" +echo " Step 3/3: print final image info" +echo + +if [[ "${PULL_BASE_IMAGE}" == "1" ]]; then + print_banner "Step 1/3 - Pull base image: ${BASE_IMAGE}" + docker pull "${BASE_IMAGE}" +else + print_banner "Step 1/3 - Skip base image pull (PULL_BASE_IMAGE=${PULL_BASE_IMAGE})" +fi + +print_banner "Step 2/3 - Build target image: ${IMAGE_TAG}" DOCKER_BUILDKIT=1 docker build \ -f "${DOCKERFILE_PATH}" \ -t "${IMAGE_TAG}" \ --build-arg "BASE_IMAGE=${BASE_IMAGE}" \ - --build-arg "ATOM_COMMIT=${ATOM_COMMIT}" \ - --build-arg "AITER_COMMIT=${AITER_COMMIT}" \ - --build-arg "BUILD_MORI=${BUILD_MORI}" \ + --build-arg "VLLM_REPO=${VLLM_REPO}" \ + --build-arg "VLLM_COMMIT=${VLLM_COMMIT}" \ + --build-arg "MAX_JOBS=${MAX_JOBS}" \ + --build-arg "INSTALL_LM_EVAL=${INSTALL_LM_EVAL}" \ "$@" \ "${REPO_ROOT}" -echo "Build finished: ${IMAGE_TAG}" +print_banner "Step 3/3 - Build completed" +docker image inspect "${IMAGE_TAG}" --format 'Image={{.RepoTags}} ID={{.Id}} Created={{.Created}}' From 07aab5a91009579137a9cf1f9c94d8df4b3e5090 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 16:35:59 +0800 Subject: [PATCH 12/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 24 +++++++++++++----------- docker/build_vllm_atom_oot.sh | 7 +++++++ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index 64d04cb07..f0b7e7852 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -49,17 +49,19 @@ RUN echo "========== [6/7] Install vLLM wheel (no dependency override) ========= && python3 -m pip install --no-deps /tmp/vllm-wheels/*.whl \ && python3 -m pip show vllm -RUN echo "========== [7/7] Optional utilities and final version print ==========" \ - && if [ "${INSTALL_LM_EVAL}" = "1" ]; then python3 -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \ - && python3 - <<'PY' \ -import importlib.metadata as m; \ -pkgs = ["vllm", "torch", "triton", "amd-aiter", "atom"]; \ -print("Final package versions:"); \ -for p in pkgs: \ - try: \ - print(f" {p}: {m.version(p)}"); \ - except Exception: \ - print(f" {p}: "); \ +RUN echo "========== [7/7] Optional utilities ==========" \ + && if [ "${INSTALL_LM_EVAL}" = "1" ]; then python3 -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi + +RUN echo "========== [7/7] Final version print ==========" \ + && python3 - <<'PY' +import importlib.metadata as m +pkgs = ["vllm", "torch", "triton", "amd-aiter", "atom"] +print("Final package versions:") +for p in pkgs: + try: + print(f" {p}: {m.version(p)}") + except Exception: + print(f" {p}: ") PY CMD ["/bin/bash"] diff --git a/docker/build_vllm_atom_oot.sh b/docker/build_vllm_atom_oot.sh index 89bb1d51f..36d236622 100644 --- a/docker/build_vllm_atom_oot.sh +++ b/docker/build_vllm_atom_oot.sh @@ -3,6 +3,12 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" +LOG_DIR="${LOG_DIR:-${SCRIPT_DIR}/logs}" +LOG_FILE="${LOG_FILE:-${LOG_DIR}/build_vllm_atom_oot_$(date +%Y%m%d_%H%M%S).log}" + +mkdir -p "${LOG_DIR}" +# Mirror all stdout/stderr to terminal and log file. +exec > >(tee -a "${LOG_FILE}") 2>&1 DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_vllm_atom_oot" IMAGE_TAG="${IMAGE_TAG:-rocm/atom-vllm-dev:nightly_202603040155}" @@ -20,6 +26,7 @@ print_banner() { } print_banner "Build vLLM on top of ATOM base image" +echo "Log file : ${LOG_FILE}" echo "Dockerfile : ${DOCKERFILE_PATH}" echo "Build context : ${REPO_ROOT}" echo "Target image : ${IMAGE_TAG}" From 40a7379884320db374d8ca46bd5a74dcf6ace5b0 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 16:38:56 +0800 Subject: [PATCH 13/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index f0b7e7852..54b12f1d1 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -10,9 +10,10 @@ ENV DEBIAN_FRONTEND=noninteractive ENV MAX_JOBS=${MAX_JOBS} WORKDIR /app -# Keep toolchain utilities lightweight and deterministic. -RUN echo "========== [1/7] Prepare build utilities ==========" \ +# Repair apt dependency state first, then install required utilities. +RUN echo "========== [1/7] Fix apt dependencies and prepare build utilities ==========" \ && apt-get update \ + && apt --fix-broken install -y \ && apt-get install -y --no-install-recommends git ca-certificates \ && rm -rf /var/lib/apt/lists/* From 493c28d4a5940740ae556e918a4773c03524bc8c Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 16:52:33 +0800 Subject: [PATCH 14/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index 54b12f1d1..de9f1f9d1 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -8,6 +8,7 @@ ARG INSTALL_LM_EVAL=1 ENV DEBIAN_FRONTEND=noninteractive ENV MAX_JOBS=${MAX_JOBS} +ENV VLLM_TARGET_DEVICE=rocm WORKDIR /app # Repair apt dependency state first, then install required utilities. @@ -32,16 +33,35 @@ RUN echo "========== [3/7] Clone vLLM and checkout target commit ==========" \ && echo "vLLM commit:" \ && git rev-parse HEAD -# Install vLLM Python dependencies only (torch/triton/aiter remain from base image). +# Install vLLM Python dependencies while preserving atom base torch/triton/aiter. +RUN echo "========== [4/7] Build constraints from atom base packages ==========" \ + && python3 - <<'PY' +import importlib.metadata as m +from importlib.metadata import PackageNotFoundError + +pins = ["torch", "triton", "amd-aiter", "torchvision", "torchaudio"] +out = "/tmp/atom-base-constraints.txt" +with open(out, "w", encoding="utf-8") as f: + for pkg in pins: + try: + ver = m.version(pkg) + f.write(f"{pkg}=={ver}\n") + print(f"Pin {pkg}=={ver}") + except PackageNotFoundError: + pass +print(f"Constraints file: {out}") +PY + RUN echo "========== [4/7] Install vLLM ROCm Python requirements ==========" \ + && cat /tmp/atom-base-constraints.txt \ && cd /app/vllm \ && python3 -m pip install --upgrade pip \ - && python3 -m pip install -r requirements/rocm.txt + && python3 -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/rocm.txt RUN echo "========== [5/7] Build vLLM wheel ==========" \ && cd /app/vllm \ - && python3 setup.py clean --all \ - && MAX_JOBS="${MAX_JOBS}" python3 setup.py bdist_wheel --dist-dir=/tmp/vllm-wheels \ + && VLLM_TARGET_DEVICE=rocm python3 setup.py clean --all \ + && MAX_JOBS="${MAX_JOBS}" VLLM_TARGET_DEVICE=rocm python3 setup.py bdist_wheel --dist-dir=/tmp/vllm-wheels \ && ls -lh /tmp/vllm-wheels # Install vLLM wheel without dependency resolution to avoid overriding base torch/triton/aiter. From 91a8e5db1bbaa81d3bc620d7b356b587ceccc33c Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 16:57:43 +0800 Subject: [PATCH 15/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index de9f1f9d1..bf78e6b8c 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -52,11 +52,22 @@ with open(out, "w", encoding="utf-8") as f: print(f"Constraints file: {out}") PY -RUN echo "========== [4/7] Install vLLM ROCm Python requirements ==========" \ +RUN echo "========== [4/7] Install vLLM build-only requirements ==========" \ && cat /tmp/atom-base-constraints.txt \ && cd /app/vllm \ && python3 -m pip install --upgrade pip \ - && python3 -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/rocm.txt + && python3 -m pip install --upgrade-strategy only-if-needed \ + "cmake>=3.26.1" \ + "ninja" \ + "packaging>=24.2" \ + "setuptools>=77.0.3,<81.0.0" \ + "setuptools-scm>=8" \ + "wheel" \ + "jinja2>=3.1.6" \ + "regex" \ + "build" \ + "protobuf>=6.33.2" \ + "grpcio-tools>=1.76.0" RUN echo "========== [5/7] Build vLLM wheel ==========" \ && cd /app/vllm \ From df5d2539287da364b3605b3a6e1205ed65639bcd Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 17:47:19 +0800 Subject: [PATCH 16/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 16 +++++++++++++++- docker/build_vllm_atom_oot.sh | 26 +++++++++++++++++++------- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index bf78e6b8c..f23804c95 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -5,6 +5,7 @@ ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" ARG VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895" ARG MAX_JOBS=64 ARG INSTALL_LM_EVAL=1 +ARG INSTALL_FASTSAFETENSORS=1 ENV DEBIAN_FRONTEND=noninteractive ENV MAX_JOBS=${MAX_JOBS} @@ -81,8 +82,21 @@ RUN echo "========== [6/7] Install vLLM wheel (no dependency override) ========= && python3 -m pip install --no-deps /tmp/vllm-wheels/*.whl \ && python3 -m pip show vllm +# Install vLLM runtime Python packages without touching core stack versions. +RUN echo "========== [6.5/7] Install vLLM runtime requirements (no deps) ==========" \ + && cd /app/vllm \ + && python3 -m pip install --no-deps -r requirements/common.txt \ + && python3 -m pip install --no-deps -r requirements/rocm.txt \ + && if [ -d /opt/rocm/share/amd_smi ]; then \ + python3 -m pip install /opt/rocm/share/amd_smi; \ + else \ + python3 -m pip install amdsmi || true; \ + fi \ + && python3 -m pip show cbor2 gguf amdsmi || true + RUN echo "========== [7/7] Optional utilities ==========" \ - && if [ "${INSTALL_LM_EVAL}" = "1" ]; then python3 -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi + && if [ "${INSTALL_LM_EVAL}" = "1" ]; then python3 -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \ + && if [ "${INSTALL_FASTSAFETENSORS}" = "1" ]; then python3 -m pip install "git+https://github.com/foundation-model-stack/fastsafetensors.git"; else echo "Skip fastsafetensors install"; fi RUN echo "========== [7/7] Final version print ==========" \ && python3 - <<'PY' diff --git a/docker/build_vllm_atom_oot.sh b/docker/build_vllm_atom_oot.sh index 36d236622..e85ff68b9 100644 --- a/docker/build_vllm_atom_oot.sh +++ b/docker/build_vllm_atom_oot.sh @@ -37,19 +37,31 @@ echo "MAX_JOBS : ${MAX_JOBS}" echo "INSTALL_LM_EVAL : ${INSTALL_LM_EVAL}" echo echo "Build plan:" -echo " Step 1/3: (optional) pull base image" -echo " Step 2/3: build image from Dockerfile_vllm_atom_oot" -echo " Step 3/3: print final image info" +echo " Step 1/4: (optional) pull base image" +echo " Step 2/4: check/remove existing target image" +echo " Step 3/4: build image from Dockerfile_vllm_atom_oot" +echo " Step 4/4: print final image info" echo if [[ "${PULL_BASE_IMAGE}" == "1" ]]; then - print_banner "Step 1/3 - Pull base image: ${BASE_IMAGE}" + print_banner "Step 1/4 - Pull base image: ${BASE_IMAGE}" docker pull "${BASE_IMAGE}" else - print_banner "Step 1/3 - Skip base image pull (PULL_BASE_IMAGE=${PULL_BASE_IMAGE})" + print_banner "Step 1/4 - Skip base image pull (PULL_BASE_IMAGE=${PULL_BASE_IMAGE})" fi -print_banner "Step 2/3 - Build target image: ${IMAGE_TAG}" +print_banner "Step 2/4 - Check whether target image already exists" +if docker image inspect "${IMAGE_TAG}" >/dev/null 2>&1; then + echo "Target image already exists: ${IMAGE_TAG}" + docker image inspect "${IMAGE_TAG}" --format 'Existing image -> ID={{.Id}} Created={{.Created}}' + echo "Removing existing target image: ${IMAGE_TAG}" + docker image rm -f "${IMAGE_TAG}" +else + echo "Target image does not exist yet: ${IMAGE_TAG}" +fi +echo + +print_banner "Step 3/4 - Build target image: ${IMAGE_TAG}" DOCKER_BUILDKIT=1 docker build \ -f "${DOCKERFILE_PATH}" \ -t "${IMAGE_TAG}" \ @@ -61,5 +73,5 @@ DOCKER_BUILDKIT=1 docker build \ "$@" \ "${REPO_ROOT}" -print_banner "Step 3/3 - Build completed" +print_banner "Step 4/4 - Build completed" docker image inspect "${IMAGE_TAG}" --format 'Image={{.RepoTags}} ID={{.Id}} Created={{.Created}}' From ad088148fde18ff6a4edf4a96df24a50e5ebb1b1 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 18:10:01 +0800 Subject: [PATCH 17/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index f23804c95..fd65126e7 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -34,13 +34,14 @@ RUN echo "========== [3/7] Clone vLLM and checkout target commit ==========" \ && echo "vLLM commit:" \ && git rev-parse HEAD -# Install vLLM Python dependencies while preserving atom base torch/triton/aiter. +# Install vLLM Python dependencies while preserving atom base core stack. RUN echo "========== [4/7] Build constraints from atom base packages ==========" \ && python3 - <<'PY' import importlib.metadata as m from importlib.metadata import PackageNotFoundError -pins = ["torch", "triton", "amd-aiter", "torchvision", "torchaudio"] +# Keep only core packages pinned. Let pip freely resolve non-core runtime deps. +pins = ["torch", "amd-aiter"] out = "/tmp/atom-base-constraints.txt" with open(out, "w", encoding="utf-8") as f: for pkg in pins: @@ -83,10 +84,10 @@ RUN echo "========== [6/7] Install vLLM wheel (no dependency override) ========= && python3 -m pip show vllm # Install vLLM runtime Python packages without touching core stack versions. -RUN echo "========== [6.5/7] Install vLLM runtime requirements (no deps) ==========" \ +RUN echo "========== [6.5/7] Install vLLM runtime requirements ==========" \ && cd /app/vllm \ - && python3 -m pip install --no-deps -r requirements/common.txt \ - && python3 -m pip install --no-deps -r requirements/rocm.txt \ + && python3 -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/common.txt \ + && python3 -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/rocm.txt \ && if [ -d /opt/rocm/share/amd_smi ]; then \ python3 -m pip install /opt/rocm/share/amd_smi; \ else \ @@ -98,6 +99,13 @@ RUN echo "========== [7/7] Optional utilities ==========" \ && if [ "${INSTALL_LM_EVAL}" = "1" ]; then python3 -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \ && if [ "${INSTALL_FASTSAFETENSORS}" = "1" ]; then python3 -m pip install "git+https://github.com/foundation-model-stack/fastsafetensors.git"; else echo "Skip fastsafetensors install"; fi +RUN echo "========== [7/7] Re-align critical non-torch dependencies ==========" \ + && python3 -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt \ + "transformers>=4.56.0,<5" \ + "huggingface-hub>=0.34.0,<1.0" \ + "tokenizers>=0.21.1" \ + && python3 -m pip show transformers huggingface-hub tokenizers || true + RUN echo "========== [7/7] Final version print ==========" \ && python3 - <<'PY' import importlib.metadata as m From 42e4ddc9caca0930c25f39fd0fd85dc7af3573c0 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 18:25:03 +0800 Subject: [PATCH 18/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 48 ++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index fd65126e7..215f89e58 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -6,10 +6,12 @@ ARG VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895" ARG MAX_JOBS=64 ARG INSTALL_LM_EVAL=1 ARG INSTALL_FASTSAFETENSORS=1 +ARG VENV_PYTHON="/opt/venv/bin/python" ENV DEBIAN_FRONTEND=noninteractive ENV MAX_JOBS=${MAX_JOBS} ENV VLLM_TARGET_DEVICE=rocm +ENV PATH="/opt/venv/bin:${PATH}" WORKDIR /app # Repair apt dependency state first, then install required utilities. @@ -21,10 +23,12 @@ RUN echo "========== [1/7] Fix apt dependencies and prepare build utilities ==== # Do not rebuild or overwrite torch/triton/aiter from atom base image. RUN echo "========== [2/7] Verify atom base core packages ==========" \ - && python3 -m pip --version \ - && python3 -m pip show torch || true \ - && python3 -m pip show triton || true \ - && python3 -m pip show amd-aiter || true + && which python \ + && which pip \ + && "${VENV_PYTHON}" -m pip --version \ + && "${VENV_PYTHON}" -m pip show torch || true \ + && "${VENV_PYTHON}" -m pip show triton || true \ + && "${VENV_PYTHON}" -m pip show amd-aiter || true RUN echo "========== [3/7] Clone vLLM and checkout target commit ==========" \ && git clone "${VLLM_REPO}" /app/vllm \ @@ -36,7 +40,7 @@ RUN echo "========== [3/7] Clone vLLM and checkout target commit ==========" \ # Install vLLM Python dependencies while preserving atom base core stack. RUN echo "========== [4/7] Build constraints from atom base packages ==========" \ - && python3 - <<'PY' + && "${VENV_PYTHON}" - <<'PY' import importlib.metadata as m from importlib.metadata import PackageNotFoundError @@ -57,8 +61,8 @@ PY RUN echo "========== [4/7] Install vLLM build-only requirements ==========" \ && cat /tmp/atom-base-constraints.txt \ && cd /app/vllm \ - && python3 -m pip install --upgrade pip \ - && python3 -m pip install --upgrade-strategy only-if-needed \ + && "${VENV_PYTHON}" -m pip install --upgrade pip \ + && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed \ "cmake>=3.26.1" \ "ninja" \ "packaging>=24.2" \ @@ -73,41 +77,41 @@ RUN echo "========== [4/7] Install vLLM build-only requirements ==========" \ RUN echo "========== [5/7] Build vLLM wheel ==========" \ && cd /app/vllm \ - && VLLM_TARGET_DEVICE=rocm python3 setup.py clean --all \ - && MAX_JOBS="${MAX_JOBS}" VLLM_TARGET_DEVICE=rocm python3 setup.py bdist_wheel --dist-dir=/tmp/vllm-wheels \ + && VLLM_TARGET_DEVICE=rocm "${VENV_PYTHON}" setup.py clean --all \ + && MAX_JOBS="${MAX_JOBS}" VLLM_TARGET_DEVICE=rocm "${VENV_PYTHON}" setup.py bdist_wheel --dist-dir=/tmp/vllm-wheels \ && ls -lh /tmp/vllm-wheels # Install vLLM wheel without dependency resolution to avoid overriding base torch/triton/aiter. RUN echo "========== [6/7] Install vLLM wheel (no dependency override) ==========" \ - && python3 -m pip uninstall -y vllm || true \ - && python3 -m pip install --no-deps /tmp/vllm-wheels/*.whl \ - && python3 -m pip show vllm + && "${VENV_PYTHON}" -m pip uninstall -y vllm || true \ + && "${VENV_PYTHON}" -m pip install --no-deps /tmp/vllm-wheels/*.whl \ + && "${VENV_PYTHON}" -m pip show vllm # Install vLLM runtime Python packages without touching core stack versions. RUN echo "========== [6.5/7] Install vLLM runtime requirements ==========" \ && cd /app/vllm \ - && python3 -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/common.txt \ - && python3 -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/rocm.txt \ + && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/common.txt \ + && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/rocm.txt \ && if [ -d /opt/rocm/share/amd_smi ]; then \ - python3 -m pip install /opt/rocm/share/amd_smi; \ + "${VENV_PYTHON}" -m pip install /opt/rocm/share/amd_smi; \ else \ - python3 -m pip install amdsmi || true; \ + "${VENV_PYTHON}" -m pip install amdsmi || true; \ fi \ - && python3 -m pip show cbor2 gguf amdsmi || true + && "${VENV_PYTHON}" -m pip show cbor2 gguf amdsmi || true RUN echo "========== [7/7] Optional utilities ==========" \ - && if [ "${INSTALL_LM_EVAL}" = "1" ]; then python3 -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \ - && if [ "${INSTALL_FASTSAFETENSORS}" = "1" ]; then python3 -m pip install "git+https://github.com/foundation-model-stack/fastsafetensors.git"; else echo "Skip fastsafetensors install"; fi + && if [ "${INSTALL_LM_EVAL}" = "1" ]; then "${VENV_PYTHON}" -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \ + && if [ "${INSTALL_FASTSAFETENSORS}" = "1" ]; then "${VENV_PYTHON}" -m pip install "git+https://github.com/foundation-model-stack/fastsafetensors.git"; else echo "Skip fastsafetensors install"; fi RUN echo "========== [7/7] Re-align critical non-torch dependencies ==========" \ - && python3 -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt \ + && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt \ "transformers>=4.56.0,<5" \ "huggingface-hub>=0.34.0,<1.0" \ "tokenizers>=0.21.1" \ - && python3 -m pip show transformers huggingface-hub tokenizers || true + && "${VENV_PYTHON}" -m pip show transformers huggingface-hub tokenizers || true RUN echo "========== [7/7] Final version print ==========" \ - && python3 - <<'PY' + && "${VENV_PYTHON}" - <<'PY' import importlib.metadata as m pkgs = ["vllm", "torch", "triton", "amd-aiter", "atom"] print("Final package versions:") From f0d36975418c18507251d21af0aedde18e862d1b Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 18:45:26 +0800 Subject: [PATCH 19/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 5 +++-- docker/build_vllm_atom_oot.sh | 8 ++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index 215f89e58..b8df64743 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -95,9 +95,10 @@ RUN echo "========== [6.5/7] Install vLLM runtime requirements ==========" \ && if [ -d /opt/rocm/share/amd_smi ]; then \ "${VENV_PYTHON}" -m pip install /opt/rocm/share/amd_smi; \ else \ - "${VENV_PYTHON}" -m pip install amdsmi || true; \ + "${VENV_PYTHON}" -m pip install amdsmi; \ fi \ - && "${VENV_PYTHON}" -m pip show cbor2 gguf amdsmi || true + && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed cbor2 gguf amdsmi \ + && "${VENV_PYTHON}" -m pip show cbor2 gguf amdsmi RUN echo "========== [7/7] Optional utilities ==========" \ && if [ "${INSTALL_LM_EVAL}" = "1" ]; then "${VENV_PYTHON}" -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \ diff --git a/docker/build_vllm_atom_oot.sh b/docker/build_vllm_atom_oot.sh index e85ff68b9..d80869d94 100644 --- a/docker/build_vllm_atom_oot.sh +++ b/docker/build_vllm_atom_oot.sh @@ -18,6 +18,7 @@ VLLM_COMMIT="${VLLM_COMMIT:-f5d17400303149bbb480f6abfb6f7bb646c1d895}" MAX_JOBS="${MAX_JOBS:-64}" INSTALL_LM_EVAL="${INSTALL_LM_EVAL:-1}" PULL_BASE_IMAGE="${PULL_BASE_IMAGE:-1}" +BUILD_NO_CACHE="${BUILD_NO_CACHE:-1}" print_banner() { echo "============================================================" @@ -35,6 +36,7 @@ echo "vLLM repo : ${VLLM_REPO}" echo "vLLM commit : ${VLLM_COMMIT}" echo "MAX_JOBS : ${MAX_JOBS}" echo "INSTALL_LM_EVAL : ${INSTALL_LM_EVAL}" +echo "BUILD_NO_CACHE : ${BUILD_NO_CACHE}" echo echo "Build plan:" echo " Step 1/4: (optional) pull base image" @@ -62,7 +64,13 @@ fi echo print_banner "Step 3/4 - Build target image: ${IMAGE_TAG}" +NO_CACHE_FLAG="" +if [[ "${BUILD_NO_CACHE}" == "1" ]]; then + NO_CACHE_FLAG="--no-cache" +fi + DOCKER_BUILDKIT=1 docker build \ + ${NO_CACHE_FLAG} \ -f "${DOCKERFILE_PATH}" \ -t "${IMAGE_TAG}" \ --build-arg "BASE_IMAGE=${BASE_IMAGE}" \ From ef0359691e8ca4c028f5c7f620fe4b31a939144c Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 19:02:01 +0800 Subject: [PATCH 20/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index b8df64743..42121b411 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -90,6 +90,8 @@ RUN echo "========== [6/7] Install vLLM wheel (no dependency override) ========= # Install vLLM runtime Python packages without touching core stack versions. RUN echo "========== [6.5/7] Install vLLM runtime requirements ==========" \ && cd /app/vllm \ + && "${VENV_PYTHON}" -m pip uninstall -y triton || true \ + && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed "triton==3.5.1" \ && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/common.txt \ && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/rocm.txt \ && if [ -d /opt/rocm/share/amd_smi ]; then \ From 0528ecf90b35bbeed3497a9b73eeeeecf1277646 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 19:41:09 +0800 Subject: [PATCH 21/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index 42121b411..54d03d06a 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -90,9 +90,9 @@ RUN echo "========== [6/7] Install vLLM wheel (no dependency override) ========= # Install vLLM runtime Python packages without touching core stack versions. RUN echo "========== [6.5/7] Install vLLM runtime requirements ==========" \ && cd /app/vllm \ - && "${VENV_PYTHON}" -m pip uninstall -y triton || true \ - && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed "triton==3.5.1" \ - && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/common.txt \ + && sed '/^xgrammar[[:space:]]*==/d' requirements/common.txt > /tmp/requirements.common.no_xgrammar.txt \ + && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r /tmp/requirements.common.no_xgrammar.txt \ + && "${VENV_PYTHON}" -m pip install --no-deps "xgrammar==0.1.29" \ && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/rocm.txt \ && if [ -d /opt/rocm/share/amd_smi ]; then \ "${VENV_PYTHON}" -m pip install /opt/rocm/share/amd_smi; \ From f38053fc22a7f4aac3f1677c0ad82dbbe6c598bf Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 20:58:55 +0800 Subject: [PATCH 22/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index 54d03d06a..ab8901501 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -91,15 +91,15 @@ RUN echo "========== [6/7] Install vLLM wheel (no dependency override) ========= RUN echo "========== [6.5/7] Install vLLM runtime requirements ==========" \ && cd /app/vllm \ && sed '/^xgrammar[[:space:]]*==/d' requirements/common.txt > /tmp/requirements.common.no_xgrammar.txt \ - && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r /tmp/requirements.common.no_xgrammar.txt \ + && "${VENV_PYTHON}" -m pip install --no-deps -r /tmp/requirements.common.no_xgrammar.txt \ && "${VENV_PYTHON}" -m pip install --no-deps "xgrammar==0.1.29" \ - && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt -r requirements/rocm.txt \ + && "${VENV_PYTHON}" -m pip install --no-deps -r requirements/rocm.txt \ && if [ -d /opt/rocm/share/amd_smi ]; then \ - "${VENV_PYTHON}" -m pip install /opt/rocm/share/amd_smi; \ + "${VENV_PYTHON}" -m pip install --no-deps /opt/rocm/share/amd_smi; \ else \ - "${VENV_PYTHON}" -m pip install amdsmi; \ + "${VENV_PYTHON}" -m pip install --no-deps amdsmi; \ fi \ - && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed cbor2 gguf amdsmi \ + && "${VENV_PYTHON}" -m pip install --no-deps cbor2 gguf amdsmi \ && "${VENV_PYTHON}" -m pip show cbor2 gguf amdsmi RUN echo "========== [7/7] Optional utilities ==========" \ From 38d34de7002cb5bdcf2a56d16f816357448bba08 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 21:17:43 +0800 Subject: [PATCH 23/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index ab8901501..ef3981aff 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -94,13 +94,19 @@ RUN echo "========== [6.5/7] Install vLLM runtime requirements ==========" \ && "${VENV_PYTHON}" -m pip install --no-deps -r /tmp/requirements.common.no_xgrammar.txt \ && "${VENV_PYTHON}" -m pip install --no-deps "xgrammar==0.1.29" \ && "${VENV_PYTHON}" -m pip install --no-deps -r requirements/rocm.txt \ + && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed \ + "openai>=1.99.1" \ + "anthropic>=0.71.0" \ + "fastapi[standard]>=0.115.0" \ + "aiohttp" \ + "runai-model-streamer[s3,gcs]==0.15.3" \ && if [ -d /opt/rocm/share/amd_smi ]; then \ "${VENV_PYTHON}" -m pip install --no-deps /opt/rocm/share/amd_smi; \ else \ "${VENV_PYTHON}" -m pip install --no-deps amdsmi; \ fi \ - && "${VENV_PYTHON}" -m pip install --no-deps cbor2 gguf amdsmi \ - && "${VENV_PYTHON}" -m pip show cbor2 gguf amdsmi + && "${VENV_PYTHON}" -m pip install --no-deps cbor2 gguf amdsmi uvloop httptools websockets \ + && "${VENV_PYTHON}" -m pip show cbor2 gguf amdsmi uvloop RUN echo "========== [7/7] Optional utilities ==========" \ && if [ "${INSTALL_LM_EVAL}" = "1" ]; then "${VENV_PYTHON}" -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \ From 2110a7a1614d989809c6415feea9ffbe368c17b6 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 22:06:37 +0800 Subject: [PATCH 24/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index ef3981aff..b6a88b29c 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -94,7 +94,7 @@ RUN echo "========== [6.5/7] Install vLLM runtime requirements ==========" \ && "${VENV_PYTHON}" -m pip install --no-deps -r /tmp/requirements.common.no_xgrammar.txt \ && "${VENV_PYTHON}" -m pip install --no-deps "xgrammar==0.1.29" \ && "${VENV_PYTHON}" -m pip install --no-deps -r requirements/rocm.txt \ - && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed \ + && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt \ "openai>=1.99.1" \ "anthropic>=0.71.0" \ "fastapi[standard]>=0.115.0" \ From b5ab957729dd0f0942d2b43cb267eeed9cb7e5f7 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Thu, 5 Mar 2026 22:25:15 +0800 Subject: [PATCH 25/25] add Signed-off-by: zejunchen-zejun --- docker/Dockerfile_vllm_atom_oot | 127 +++++++++++--------------------- 1 file changed, 43 insertions(+), 84 deletions(-) diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot index b6a88b29c..4bf2cae08 100644 --- a/docker/Dockerfile_vllm_atom_oot +++ b/docker/Dockerfile_vllm_atom_oot @@ -9,28 +9,24 @@ ARG INSTALL_FASTSAFETENSORS=1 ARG VENV_PYTHON="/opt/venv/bin/python" ENV DEBIAN_FRONTEND=noninteractive -ENV MAX_JOBS=${MAX_JOBS} -ENV VLLM_TARGET_DEVICE=rocm ENV PATH="/opt/venv/bin:${PATH}" +ENV VLLM_TARGET_DEVICE=rocm +ENV MAX_JOBS=${MAX_JOBS} +ENV LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/torch/lib:${LD_LIBRARY_PATH}" WORKDIR /app -# Repair apt dependency state first, then install required utilities. -RUN echo "========== [1/7] Fix apt dependencies and prepare build utilities ==========" \ +RUN echo "========== [1/7] Prepare build tools ==========" \ && apt-get update \ && apt --fix-broken install -y \ && apt-get install -y --no-install-recommends git ca-certificates \ && rm -rf /var/lib/apt/lists/* -# Do not rebuild or overwrite torch/triton/aiter from atom base image. -RUN echo "========== [2/7] Verify atom base core packages ==========" \ - && which python \ - && which pip \ - && "${VENV_PYTHON}" -m pip --version \ - && "${VENV_PYTHON}" -m pip show torch || true \ - && "${VENV_PYTHON}" -m pip show triton || true \ - && "${VENV_PYTHON}" -m pip show amd-aiter || true +RUN echo "========== [2/7] Verify base packages (atom/aiter/mori) ==========" \ + && "${VENV_PYTHON}" -m pip show atom || true \ + && "${VENV_PYTHON}" -m pip show amd-aiter || true \ + && "${VENV_PYTHON}" -m pip show mori || true -RUN echo "========== [3/7] Clone vLLM and checkout target commit ==========" \ +RUN echo "========== [3/7] Clone vLLM ==========" \ && git clone "${VLLM_REPO}" /app/vllm \ && cd /app/vllm \ && git checkout "${VLLM_COMMIT}" \ @@ -38,42 +34,12 @@ RUN echo "========== [3/7] Clone vLLM and checkout target commit ==========" \ && echo "vLLM commit:" \ && git rev-parse HEAD -# Install vLLM Python dependencies while preserving atom base core stack. -RUN echo "========== [4/7] Build constraints from atom base packages ==========" \ - && "${VENV_PYTHON}" - <<'PY' -import importlib.metadata as m -from importlib.metadata import PackageNotFoundError - -# Keep only core packages pinned. Let pip freely resolve non-core runtime deps. -pins = ["torch", "amd-aiter"] -out = "/tmp/atom-base-constraints.txt" -with open(out, "w", encoding="utf-8") as f: - for pkg in pins: - try: - ver = m.version(pkg) - f.write(f"{pkg}=={ver}\n") - print(f"Pin {pkg}=={ver}") - except PackageNotFoundError: - pass -print(f"Constraints file: {out}") -PY - -RUN echo "========== [4/7] Install vLLM build-only requirements ==========" \ - && cat /tmp/atom-base-constraints.txt \ +# Follow vLLM ROCm standard: allow torch/triton override. +RUN echo "========== [4/7] Install vLLM ROCm build dependencies ==========" \ && cd /app/vllm \ && "${VENV_PYTHON}" -m pip install --upgrade pip \ - && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed \ - "cmake>=3.26.1" \ - "ninja" \ - "packaging>=24.2" \ - "setuptools>=77.0.3,<81.0.0" \ - "setuptools-scm>=8" \ - "wheel" \ - "jinja2>=3.1.6" \ - "regex" \ - "build" \ - "protobuf>=6.33.2" \ - "grpcio-tools>=1.76.0" + && "${VENV_PYTHON}" -m pip uninstall -y torch triton torchvision torchaudio || true \ + && "${VENV_PYTHON}" -m pip install -r requirements/rocm-build.txt RUN echo "========== [5/7] Build vLLM wheel ==========" \ && cd /app/vllm \ @@ -81,48 +47,41 @@ RUN echo "========== [5/7] Build vLLM wheel ==========" \ && MAX_JOBS="${MAX_JOBS}" VLLM_TARGET_DEVICE=rocm "${VENV_PYTHON}" setup.py bdist_wheel --dist-dir=/tmp/vllm-wheels \ && ls -lh /tmp/vllm-wheels -# Install vLLM wheel without dependency resolution to avoid overriding base torch/triton/aiter. -RUN echo "========== [6/7] Install vLLM wheel (no dependency override) ==========" \ - && "${VENV_PYTHON}" -m pip uninstall -y vllm || true \ - && "${VENV_PYTHON}" -m pip install --no-deps /tmp/vllm-wheels/*.whl \ - && "${VENV_PYTHON}" -m pip show vllm - -# Install vLLM runtime Python packages without touching core stack versions. -RUN echo "========== [6.5/7] Install vLLM runtime requirements ==========" \ +RUN echo "========== [6/7] Install vLLM runtime dependencies ==========" \ && cd /app/vllm \ - && sed '/^xgrammar[[:space:]]*==/d' requirements/common.txt > /tmp/requirements.common.no_xgrammar.txt \ - && "${VENV_PYTHON}" -m pip install --no-deps -r /tmp/requirements.common.no_xgrammar.txt \ - && "${VENV_PYTHON}" -m pip install --no-deps "xgrammar==0.1.29" \ - && "${VENV_PYTHON}" -m pip install --no-deps -r requirements/rocm.txt \ - && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt \ - "openai>=1.99.1" \ - "anthropic>=0.71.0" \ - "fastapi[standard]>=0.115.0" \ - "aiohttp" \ - "runai-model-streamer[s3,gcs]==0.15.3" \ - && if [ -d /opt/rocm/share/amd_smi ]; then \ - "${VENV_PYTHON}" -m pip install --no-deps /opt/rocm/share/amd_smi; \ - else \ - "${VENV_PYTHON}" -m pip install --no-deps amdsmi; \ - fi \ - && "${VENV_PYTHON}" -m pip install --no-deps cbor2 gguf amdsmi uvloop httptools websockets \ - && "${VENV_PYTHON}" -m pip show cbor2 gguf amdsmi uvloop + && "${VENV_PYTHON}" -m pip install -r requirements/rocm.txt \ + && "${VENV_PYTHON}" -m pip uninstall -y vllm || true \ + && "${VENV_PYTHON}" -m pip install /tmp/vllm-wheels/*.whl \ + && "${VENV_PYTHON}" -m pip install uvloop -RUN echo "========== [7/7] Optional utilities ==========" \ +RUN echo "========== [7/7] Optional tools and final checks ==========" \ && if [ "${INSTALL_LM_EVAL}" = "1" ]; then "${VENV_PYTHON}" -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \ - && if [ "${INSTALL_FASTSAFETENSORS}" = "1" ]; then "${VENV_PYTHON}" -m pip install "git+https://github.com/foundation-model-stack/fastsafetensors.git"; else echo "Skip fastsafetensors install"; fi - -RUN echo "========== [7/7] Re-align critical non-torch dependencies ==========" \ - && "${VENV_PYTHON}" -m pip install --upgrade-strategy only-if-needed -c /tmp/atom-base-constraints.txt \ - "transformers>=4.56.0,<5" \ - "huggingface-hub>=0.34.0,<1.0" \ - "tokenizers>=0.21.1" \ - && "${VENV_PYTHON}" -m pip show transformers huggingface-hub tokenizers || true - -RUN echo "========== [7/7] Final version print ==========" \ + && if [ "${INSTALL_FASTSAFETENSORS}" = "1" ]; then "${VENV_PYTHON}" -m pip install "git+https://github.com/foundation-model-stack/fastsafetensors.git"; else echo "Skip fastsafetensors install"; fi \ && "${VENV_PYTHON}" - <<'PY' import importlib.metadata as m -pkgs = ["vllm", "torch", "triton", "amd-aiter", "atom"] +import glob +import os +import torch + +print(f"torch.version.hip: {torch.version.hip}") +print(f"torch.version.cuda: {torch.version.cuda}") +torch_lib_dir = os.path.join(os.path.dirname(torch.__file__), "lib") +print(f"torch lib dir: {torch_lib_dir}") +print(f"libtorch_hip candidates: {glob.glob(os.path.join(torch_lib_dir, 'libtorch_hip.so*'))}") +if torch.version.hip is None: + raise RuntimeError("Torch is not ROCm build (torch.version.hip is None).") + +pkgs = [ + "vllm", + "torch", + "triton", + "torchvision", + "torchaudio", + "amdsmi", + "amd-aiter", + "atom", + "mori", +] print("Final package versions:") for p in pkgs: try: