Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions docker/Dockerfile_vllm_atom_oot
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Image that builds vLLM from source (ROCm target) on top of an ATOM base image.
# BASE_IMAGE is declared before FROM so the base can be replaced with --build-arg.
ARG BASE_IMAGE="rocm/atom-dev:nightly_202603040155"
FROM ${BASE_IMAGE}

# Build-time settings; each can be overridden with --build-arg.
#   VLLM_REPO / VLLM_COMMIT  repository that is cloned and the commit that is checked out
#   MAX_JOBS                 re-exported as an environment variable below and passed to the wheel build
#   INSTALL_LM_EVAL          "1" installs lm-eval[api] in the last step, any other value skips it
#   INSTALL_FASTSAFETENSORS  "1" installs fastsafetensors from GitHub, any other value skips it
#   VENV_PYTHON              interpreter used for every python/pip call in this file
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895"
ARG MAX_JOBS=64
ARG INSTALL_LM_EVAL=1
ARG INSTALL_FASTSAFETENSORS=1
ARG VENV_PYTHON="/opt/venv/bin/python"

# Keep apt from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive
# Put the virtualenv's executables ahead of everything else on PATH.
ENV PATH="/opt/venv/bin:${PATH}"
ENV VLLM_TARGET_DEVICE=rocm
ENV MAX_JOBS=${MAX_JOBS}
# Make the shared libraries bundled with torch visible to the dynamic loader.
# NOTE(review): the path hard-codes python3.12 - confirm it matches the venv in the base image.
ENV LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/torch/lib:${LD_LIBRARY_PATH}"
WORKDIR /app

# Step 1: install the tools needed to fetch the vLLM sources (git and CA certificates).
# apt-get is used for every call: the "apt" front end is intended for interactive
# use and warns that its command-line interface is not stable for scripts.
# The package index is deleted at the end so it does not stay in the layer.
RUN echo "========== [1/7] Prepare build tools ==========" \
    && apt-get update \
    && apt-get --fix-broken install -y \
    && apt-get install -y --no-install-recommends git ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Step 2: print pip metadata for the packages expected from the base image.
# This is informational only: a package that is missing does not fail the build.
RUN echo "========== [2/7] Verify base packages (atom/aiter/mori) ==========" \
    && for base_pkg in atom amd-aiter mori; do \
         "${VENV_PYTHON}" -m pip show "${base_pkg}" || true; \
       done

# Step 3: fetch vLLM into /app/vllm, switch to the requested commit, pull its
# submodules and print the commit that ended up checked out.
RUN echo "========== [3/7] Clone vLLM ==========" \
    && git clone "${VLLM_REPO}" /app/vllm \
    && git -C /app/vllm checkout "${VLLM_COMMIT}" \
    && git -C /app/vllm submodule update --init --recursive \
    && echo "vLLM commit:" \
    && git -C /app/vllm rev-parse HEAD

# Step 4: follow the vLLM ROCm standard and allow torch/triton to be overridden:
# remove the torch stack that is already installed, then install the packages
# listed in requirements/rocm-build.txt.
# Only the uninstall is best-effort. "&&" and "||" have equal precedence and
# associate to the left, so an ungrouped "|| true" would also hide a failure of
# the preceding cd or pip upgrade; the braces confine it to the uninstall.
RUN echo "========== [4/7] Install vLLM ROCm build dependencies ==========" \
    && cd /app/vllm \
    && "${VENV_PYTHON}" -m pip install --upgrade pip \
    && { "${VENV_PYTHON}" -m pip uninstall -y torch triton torchvision torchaudio || true; } \
    && "${VENV_PYTHON}" -m pip install -r requirements/rocm-build.txt

# Step 5: clean any previous build output, build the vLLM wheel into
# /tmp/vllm-wheels and list the result.
# The target device and job count are exported once for the commands below.
RUN echo "========== [5/7] Build vLLM wheel ==========" \
    && cd /app/vllm \
    && export VLLM_TARGET_DEVICE=rocm MAX_JOBS="${MAX_JOBS}" \
    && "${VENV_PYTHON}" setup.py clean --all \
    && "${VENV_PYTHON}" setup.py bdist_wheel --dist-dir=/tmp/vllm-wheels \
    && ls -lh /tmp/vllm-wheels

# Step 6: install the packages listed in requirements/rocm.txt, replace any
# existing vllm installation with the wheel built in the previous step, and
# add uvloop.
# Only the uninstall is best-effort. The braces keep its "|| true" from also
# swallowing a failed requirements install (the "&&" / "||" chain is evaluated
# left to right with equal precedence).
RUN echo "========== [6/7] Install vLLM runtime dependencies ==========" \
    && cd /app/vllm \
    && "${VENV_PYTHON}" -m pip install -r requirements/rocm.txt \
    && { "${VENV_PYTHON}" -m pip uninstall -y vllm || true; } \
    && "${VENV_PYTHON}" -m pip install /tmp/vllm-wheels/*.whl \
    && "${VENV_PYTHON}" -m pip install uvloop

# Step 7: install the optional tools selected by INSTALL_LM_EVAL and
# INSTALL_FASTSAFETENSORS, then run an inline Python check that
#   - prints the torch HIP/CUDA versions and the libtorch_hip libraries found,
#   - fails the build when torch reports no HIP version,
#   - prints the installed version of each package of interest.
RUN echo "========== [7/7] Optional tools and final checks ==========" \
    && if [ "${INSTALL_LM_EVAL}" = "1" ]; then \
         "${VENV_PYTHON}" -m pip install "lm-eval[api]"; \
       else \
         echo "Skip lm-eval install"; \
       fi \
    && if [ "${INSTALL_FASTSAFETENSORS}" = "1" ]; then \
         "${VENV_PYTHON}" -m pip install "git+https://github.com/foundation-model-stack/fastsafetensors.git"; \
       else \
         echo "Skip fastsafetensors install"; \
       fi \
    && "${VENV_PYTHON}" - <<'PY'
from importlib import metadata
import glob
import os
import torch

REPORTED_PACKAGES = (
    "vllm",
    "torch",
    "triton",
    "torchvision",
    "torchaudio",
    "amdsmi",
    "amd-aiter",
    "atom",
    "mori",
)


def installed_version(dist_name):
    """Return the installed version of dist_name, or a placeholder if the lookup fails."""
    try:
        return metadata.version(dist_name)
    except Exception:
        return "<not installed>"


hip_version = torch.version.hip
print(f"torch.version.hip: {hip_version}")
print(f"torch.version.cuda: {torch.version.cuda}")

lib_dir = os.path.join(os.path.dirname(torch.__file__), "lib")
print(f"torch lib dir: {lib_dir}")
hip_libs = glob.glob(os.path.join(lib_dir, "libtorch_hip.so*"))
print(f"libtorch_hip candidates: {hip_libs}")

# A torch build without a HIP version is not a ROCm build; stop the image build here.
if hip_version is None:
    raise RuntimeError("Torch is not ROCm build (torch.version.hip is None).")

print("Final package versions:")
for dist_name in REPORTED_PACKAGES:
    print(f" {dist_name}: {installed_version(dist_name)}")
PY

# Default command: start a bash shell when the container is run without arguments.
CMD ["/bin/bash"]
85 changes: 85 additions & 0 deletions docker/build_vllm_atom_oot.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env bash
#
# Builds the vLLM-on-ATOM Docker image from Dockerfile_vllm_atom_oot, which
# lives next to this script. Settings are taken from environment variables;
# extra command-line arguments are forwarded to "docker build".
set -euo pipefail

# Absolute path of the directory holding this script (<repo>/docker).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Repository root, used as the Docker build context. The script sits one level
# below the root, so go up exactly one directory; going up two would select the
# parent of the repository and send unrelated files to the Docker daemon.
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

REPO_ROOT is computed as ${SCRIPT_DIR}/../.., but this script lives in <repo>/docker, so that resolves to the parent of the repository root. This makes the Docker build context unexpectedly large and can break relative paths (and may leak extra files into the build context). Compute the repo root as ${SCRIPT_DIR}/.. instead.

Suggested change
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

Copilot uses AI. Check for mistakes.
# Log location: <script dir>/logs and a timestamped file name, unless the
# caller already provides LOG_DIR / LOG_FILE.
: "${LOG_DIR:=${SCRIPT_DIR}/logs}"
: "${LOG_FILE:=${LOG_DIR}/build_vllm_atom_oot_$(date +%Y%m%d_%H%M%S).log}"

mkdir -p "${LOG_DIR}"
# From this point on, everything written to stdout or stderr is shown on the
# terminal and appended to the log file.
exec > >(tee -a "${LOG_FILE}") 2>&1

DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_vllm_atom_oot"

# Build settings. A value already present (and non-empty) in the environment
# wins; otherwise the default below is used.
: "${IMAGE_TAG:=rocm/atom-vllm-dev:nightly_202603040155}"
: "${BASE_IMAGE:=rocm/atom-dev:nightly_202603040155}"
: "${VLLM_REPO:=https://github.com/vllm-project/vllm.git}"
: "${VLLM_COMMIT:=f5d17400303149bbb480f6abfb6f7bb646c1d895}"
: "${MAX_JOBS:=64}"
: "${INSTALL_LM_EVAL:=1}"
: "${PULL_BASE_IMAGE:=1}"
: "${BUILD_NO_CACHE:=1}"

#######################################
# Print a message framed by a rule line above and below it.
# Arguments: $1 - text to display
# Outputs:   three lines on stdout (rule, text, rule)
#######################################
print_banner() {
  local rule="============================================================"
  echo "${rule}"
  echo "$1"
  echo "${rule}"
}

# Show the effective settings and the planned steps before any work starts.
print_banner "Build vLLM on top of ATOM base image"
cat <<EOF
Log file : ${LOG_FILE}
Dockerfile : ${DOCKERFILE_PATH}
Build context : ${REPO_ROOT}
Target image : ${IMAGE_TAG}
Base image : ${BASE_IMAGE}
vLLM repo : ${VLLM_REPO}
vLLM commit : ${VLLM_COMMIT}
MAX_JOBS : ${MAX_JOBS}
INSTALL_LM_EVAL : ${INSTALL_LM_EVAL}
BUILD_NO_CACHE : ${BUILD_NO_CACHE}

Build plan:
 Step 1/4: (optional) pull base image
 Step 2/4: check/remove existing target image
 Step 3/4: build image from Dockerfile_vllm_atom_oot
 Step 4/4: print final image info

EOF

# Step 1: pull the base image only when PULL_BASE_IMAGE is exactly "1";
# any other value skips the pull.
case "${PULL_BASE_IMAGE}" in
  1)
    print_banner "Step 1/4 - Pull base image: ${BASE_IMAGE}"
    docker pull "${BASE_IMAGE}"
    ;;
  *)
    print_banner "Step 1/4 - Skip base image pull (PULL_BASE_IMAGE=${PULL_BASE_IMAGE})"
    ;;
esac

# Step 2: if an image with the target tag is already present locally, report
# its ID and creation time and force-remove it before building.
print_banner "Step 2/4 - Check whether target image already exists"
if ! docker image inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
  echo "Target image does not exist yet: ${IMAGE_TAG}"
else
  echo "Target image already exists: ${IMAGE_TAG}"
  docker image inspect "${IMAGE_TAG}" --format 'Existing image -> ID={{.Id}} Created={{.Created}}'
  echo "Removing existing target image: ${IMAGE_TAG}"
  docker image rm -f "${IMAGE_TAG}"
fi
echo

# Step 3: build the image.
print_banner "Step 3/4 - Build target image: ${IMAGE_TAG}"

# The docker arguments are collected in an array so that each one stays a
# single word and the optional --no-cache flag can simply be left out, without
# depending on word-splitting of an unquoted variable.
build_args=(build)
if [[ "${BUILD_NO_CACHE}" == "1" ]]; then
  build_args+=(--no-cache)
fi
build_args+=(
  -f "${DOCKERFILE_PATH}"
  -t "${IMAGE_TAG}"
  --build-arg "BASE_IMAGE=${BASE_IMAGE}"
  --build-arg "VLLM_REPO=${VLLM_REPO}"
  --build-arg "VLLM_COMMIT=${VLLM_COMMIT}"
  --build-arg "MAX_JOBS=${MAX_JOBS}"
  --build-arg "INSTALL_LM_EVAL=${INSTALL_LM_EVAL}"
)

# Arguments given to this script are forwarded to docker build; the build
# context (REPO_ROOT) always comes last.
DOCKER_BUILDKIT=1 docker "${build_args[@]}" "$@" "${REPO_ROOT}"

# Step 4: report the tags, ID and creation time of the image just built.
print_banner "Step 4/4 - Build completed"
docker image inspect "${IMAGE_TAG}" --format 'Image={{.RepoTags}} ID={{.Id}} Created={{.Created}}'