diff --git a/docker/Dockerfile_vllm_atom_oot b/docker/Dockerfile_vllm_atom_oot
new file mode 100644
index 000000000..4bf2cae08
--- /dev/null
+++ b/docker/Dockerfile_vllm_atom_oot
@@ -0,0 +1,108 @@
+# Build vLLM (ROCm) from source on top of an ATOM base image.
+#
+# The last RUN step feeds a Python script through a heredoc, which requires
+# BuildKit; docker/build_vllm_atom_oot.sh enables it for the build.
+ARG BASE_IMAGE="rocm/atom-dev:nightly_202603040155"
+FROM ${BASE_IMAGE}
+
+# Build-time knobs; override with --build-arg.
+ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
+ARG VLLM_COMMIT="f5d17400303149bbb480f6abfb6f7bb646c1d895"
+ARG MAX_JOBS=64
+ARG INSTALL_LM_EVAL=1
+ARG INSTALL_FASTSAFETENSORS=1
+ARG VENV_PYTHON="/opt/venv/bin/python"
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PATH="/opt/venv/bin:${PATH}"
+ENV VLLM_TARGET_DEVICE=rocm
+ENV MAX_JOBS=${MAX_JOBS}
+ENV LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/torch/lib:${LD_LIBRARY_PATH}"
+WORKDIR /app
+
+RUN echo "========== [1/7] Prepare build tools ==========" \
+    && apt-get update \
+    && apt-get --fix-broken install -y \
+    && apt-get install -y --no-install-recommends git ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Informational only: each `pip show` is grouped with its own `|| true`, so a
+# missing package is reported by pip but never fails the build.
+RUN echo "========== [2/7] Verify base packages (atom/aiter/mori) ==========" \
+    && ("${VENV_PYTHON}" -m pip show atom || true) \
+    && ("${VENV_PYTHON}" -m pip show amd-aiter || true) \
+    && ("${VENV_PYTHON}" -m pip show mori || true)
+
+RUN echo "========== [3/7] Clone vLLM ==========" \
+    && git clone "${VLLM_REPO}" /app/vllm \
+    && cd /app/vllm \
+    && git checkout "${VLLM_COMMIT}" \
+    && git submodule update --init --recursive \
+    && echo "vLLM commit:" \
+    && git rev-parse HEAD
+
+# Follow vLLM ROCm standard: allow torch/triton override.
+# Only the uninstall is best-effort, so it is grouped with its `|| true`; in an
+# ungrouped `a && b || true && c` chain a failure of `a` is swallowed as well.
+RUN echo "========== [4/7] Install vLLM ROCm build dependencies ==========" \
+    && cd /app/vllm \
+    && "${VENV_PYTHON}" -m pip install --upgrade pip \
+    && ("${VENV_PYTHON}" -m pip uninstall -y torch triton torchvision torchaudio || true) \
+    && "${VENV_PYTHON}" -m pip install -r requirements/rocm-build.txt
+
+RUN echo "========== [5/7] Build vLLM wheel ==========" \
+    && cd /app/vllm \
+    && VLLM_TARGET_DEVICE=rocm "${VENV_PYTHON}" setup.py clean --all \
+    && MAX_JOBS="${MAX_JOBS}" VLLM_TARGET_DEVICE=rocm "${VENV_PYTHON}" setup.py bdist_wheel --dist-dir=/tmp/vllm-wheels \
+    && ls -lh /tmp/vllm-wheels
+
+# As in step 4, only the uninstall may fail silently; a failed install of
+# requirements/rocm.txt must abort the build instead of being masked.
+RUN echo "========== [6/7] Install vLLM runtime dependencies ==========" \
+    && cd /app/vllm \
+    && "${VENV_PYTHON}" -m pip install -r requirements/rocm.txt \
+    && ("${VENV_PYTHON}" -m pip uninstall -y vllm || true) \
+    && "${VENV_PYTHON}" -m pip install /tmp/vllm-wheels/*.whl \
+    && "${VENV_PYTHON}" -m pip install uvloop
+
+# Optional extras, then a Python sanity check that fails the build when torch
+# is not a ROCm build and prints the installed versions of the key packages.
+RUN echo "========== [7/7] Optional tools and final checks ==========" \
+    && if [ "${INSTALL_LM_EVAL}" = "1" ]; then "${VENV_PYTHON}" -m pip install "lm-eval[api]"; else echo "Skip lm-eval install"; fi \
+    && if [ "${INSTALL_FASTSAFETENSORS}" = "1" ]; then "${VENV_PYTHON}" -m pip install "git+https://github.com/foundation-model-stack/fastsafetensors.git"; else echo "Skip fastsafetensors install"; fi \
+    && "${VENV_PYTHON}" - <<'PY'
+import importlib.metadata as m
+import glob
+import os
+import torch
+
+print(f"torch.version.hip: {torch.version.hip}")
+print(f"torch.version.cuda: {torch.version.cuda}")
+torch_lib_dir = os.path.join(os.path.dirname(torch.__file__), "lib")
+print(f"torch lib dir: {torch_lib_dir}")
+print(f"libtorch_hip candidates: {glob.glob(os.path.join(torch_lib_dir, 'libtorch_hip.so*'))}")
+if torch.version.hip is None:
+    raise RuntimeError("Torch is not ROCm build (torch.version.hip is None).")
+
+pkgs = [
+    "vllm",
+    "torch",
+    "triton",
+    "torchvision",
+    "torchaudio",
+    "amdsmi",
+    "amd-aiter",
+    "atom",
+    "mori",
+]
+print("Final package versions:")
+for p in pkgs:
+    try:
+        print(f" {p}: {m.version(p)}")
+    except m.PackageNotFoundError:
+        print(f" {p}: not installed")
+    except Exception as exc:  # diagnostics only; never fail the build here
+        print(f" {p}: unknown ({exc!r})")
+PY
+
+CMD ["/bin/bash"]
diff --git a/docker/build_vllm_atom_oot.sh b/docker/build_vllm_atom_oot.sh
new file mode 100644
index 000000000..d80869d94
--- /dev/null
+++ b/docker/build_vllm_atom_oot.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# Build the vLLM-on-ATOM image from docker/Dockerfile_vllm_atom_oot.
+#
+# Every setting below can be overridden through an environment variable of the
+# same name; extra arguments are passed straight through to `docker build`.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# This script lives in <repo>/docker, so the repository root is one level up.
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+LOG_DIR="${LOG_DIR:-${SCRIPT_DIR}/logs}"
+LOG_FILE="${LOG_FILE:-${LOG_DIR}/build_vllm_atom_oot_$(date +%Y%m%d_%H%M%S).log}"
+
+mkdir -p "${LOG_DIR}"
+# Mirror all stdout/stderr to terminal and log file.
+exec > >(tee -a "${LOG_FILE}") 2>&1
+
+DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile_vllm_atom_oot"
+IMAGE_TAG="${IMAGE_TAG:-rocm/atom-vllm-dev:nightly_202603040155}"
+BASE_IMAGE="${BASE_IMAGE:-rocm/atom-dev:nightly_202603040155}"
+VLLM_REPO="${VLLM_REPO:-https://github.com/vllm-project/vllm.git}"
+VLLM_COMMIT="${VLLM_COMMIT:-f5d17400303149bbb480f6abfb6f7bb646c1d895}"
+MAX_JOBS="${MAX_JOBS:-64}"
+INSTALL_LM_EVAL="${INSTALL_LM_EVAL:-1}"
+INSTALL_FASTSAFETENSORS="${INSTALL_FASTSAFETENSORS:-1}"
+PULL_BASE_IMAGE="${PULL_BASE_IMAGE:-1}"
+BUILD_NO_CACHE="${BUILD_NO_CACHE:-1}"
+
+# Print "$1" framed by two separator lines.
+print_banner() {
+  echo "============================================================"
+  echo "$1"
+  echo "============================================================"
+}
+
+print_banner "Build vLLM on top of ATOM base image"
+echo "Log file : ${LOG_FILE}"
+echo "Dockerfile : ${DOCKERFILE_PATH}"
+echo "Build context : ${REPO_ROOT}"
+echo "Target image : ${IMAGE_TAG}"
+echo "Base image : ${BASE_IMAGE}"
+echo "vLLM repo : ${VLLM_REPO}"
+echo "vLLM commit : ${VLLM_COMMIT}"
+echo "MAX_JOBS : ${MAX_JOBS}"
+echo "INSTALL_LM_EVAL : ${INSTALL_LM_EVAL}"
+echo "INSTALL_FASTSAFETENSORS : ${INSTALL_FASTSAFETENSORS}"
+echo "BUILD_NO_CACHE : ${BUILD_NO_CACHE}"
+echo
+echo "Build plan:"
+echo " Step 1/4: (optional) pull base image"
+echo " Step 2/4: check/remove existing target image"
+echo " Step 3/4: build image from Dockerfile_vllm_atom_oot"
+echo " Step 4/4: print final image info"
+echo
+
+if [[ "${PULL_BASE_IMAGE}" == "1" ]]; then
+  print_banner "Step 1/4 - Pull base image: ${BASE_IMAGE}"
+  docker pull "${BASE_IMAGE}"
+else
+  print_banner "Step 1/4 - Skip base image pull (PULL_BASE_IMAGE=${PULL_BASE_IMAGE})"
+fi
+
+print_banner "Step 2/4 - Check whether target image already exists"
+if docker image inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
+  echo "Target image already exists: ${IMAGE_TAG}"
+  docker image inspect "${IMAGE_TAG}" --format 'Existing image -> ID={{.Id}} Created={{.Created}}'
+  echo "Removing existing target image: ${IMAGE_TAG}"
+  docker image rm -f "${IMAGE_TAG}"
+else
+  echo "Target image does not exist yet: ${IMAGE_TAG}"
+fi
+echo
+
+print_banner "Step 3/4 - Build target image: ${IMAGE_TAG}"
+NO_CACHE_FLAG=""
+if [[ "${BUILD_NO_CACHE}" == "1" ]]; then
+  NO_CACHE_FLAG="--no-cache"
+fi
+
+# ${NO_CACHE_FLAG} is intentionally unquoted: when empty it must expand to no
+# argument at all rather than to an empty string. Caller-supplied arguments
+# ("$@") come after the script's own flags.
+DOCKER_BUILDKIT=1 docker build \
+  ${NO_CACHE_FLAG} \
+  -f "${DOCKERFILE_PATH}" \
+  -t "${IMAGE_TAG}" \
+  --build-arg "BASE_IMAGE=${BASE_IMAGE}" \
+  --build-arg "VLLM_REPO=${VLLM_REPO}" \
+  --build-arg "VLLM_COMMIT=${VLLM_COMMIT}" \
+  --build-arg "MAX_JOBS=${MAX_JOBS}" \
+  --build-arg "INSTALL_LM_EVAL=${INSTALL_LM_EVAL}" \
+  --build-arg "INSTALL_FASTSAFETENSORS=${INSTALL_FASTSAFETENSORS}" \
+  "$@" \
+  "${REPO_ROOT}"
+
+print_banner "Step 4/4 - Build completed"
+docker image inspect "${IMAGE_TAG}" --format 'Image={{.RepoTags}} ID={{.Id}} Created={{.Created}}'