diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2057e6b67..a51a779ee 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -12,6 +12,7 @@ on: env: PRIMUS_TURBO_COMMIT: 5233748e9c5c5795a6484ab31ece47c442d29ec2 # feat(mxfp4): refactor gemm mxfp4 and mxfp8. fuse transpose, hadamard transform and quantization. (#195) ROCSHMEM_COMMIT: 17ff985c026f9f97f85068647e863ab541dd5645 # Update version to 3.2.0 for 7.2.0 rocm release (#351) (#355) + UCCL_COMMIT: 5afb4117893c58cc0c8557d9286336141a301053 # [EP]: fix fp8 error of internode_ll on amd gfx950 arch. (#710) BASE_IMAGE: docker.io/rocm/primus:v26.1 MAXTEXT_BASE_IMAGE: docker.io/rocm/jax-training:maxtext-v25.9 @@ -101,6 +102,7 @@ jobs: --build-arg PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT} \ --build-arg ROCSHMEM_COMMIT=${ROCSHMEM_COMMIT} \ --build-arg PRIMUS_TURBO_FRAMEWORK=PYTORCH \ + --build-arg UCCL_COMMIT=${UCCL_COMMIT} \ $GITHUB_WORKSPACE/.github/workflows/docker end_time=$(date +%s) elapsed=$((end_time - start_time)) @@ -157,6 +159,7 @@ jobs: --build-arg BASE_IMAGE=${MAXTEXT_BASE_IMAGE} \ --build-arg PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT} \ --build-arg PRIMUS_TURBO_FRAMEWORK=JAX \ + --build-arg UCCL_COMMIT=${UCCL_COMMIT} \ --build-arg ROCSHMEM_COMMIT=${ROCSHMEM_COMMIT} . end_time=$(date +%s) elapsed=$((end_time - start_time)) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index 444326c3f..6c50591ab 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} ARG PRIMUS_TURBO_COMMIT ARG PRIMUS_TURBO_FRAMEWORK ARG ROCSHMEM_COMMIT - +ARG UCCL_COMMIT # Non-interactive APT ENV DEBIAN_FRONTEND=noninteractive @@ -12,7 +12,7 @@ ENV DEBIAN_FRONTEND=noninteractive # Install build dependencies # --------------------------------------------------------------------------- RUN apt-get update && \ - apt-get install -y rdma-core libibverbs-dev libnuma-dev numactl&& \ + apt-get install -y rdma-core libibverbs-dev libnuma-dev numactl libgoogle-glog-dev && \ apt-get install -y --reinstall binutils RUN rm -rf /var/lib/apt/lists/* @@ -59,6 +59,22 @@ RUN cd /opt && \ RUN rm -rf /opt/Primus-Turbo +# --------------------------------------------------------------------------- +# Install UCCL-EP (skip for JAX framework) +# --------------------------------------------------------------------------- +RUN if [ "$PRIMUS_TURBO_FRAMEWORK" != "JAX" ]; then \ + cd /opt && \ + git clone https://github.com/uccl-project/uccl.git && \ + cd uccl && \ + git checkout ${UCCL_COMMIT} && \ + cd ep && \ + DISABLE_AGGRESSIVE_ATOMIC=0 TORCH_CUDA_ARCH_LIST="gfx942,gfx950" python3 setup.py build && cd .. && \ + cp ep/build/**/*.so uccl && \ + pip3 install --no-build-isolation . -v && \ + cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v && \ + rm -rf /opt/uccl; \ + fi + # Set the default working directory WORKDIR /opt diff --git a/examples/moe_package/run_deepseek_v2_lite_pretrain_mi355x.sh b/examples/moe_package/run_deepseek_v2_lite_pretrain_mi355x.sh index a7f664f1f..d4fa1d484 100755 --- a/examples/moe_package/run_deepseek_v2_lite_pretrain_mi355x.sh +++ b/examples/moe_package/run_deepseek_v2_lite_pretrain_mi355x.sh @@ -73,11 +73,12 @@ export TRAIN_ITERS=${TRAIN_ITERS:-10} # 5 - Sync-free MoE (stage 1) # 6 - CPU NUMA binding helper # 7 - Manual GC helper +# 8 - Using UCCL-EP # MoE_Features=(0 7) # MoE_Features=(3 7) # MoE_Features=(3 4 7) # MoE_Features=(3 4 5 7) -MoE_Features=(3 4 5 6 7) +MoE_Features=(3 4 5 6 7 8) FEATURE_ARGS=() PRIMUS_TURBO_ENABLED="False" @@ -133,6 +134,9 @@ for feature in "${MoE_Features[@]}"; do FEATURE_ARGS+=("--manual_gc" "True") FEATURE_ARGS+=("--manual_gc_interval" "1") ;; + 8) + export USING_UEP=1 + ;; *) ;; esac done diff --git a/examples/moe_package/run_deepseek_v2_pretrain_mi355x.sh b/examples/moe_package/run_deepseek_v2_pretrain_mi355x.sh index a816a8816..e171afabd 100755 --- a/examples/moe_package/run_deepseek_v2_pretrain_mi355x.sh +++ b/examples/moe_package/run_deepseek_v2_pretrain_mi355x.sh @@ -70,12 +70,13 @@ export TRAIN_ITERS=${TRAIN_ITERS:-10} # 5 - Sync-free MoE (stage 1/2) # 6 - CPU NUMA binding helper # 7 - Manual GC helper +# 8 - Using UCCL-EP if [ -z "${MoE_Features}" ]; then # MoE_Features=(0 7) # MoE_Features=(3 7) # MoE_Features=(3 4 7) # MoE_Features=(3 4 6 7) - MoE_Features=(3 4 5 6 7) + MoE_Features=(3 4 5 6 7 8) else # Convert string to array # shellcheck disable=SC2128 @@ -136,6 +137,9 @@ for feature in "${MoE_Features[@]}"; do FEATURE_ARGS+=("--manual_gc" "True") FEATURE_ARGS+=("--manual_gc_interval" "1") ;; + 8) + export USING_UEP=1 + ;; *) ;; esac done diff --git a/examples/run_local_pretrain.sh b/examples/run_local_pretrain.sh index 4cb36fdb3..da4d132aa 100755 --- a/examples/run_local_pretrain.sh +++ b/examples/run_local_pretrain.sh @@ -82,6 +82,9 @@ done < <(env | grep "^HIPBLASLT_") while IFS='=' read -r name _; do ENV_ARGS+=("--env" "$name") done < <(env | grep "^PRIMUS_") +while IFS='=' read -r name _; do + ENV_ARGS+=("--env" "$name") +done < <(env | grep "^UCCL_") while IFS='=' read -r name _; do ENV_ARGS+=("--env" "$name") done < <(env | grep "^NCCL_") @@ -184,6 +187,8 @@ docker_podman_proxy run --rm \ --env MAXTEXT_PATH \ --env BACKEND_PATH \ --env REBUILD_PRIMUS_TURBO \ + --env REBUILD_UCCL \ + --env USING_UEP \ "${ENV_ARGS[@]}" \ --ipc=host --network=host \ --device=/dev/kfd --device=/dev/dri \ diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index 016bc23db..2c78542d6 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -383,6 +383,82 @@ else LOG_INFO "Skip Primus Turbo rebuild. REBUILD_PRIMUS_TURBO=$REBUILD_PRIMUS_TURBO" fi +# ----------------- Rebuild UCCL ----------------- +export REBUILD_UCCL=${REBUILD_UCCL:-0} +if [ "$REBUILD_UCCL" == "1" ]; then + LOG_INFO_RANK0 "Rebuilding UCCL from source..." + apt update && apt install -y rdma-core libibverbs-dev libnuma-dev libgoogle-glog-dev + mkdir -p "/workspace/" + cd "/workspace" || exit + + # Clean up old directory if exists to avoid git clone conflicts + if [ -d "uccl" ]; then + LOG_INFO_RANK0 "Removing existing uccl directory..." + rm -rf uccl + fi + + git clone https://github.com/uccl-project/uccl.git + cd uccl || exit + cd ep && python3 setup.py build && cd .. + cp ep/build/**/*.so uccl + pip3 install --no-build-isolation . + cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v + cd "${PRIMUS_PATH}" || exit + LOG_INFO_RANK0 "Rebuilding UCCL from source done." +else + LOG_INFO_RANK0 "Skip UCCL rebuild. REBUILD_UCCL=$REBUILD_UCCL" +fi + +# ----------------- Using UCCL-EP ----------------- +if [ "$USING_UEP" == "1" ]; then + LOG_INFO_RANK0 "USING_UEP is enabled, checking required packages..." + + if ! python3 -m pip show uccl &>/dev/null || ! python3 -m pip show deep_ep &>/dev/null; then + LOG_ERROR "uccl is not installed! Please use pre-installed primus image or set REBUILD_UCCL=1." + exit 1 + fi + LOG_INFO_RANK0 "uccl package is installed: $(python3 -m pip show uccl | grep Version)" + LOG_INFO_RANK0 "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)" + + if [ "$ENABLE_NUMA_BINDING" != "1" ]; then + LOG_INFO_RANK0 "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly." + fi + + export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP + LOG_INFO_RANK0 "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" + + + # network settings for UCCL + export UCCL_IB_GID_INDEX=${UCCL_IB_GID_INDEX:-$NCCL_IB_GID_INDEX} + export UCCL_IB_HCA=${UCCL_IB_HCA:-$NCCL_IB_HCA} + export UCCL_SOCKET_IFNAME=${UCCL_SOCKET_IFNAME:-$NCCL_SOCKET_IFNAME} + + # set low latency and normal inflight and bytes to avoid hang on AMD Pollara AI NIC and Broadcom Thor-2 + if [ "$USING_AINIC" == "1" ]; then + export UCCL_IB_MAX_INFLIGHT_NORMAL=${UCCL_IB_MAX_INFLIGHT_NORMAL:-1} + export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY=${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1} + export UCCL_IB_MAX_INFLIGHT_BYTES=${UCCL_IB_MAX_INFLIGHT_BYTES:-4194304} # 4MB + elif [ "$REBUILD_BNXT" == "1" ]; then # Broadcom Thor-2 + # FIXME(zhuang12): use `USING_BNXT` for Broadcom Thor-2 maybe better than `REBUILD_BNXT` + export UCCL_IB_MAX_INFLIGHT_NORMAL=${UCCL_IB_MAX_INFLIGHT_NORMAL:-1} + export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY=${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1} + export UCCL_IB_MAX_INFLIGHT_BYTES=${UCCL_IB_MAX_INFLIGHT_BYTES:-1572864} + fi + + + LOG_INFO_RANK0 "==========UCCL Network Settings==========" + LOG_INFO_RANK0 "UCCL_IB_GID_INDEX: $UCCL_IB_GID_INDEX" + LOG_INFO_RANK0 "UCCL_IB_HCA: $UCCL_IB_HCA" + LOG_INFO_RANK0 "UCCL_SOCKET_IFNAME: $UCCL_SOCKET_IFNAME" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_NORMAL: $UCCL_IB_MAX_INFLIGHT_NORMAL" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: $UCCL_IB_MAX_INFLIGHT_LOW_LATENCY" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_BYTES: $UCCL_IB_MAX_INFLIGHT_BYTES" + LOG_INFO_RANK0 "" +else + export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO + LOG_INFO_RANK0 "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" +fi + # nvte debug envs export NVTE_DEBUG=0 # 0, 1 export NVTE_DEBUG_LEVEL=0 # 0, 1, 2 diff --git a/primus/modules/trainer/megatron/utils.py b/primus/modules/trainer/megatron/utils.py index 5bb8350f5..47736333b 100644 --- a/primus/modules/trainer/megatron/utils.py +++ b/primus/modules/trainer/megatron/utils.py @@ -536,5 +536,9 @@ def validate_args_on_rocm(args): assert ( args.moe_router_dtype == "fp32" ), "DeepEP only supports float32 probs, please set `moe_router_dtype=fp32`" - if args.expert_model_parallel_size >= 16: + if ( + args.expert_model_parallel_size >= 16 + and os.getenv("PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND", "DEEP_EP") == "TURBO" + ): + # Turbo DeepEP is not supported for CUs > 32 when using internode dispatch/combine. assert args.turbo_deepep_num_cu <= 32, "Set `turbo_deepep_num_cu<=32` when using ep_size >= 16." diff --git a/runner/helpers/hooks/04_rebuild_uccl.sh b/runner/helpers/hooks/04_rebuild_uccl.sh index 75122b0a2..f71af14c5 100644 --- a/runner/helpers/hooks/04_rebuild_uccl.sh +++ b/runner/helpers/hooks/04_rebuild_uccl.sh @@ -21,11 +21,9 @@ fi UCCL_DIR="/tmp/uccl" UCCL_BUILD_DIR="${UCCL_BUILD_DIR:-/tmp/uccl_${HOSTNAME:-$(hostname)}}" UCCL_REF="${UCCL_REF:-}" -GPU_ARCHS="${GPU_ARCHS:-gfx942;gfx950}" LOG_INFO_RANK0 "[hook system] REBUILD_UCCL=1 → Building uccl in /tmp " LOG_INFO_RANK0 " Build directory : ${UCCL_BUILD_DIR}" -LOG_INFO_RANK0 " GPU_ARCHS : ${GPU_ARCHS}" if [ -d "$UCCL_DIR" ]; then LOG_INFO_RANK0 "[hook system] Found existed uccl in /tmp, remove it" @@ -47,7 +45,7 @@ if [[ -n "$UCCL_REF" ]]; then fi LOG_INFO_RANK0 "[hook system] Building uccl ep" -cd ep && PYTORCH_ROCM_ARCH="${GPU_ARCHS}" python3 setup.py build && cd .. +cd ep && python3 setup.py build && cd .. LOG_INFO_RANK0 "[hook system] Building uccl ep done" diff --git a/runner/helpers/hooks/05_using_uep.sh b/runner/helpers/hooks/05_using_uep.sh new file mode 100644 index 000000000..2bd384851 --- /dev/null +++ b/runner/helpers/hooks/05_using_uep.sh @@ -0,0 +1,62 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +# +# System hook: enable using UEP settings. +# +# Trigger: +# export USING_UEP=1 +# +############################################################################### + + +if [ "$USING_UEP" == "1" ]; then + LOG_INFO_RANK0 "USING_UEP is enabled, checking required packages..." + + if ! python3 -m pip show uccl &>/dev/null || ! python3 -m pip show deep_ep &>/dev/null; then + LOG_ERROR "uccl is not installed! Please use pre-installed primus image or set REBUILD_UCCL=1." + exit 1 + fi + LOG_INFO_RANK0 "uccl package is installed: $(python3 -m pip show uccl | grep Version)" + LOG_INFO_RANK0 "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)" + + if [ "$ENABLE_NUMA_BINDING" != "1" ]; then + LOG_WARN "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly." + fi + + export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP + LOG_INFO_RANK0 "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" + + # network settings for UCCL + export UCCL_IB_GID_INDEX=${UCCL_IB_GID_INDEX:-$NCCL_IB_GID_INDEX} + export UCCL_IB_HCA=${UCCL_IB_HCA:-$NCCL_IB_HCA} + export UCCL_SOCKET_IFNAME=${UCCL_SOCKET_IFNAME:-$NCCL_SOCKET_IFNAME} + + # set low latency and normal inflight and bytes to avoid hang on AMD Pollara AI NIC and Broadcom Thor-2 + if [ "$USING_AINIC" == "1" ]; then + export UCCL_IB_MAX_INFLIGHT_NORMAL=${UCCL_IB_MAX_INFLIGHT_NORMAL:-1} + export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY=${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1} + export UCCL_IB_MAX_INFLIGHT_BYTES=${UCCL_IB_MAX_INFLIGHT_BYTES:-4194304} # 4MB + elif [ "$REBUILD_BNXT" == "1" ]; then # Broadcom Thor-2 + # FIXME(zhuang12): use `USING_BNXT` for Broadcom Thor-2 maybe better than `REBUILD_BNXT` + export UCCL_IB_MAX_INFLIGHT_NORMAL=${UCCL_IB_MAX_INFLIGHT_NORMAL:-1} + export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY=${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1} + export UCCL_IB_MAX_INFLIGHT_BYTES=${UCCL_IB_MAX_INFLIGHT_BYTES:-1572864} + fi + + + LOG_INFO_RANK0 "==========UCCL Network Settings==========" + LOG_INFO_RANK0 "UCCL_IB_GID_INDEX: $UCCL_IB_GID_INDEX" + LOG_INFO_RANK0 "UCCL_IB_HCA: $UCCL_IB_HCA" + LOG_INFO_RANK0 "UCCL_SOCKET_IFNAME: $UCCL_SOCKET_IFNAME" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_NORMAL: $UCCL_IB_MAX_INFLIGHT_NORMAL" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: $UCCL_IB_MAX_INFLIGHT_LOW_LATENCY" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_BYTES: $UCCL_IB_MAX_INFLIGHT_BYTES" + LOG_INFO_RANK0 "" +else + export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO + LOG_INFO_RANK0 "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" +fi diff --git a/tests/trainer/test_megatron_trainer.py b/tests/trainer/test_megatron_trainer.py index ad3acc280..1cf239be5 100644 --- a/tests/trainer/test_megatron_trainer.py +++ b/tests/trainer/test_megatron_trainer.py @@ -324,6 +324,40 @@ def test_turbo_deepep(self): ], ) + def test_deepseekv2_lite_uep(self): + run_script( + self.__class__.__name__, + "deepseekv2_lite_uep", + exp_path="examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml", + env_override={"USING_UEP": "1", "REBUILD_UCCL": "1"}, + extra_args=[ + "--num_layers", + "4", + "--train_iters", + "3", + "--micro_batch_size", + "1", + "--global_batch_size", + "8", + "--moe_layer_freq", + "1", + "--expert_model_parallel_size", + "8", + "--use_turbo_deepep", + "1", + "--enable_primus_turbo", + "1", + "--moe_router_dtype", + "fp32", + "--moe_shared_expert_overlap", + "0", + "--moe_use_legacy_grouped_gemm", + "1", + "--turbo_sync_free_moe_stage", + "3", + ], + ) + class TestMegatronTrainerDeterministic(PrimusUT): def __init__(self, *args, **kwargs):