From fc42d7af252fa625c78d6cfb4f1331496bfe1ee4 Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Thu, 5 Feb 2026 09:08:01 +0000 Subject: [PATCH 01/16] feat: add uccl-ep --- .github/workflows/ci.yaml | 3 ++ .github/workflows/docker/Dockerfile | 16 ++++++- .../run_deepseek_v2_lite_pretrain_mi355x.sh | 6 ++- .../run_deepseek_v2_pretrain_mi355x.sh | 6 ++- examples/run_local_pretrain.sh | 2 + examples/run_pretrain.sh | 48 +++++++++++++++++++ primus/modules/trainer/megatron/utils.py | 6 ++- runner/helpers/hooks/05_using_uep.sh | 35 ++++++++++++++ 8 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 runner/helpers/hooks/05_using_uep.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2057e6b67..a51a779ee 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -12,6 +12,7 @@ on: env: PRIMUS_TURBO_COMMIT: 5233748e9c5c5795a6484ab31ece47c442d29ec2 # feat(mxfp4): refactor gemm mxfp4 and mxfp8. fuse transpose, hadamard transform and quantization. (#195) ROCSHMEM_COMMIT: 17ff985c026f9f97f85068647e863ab541dd5645 # Update version to 3.2.0 for 7.2.0 rocm release (#351) (#355) + UCCL_COMMIT: 5afb4117893c58cc0c8557d9286336141a301053 # [EP]: fix fp8 error of internode_ll on amd gfx950 arch. (#710) BASE_IMAGE: docker.io/rocm/primus:v26.1 MAXTEXT_BASE_IMAGE: docker.io/rocm/jax-training:maxtext-v25.9 @@ -101,6 +102,7 @@ jobs: --build-arg PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT} \ --build-arg ROCSHMEM_COMMIT=${ROCSHMEM_COMMIT} \ --build-arg PRIMUS_TURBO_FRAMEWORK=PYTORCH \ + --build-arg UCCL_COMMIT=${UCCL_COMMIT} \ $GITHUB_WORKSPACE/.github/workflows/docker end_time=$(date +%s) elapsed=$((end_time - start_time)) @@ -157,6 +159,7 @@ jobs: --build-arg BASE_IMAGE=${MAXTEXT_BASE_IMAGE} \ --build-arg PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT} \ --build-arg PRIMUS_TURBO_FRAMEWORK=JAX \ + --build-arg UCCL_COMMIT=${UCCL_COMMIT} \ --build-arg ROCSHMEM_COMMIT=${ROCSHMEM_COMMIT} . end_time=$(date +%s) elapsed=$((end_time - start_time)) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index 444326c3f..780e00d46 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} ARG PRIMUS_TURBO_COMMIT ARG PRIMUS_TURBO_FRAMEWORK ARG ROCSHMEM_COMMIT - +ARG UCCL_COMMIT # Non-interactive APT ENV DEBIAN_FRONTEND=noninteractive @@ -59,6 +59,20 @@ RUN cd /opt && \ RUN rm -rf /opt/Primus-Turbo +# --------------------------------------------------------------------------- +# Install UCCL-EP +# --------------------------------------------------------------------------- +RUN cd /opt && \ + git clone https://github.com/uccl-project/uccl.git && \ + cd uccl && \ + git checkout ${UCCL_COMMIT} && \ + cd ep && python3 setup.py build && cd .. && \ + cp ep/build/**/*.so uccl && \ + pip3 install --no-build-isolation . -v \ + cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v + +RUN rm -rf /tmp/uccl + # Set the default working directory WORKDIR /opt diff --git a/examples/moe_package/run_deepseek_v2_lite_pretrain_mi355x.sh b/examples/moe_package/run_deepseek_v2_lite_pretrain_mi355x.sh index a7f664f1f..d4fa1d484 100755 --- a/examples/moe_package/run_deepseek_v2_lite_pretrain_mi355x.sh +++ b/examples/moe_package/run_deepseek_v2_lite_pretrain_mi355x.sh @@ -73,11 +73,12 @@ export TRAIN_ITERS=${TRAIN_ITERS:-10} # 5 - Sync-free MoE (stage 1) # 6 - CPU NUMA binding helper # 7 - Manual GC helper +# 8 - Using UCCL-EP # MoE_Features=(0 7) # MoE_Features=(3 7) # MoE_Features=(3 4 7) # MoE_Features=(3 4 5 7) -MoE_Features=(3 4 5 6 7) +MoE_Features=(3 4 5 6 7 8) FEATURE_ARGS=() PRIMUS_TURBO_ENABLED="False" @@ -133,6 +134,9 @@ for feature in "${MoE_Features[@]}"; do FEATURE_ARGS+=("--manual_gc" "True") FEATURE_ARGS+=("--manual_gc_interval" "1") ;; + 8) + export USING_UEP=1 + ;; *) ;; esac done diff --git a/examples/moe_package/run_deepseek_v2_pretrain_mi355x.sh b/examples/moe_package/run_deepseek_v2_pretrain_mi355x.sh index a816a8816..e171afabd 100755 --- a/examples/moe_package/run_deepseek_v2_pretrain_mi355x.sh +++ b/examples/moe_package/run_deepseek_v2_pretrain_mi355x.sh @@ -70,12 +70,13 @@ export TRAIN_ITERS=${TRAIN_ITERS:-10} # 5 - Sync-free MoE (stage 1/2) # 6 - CPU NUMA binding helper # 7 - Manual GC helper +# 8 - Using UCCL-EP if [ -z "${MoE_Features}" ]; then # MoE_Features=(0 7) # MoE_Features=(3 7) # MoE_Features=(3 4 7) # MoE_Features=(3 4 6 7) - MoE_Features=(3 4 5 6 7) + MoE_Features=(3 4 5 6 7 8) else # Convert string to array # shellcheck disable=SC2128 @@ -136,6 +137,9 @@ for feature in "${MoE_Features[@]}"; do FEATURE_ARGS+=("--manual_gc" "True") FEATURE_ARGS+=("--manual_gc_interval" "1") ;; + 8) + export USING_UEP=1 + ;; *) ;; esac done diff --git a/examples/run_local_pretrain.sh b/examples/run_local_pretrain.sh index 4cb36fdb3..38813585b 100755 --- a/examples/run_local_pretrain.sh +++ b/examples/run_local_pretrain.sh @@ -184,6 +184,8 @@ docker_podman_proxy run --rm \ --env MAXTEXT_PATH \ --env BACKEND_PATH \ --env REBUILD_PRIMUS_TURBO \ + --env REBUILD_UCCL \ + --env USING_UEP \ "${ENV_ARGS[@]}" \ --ipc=host --network=host \ --device=/dev/kfd --device=/dev/dri \ diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index 016bc23db..644a5109c 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -383,6 +383,54 @@ else LOG_INFO "Skip Primus Turbo rebuild. REBUILD_PRIMUS_TURBO=$REBUILD_PRIMUS_TURBO" fi +# ----------------- Rebuild UCCL ----------------- +export REBUILD_UCCL=${REBUILD_UCCL:-0} +if [ "$REBUILD_UCCL" == "1" ]; then + LOG_INFO "Rebuilding UCCL from source..." + apt update && apt install -y rdma-core libibverbs-dev libnuma-dev libgoogle-glog-dev + mkdir -p "/workspace/" + cd "/workspace" || exit + + # Clean up old directory if exists to avoid git clone conflicts + if [ -d "uccl" ]; then + LOG_INFO "Removing existing uccl directory..." + rm -rf uccl + fi + + git clone https://github.com/uccl-project/uccl.git + cd uccl || exit + cd ep && PYTORCH_ROCM_ARCH="gfx942;gfx950" python3 setup.py build && cd .. + cp ep/build/**/*.so uccl + pip3 install --no-build-isolation . + cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v + cd "${PRIMUS_PATH}" || exit + LOG_INFO "Rebuilding UCCL from source done." +else + LOG_INFO "Skip UCCL rebuild. REBUILD_UCCL=$REBUILD_UCCL" +fi + +# ----------------- Using UCCL-EP ----------------- +if [ "$USING_UEP" == "1" ]; then + LOG_INFO "USING_UEP is enabled, checking required packages..." + + if ! pip show uccl &>/dev/null || ! pip show deep_ep &>/dev/null; then + LOG_ERROR "uccl is not installed! Please use pre-installed primus image or set REBUILD_UCCL=1." + exit 1 + fi + LOG_INFO "uccl package is installed: $(pip show uccl | grep Version)" + LOG_INFO "deep_ep package is installed: $(pip show deep_ep | grep Version)" + + if [ "$ENABLE_NUMA_BINDING" != "1" ]; then + LOG_INFO "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly." + fi + + export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP + LOG_INFO "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" +else + export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO + LOG_INFO "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" +fi + # nvte debug envs export NVTE_DEBUG=0 # 0, 1 export NVTE_DEBUG_LEVEL=0 # 0, 1, 2 diff --git a/primus/modules/trainer/megatron/utils.py b/primus/modules/trainer/megatron/utils.py index 5bb8350f5..c28d4e212 100644 --- a/primus/modules/trainer/megatron/utils.py +++ b/primus/modules/trainer/megatron/utils.py @@ -536,5 +536,9 @@ def validate_args_on_rocm(args): assert ( args.moe_router_dtype == "fp32" ), "DeepEP only supports float32 probs, please set `moe_router_dtype=fp32`" - if args.expert_model_parallel_size >= 16: + if ( + args.expert_model_parallel_size >= 16 + and os.getenv("PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND") == "TURBO" + ): + # Turbo DeepEP is not supported CUs > 32 when using internode dispatch/combine. assert args.turbo_deepep_num_cu <= 32, "Set `turbo_deepep_num_cu<=32` when using ep_size >= 16." diff --git a/runner/helpers/hooks/05_using_uep.sh b/runner/helpers/hooks/05_using_uep.sh new file mode 100644 index 000000000..f3b4ffeda --- /dev/null +++ b/runner/helpers/hooks/05_using_uep.sh @@ -0,0 +1,35 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +# +# System hook: enable using ucep settings. +# +# Trigger: +# export USING_UEP=1 +# +############################################################################### + + +if [ "$USING_UEP" == "1" ]; then + LOG_INFO "USING_UEP is enabled, checking required packages..." + + if ! pip show uccl &>/dev/null || ! pip show deep_ep &>/dev/null; then + LOG_ERROR "uccl is not installed! Please use pre-installed primus image or set REBUILD_UCCL=1." + exit 1 + fi + LOG_INFO "uccl package is installed: $(pip show uccl | grep Version)" + LOG_INFO "deep_ep package is installed: $(pip show deep_ep | grep Version)" + + if [ "$ENABLE_NUMA_BINDING" != "1" ]; then + LOG_WARN "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly." + fi + + export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP + LOG_INFO "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" +else + export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO + LOG_INFO "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" +fi From 42735b2db2f297fe4f254bdd5081d5c590257744 Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Thu, 5 Feb 2026 09:11:53 +0000 Subject: [PATCH 02/16] update --- .github/workflows/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index 780e00d46..b0f251281 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -71,7 +71,7 @@ RUN cd /opt && \ pip3 install --no-build-isolation . -v \ cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v -RUN rm -rf /tmp/uccl +RUN rm -rf /opt/uccl # Set the default working directory WORKDIR /opt From 14783a38083b6d87d58a2ffd8a30c13afe7a755b Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Fri, 6 Feb 2026 02:36:46 +0000 Subject: [PATCH 03/16] fix --- .github/workflows/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index b0f251281..dbf3fce63 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -12,7 +12,7 @@ ENV DEBIAN_FRONTEND=noninteractive # Install build dependencies # --------------------------------------------------------------------------- RUN apt-get update && \ - apt-get install -y rdma-core libibverbs-dev libnuma-dev numactl&& \ + apt-get install -y rdma-core libibverbs-dev libnuma-dev numactl libgoogle-glog-dev && \ apt-get install -y --reinstall binutils RUN rm -rf /var/lib/apt/lists/* From 1b3acf692a4a32d68fad558dc44992e965b6e90b Mon Sep 17 00:00:00 2001 From: zhenhuang12 Date: Fri, 6 Feb 2026 10:39:11 +0800 Subject: [PATCH 04/16] Update runner/helpers/hooks/05_using_uep.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- runner/helpers/hooks/05_using_uep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner/helpers/hooks/05_using_uep.sh b/runner/helpers/hooks/05_using_uep.sh index f3b4ffeda..f42935aab 100644 --- a/runner/helpers/hooks/05_using_uep.sh +++ b/runner/helpers/hooks/05_using_uep.sh @@ -5,7 +5,7 @@ # See LICENSE for license information. ############################################################################### # -# System hook: enable using ucep settings. +# System hook: enable using UEP settings. # # Trigger: # export USING_UEP=1 From 5bc90cafb7c8bea43dc83a2d530830891464920e Mon Sep 17 00:00:00 2001 From: zhenhuang12 Date: Fri, 6 Feb 2026 10:48:00 +0800 Subject: [PATCH 05/16] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/workflows/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index dbf3fce63..988b856f0 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -68,7 +68,7 @@ RUN cd /opt && \ git checkout ${UCCL_COMMIT} && \ cd ep && python3 setup.py build && cd .. && \ cp ep/build/**/*.so uccl && \ - pip3 install --no-build-isolation . -v \ + pip3 install --no-build-isolation . -v && \ cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v RUN rm -rf /opt/uccl From b7b4f81bb40a74e0d1d48a4df6510ec50be4dcb4 Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Fri, 6 Feb 2026 07:30:55 +0000 Subject: [PATCH 06/16] fix build uccl-ep --- .github/workflows/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index 988b856f0..28a495259 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -66,7 +66,7 @@ RUN cd /opt && \ git clone https://github.com/uccl-project/uccl.git && \ cd uccl && \ git checkout ${UCCL_COMMIT} && \ - cd ep && python3 setup.py build && cd .. && \ + cd ep && TORCH_CUDA_ARCH_LIST="gfx942;gfx950" python3 setup.py build && cd .. && \ cp ep/build/**/*.so uccl && \ pip3 install --no-build-isolation . -v && \ cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v From a9bbbd759f6154a23ded68c83ba6c8a73710846b Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Sat, 7 Feb 2026 08:19:26 +0000 Subject: [PATCH 07/16] fix --- .github/workflows/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index 28a495259..b36e61324 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -66,7 +66,7 @@ RUN cd /opt && \ git clone https://github.com/uccl-project/uccl.git && \ cd uccl && \ git checkout ${UCCL_COMMIT} && \ - cd ep && TORCH_CUDA_ARCH_LIST="gfx942;gfx950" python3 setup.py build && cd .. && \ + cd ep && TORCH_CUDA_ARCH_LIST="gfx942,gfx950" python3 setup.py build && cd .. && \ cp ep/build/**/*.so uccl && \ pip3 install --no-build-isolation . -v && \ cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v From 7ac8116b4586c60c13d4c8688abc20a0a66e4789 Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Sat, 7 Feb 2026 11:21:18 +0000 Subject: [PATCH 08/16] skip build uccl for jax --- .github/workflows/docker/Dockerfile | 11 ++++++----- examples/run_pretrain.sh | 2 +- runner/helpers/hooks/04_rebuild_uccl.sh | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index b36e61324..52e090486 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -60,18 +60,19 @@ RUN cd /opt && \ RUN rm -rf /opt/Primus-Turbo # --------------------------------------------------------------------------- -# Install UCCL-EP +# Install UCCL-EP (skip for jax framework) # --------------------------------------------------------------------------- -RUN cd /opt && \ +RUN if [ "$PRIMUS_TURBO_FRAMEWORK" != "jax" ]; then \ + cd /opt && \ git clone https://github.com/uccl-project/uccl.git && \ cd uccl && \ git checkout ${UCCL_COMMIT} && \ cd ep && TORCH_CUDA_ARCH_LIST="gfx942,gfx950" python3 setup.py build && cd .. && \ cp ep/build/**/*.so uccl && \ pip3 install --no-build-isolation . -v && \ - cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v - -RUN rm -rf /opt/uccl + cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v && \ + rm -rf /opt/uccl; \ + fi # Set the default working directory WORKDIR /opt diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index 644a5109c..364375495 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -399,7 +399,7 @@ if [ "$REBUILD_UCCL" == "1" ]; then git clone https://github.com/uccl-project/uccl.git cd uccl || exit - cd ep && PYTORCH_ROCM_ARCH="gfx942;gfx950" python3 setup.py build && cd .. + cd ep && TORCH_CUDA_ARCH_LIST="gfx942,gfx950" python3 setup.py build && cd .. cp ep/build/**/*.so uccl pip3 install --no-build-isolation . cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v diff --git a/runner/helpers/hooks/04_rebuild_uccl.sh b/runner/helpers/hooks/04_rebuild_uccl.sh index 75122b0a2..d5690a591 100644 --- a/runner/helpers/hooks/04_rebuild_uccl.sh +++ b/runner/helpers/hooks/04_rebuild_uccl.sh @@ -21,7 +21,7 @@ fi UCCL_DIR="/tmp/uccl" UCCL_BUILD_DIR="${UCCL_BUILD_DIR:-/tmp/uccl_${HOSTNAME:-$(hostname)}}" UCCL_REF="${UCCL_REF:-}" -GPU_ARCHS="${GPU_ARCHS:-gfx942;gfx950}" +GPU_ARCHS="${GPU_ARCHS:-gfx942,gfx950}" LOG_INFO_RANK0 "[hook system] REBUILD_UCCL=1 → Building uccl in /tmp " LOG_INFO_RANK0 " Build directory : ${UCCL_BUILD_DIR}" @@ -47,7 +47,7 @@ if [[ -n "$UCCL_REF" ]]; then fi LOG_INFO_RANK0 "[hook system] Building uccl ep" -cd ep && PYTORCH_ROCM_ARCH="${GPU_ARCHS}" python3 setup.py build && cd .. +cd ep && TORCH_CUDA_ARCH_LIST="${GPU_ARCHS}" python3 setup.py build && cd .. LOG_INFO_RANK0 "[hook system] Building uccl ep done" From 116a9e1d4aa1110b0e3e3d3f7fdbaa55a81c9422 Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Sat, 7 Feb 2026 11:24:15 +0000 Subject: [PATCH 09/16] add test case --- tests/trainer/test_megatron_trainer.py | 34 ++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/trainer/test_megatron_trainer.py b/tests/trainer/test_megatron_trainer.py index ad3acc280..7a665690a 100644 --- a/tests/trainer/test_megatron_trainer.py +++ b/tests/trainer/test_megatron_trainer.py @@ -324,6 +324,40 @@ def test_turbo_deepep(self): ], ) + def test_deepseekv2_lite_uep(self): + run_script( + self.__class__.__name__, + "deepseekv2_lite_uep", + exp_path="examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml", + env_override={"USING_UEP": "1"}, + extra_args=[ + "--num_layers", + "4", + "--train_iters", + "3", + "--micro_batch_size", + "1", + "--global_batch_size", + "8", + "--moe_layer_freq", + "1", + "--expert_model_parallel_size", + "8", + "--use_turbo_deepep", + "1", + "--enable_primus_turbo", + "1", + "--moe_router_dtype", + "fp32", + "--moe_shared_expert_overlap", + "0", + "--moe_use_legacy_grouped_gemm", + "1", + "--turbo_sync_free_moe_stage", + "3", + ], + ) + class TestMegatronTrainerDeterministic(PrimusUT): def __init__(self, *args, **kwargs): From b41a3f24fdb8b53b58c92ac88930ff629749c503 Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Sat, 7 Feb 2026 11:27:50 +0000 Subject: [PATCH 10/16] fix --- .github/workflows/docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index 52e090486..340b7553d 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -60,9 +60,9 @@ RUN cd /opt && \ RUN rm -rf /opt/Primus-Turbo # --------------------------------------------------------------------------- -# Install UCCL-EP (skip for jax framework) +# Install UCCL-EP (skip for JAX framework) # --------------------------------------------------------------------------- -RUN if [ "$PRIMUS_TURBO_FRAMEWORK" != "jax" ]; then \ +RUN if [ "$PRIMUS_TURBO_FRAMEWORK" != "JAX" ]; then \ cd /opt && \ git clone https://github.com/uccl-project/uccl.git && \ cd uccl && \ From b7380411922e75b0ef7e58ff69fa9c168e82368e Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Mon, 9 Feb 2026 09:27:42 +0000 Subject: [PATCH 11/16] add uccl network settings --- examples/run_local_pretrain.sh | 3 +++ examples/run_pretrain.sh | 30 ++++++++++++++++++++++++- runner/helpers/hooks/04_rebuild_uccl.sh | 4 +--- runner/helpers/hooks/05_using_uep.sh | 27 ++++++++++++++++++++++ tests/trainer/test_megatron_trainer.py | 2 +- 5 files changed, 61 insertions(+), 5 deletions(-) diff --git a/examples/run_local_pretrain.sh b/examples/run_local_pretrain.sh index 38813585b..da4d132aa 100755 --- a/examples/run_local_pretrain.sh +++ b/examples/run_local_pretrain.sh @@ -82,6 +82,9 @@ done < <(env | grep "^HIPBLASLT_") while IFS='=' read -r name _; do ENV_ARGS+=("--env" "$name") done < <(env | grep "^PRIMUS_") +while IFS='=' read -r name _; do + ENV_ARGS+=("--env" "$name") +done < <(env | grep "^UCCL_") while IFS='=' read -r name _; do ENV_ARGS+=("--env" "$name") done < <(env | grep "^NCCL_") diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index 364375495..aa2a34a57 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -399,7 +399,7 @@ if [ "$REBUILD_UCCL" == "1" ]; then git clone https://github.com/uccl-project/uccl.git cd uccl || exit - cd ep && TORCH_CUDA_ARCH_LIST="gfx942,gfx950" python3 setup.py build && cd .. + cd ep && python3 setup.py build && cd .. cp ep/build/**/*.so uccl pip3 install --no-build-isolation . cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v @@ -426,6 +426,34 @@ if [ "$USING_UEP" == "1" ]; then export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP LOG_INFO "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" + + + # network settings for UCCL + export UCCL_IB_GID_INDEX=${UCCL_IB_GID_INDEX:-$NCCL_IB_GID_INDEX} + export UCCL_IB_HCA=${UCCL_IB_HCA:-$NCCL_IB_HCA} + export UCCL_SOCKET_IFNAME=${UCCL_SOCKET_IFNAME:-$NCCL_SOCKET_IFNAME} + + # set low latency and normal inflight and bytes to avoid hang on AMD Pollara AI NIC and Broadcom Thor-2 + if [ "$USING_AINIC" == "1" ]; then + export UCCL_IB_MAX_INFLIGHT_NORMAL=${UCCL_IB_MAX_INFLIGHT_NORMAL:-1} + export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY=${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1} + export UCCL_IB_MAX_INFLIGHT_BYTES=${UCCL_IB_MAX_INFLIGHT_BYTES:-4194304} # 4MB + elif [ "$REBUILD_BNXT" == "1" ]; then # Broadcom Thor-2 + # FIXME(zhuang12): use `USING_BNXT` for Broadcom Thor-2 maybe better than `REBUILD_BNXT` + export UCCL_IB_MAX_INFLIGHT_NORMAL=${UCCL_IB_MAX_INFLIGHT_NORMAL:-1} + export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY=${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1} + export UCCL_IB_MAX_INFLIGHT_BYTES=${UCCL_IB_MAX_INFLIGHT_BYTES:-1572864} + fi + + + LOG_INFO "==========UCCL Network Settings==========" + LOG_INFO "UCCL_IB_GID_INDEX: $UCCL_IB_GID_INDEX" + LOG_INFO "UCCL_IB_HCA: $UCCL_IB_HCA" + LOG_INFO "UCCL_SOCKET_IFNAME: $UCCL_SOCKET_IFNAME" + LOG_INFO "UCCL_IB_MAX_INFLIGHT_NORMAL: $UCCL_IB_MAX_INFLIGHT_NORMAL" + LOG_INFO "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: $UCCL_IB_MAX_INFLIGHT_LOW_LATENCY" + LOG_INFO "UCCL_IB_MAX_INFLIGHT_BYTES: $UCCL_IB_MAX_INFLIGHT_BYTES" + LOG_INFO "" else export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO LOG_INFO "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" diff --git a/runner/helpers/hooks/04_rebuild_uccl.sh b/runner/helpers/hooks/04_rebuild_uccl.sh index d5690a591..f71af14c5 100644 --- a/runner/helpers/hooks/04_rebuild_uccl.sh +++ b/runner/helpers/hooks/04_rebuild_uccl.sh @@ -21,11 +21,9 @@ fi UCCL_DIR="/tmp/uccl" UCCL_BUILD_DIR="${UCCL_BUILD_DIR:-/tmp/uccl_${HOSTNAME:-$(hostname)}}" UCCL_REF="${UCCL_REF:-}" -GPU_ARCHS="${GPU_ARCHS:-gfx942,gfx950}" LOG_INFO_RANK0 "[hook system] REBUILD_UCCL=1 → Building uccl in /tmp " LOG_INFO_RANK0 " Build directory : ${UCCL_BUILD_DIR}" -LOG_INFO_RANK0 " GPU_ARCHS : ${GPU_ARCHS}" if [ -d "$UCCL_DIR" ]; then LOG_INFO_RANK0 "[hook system] Found existed uccl in /tmp, remove it" @@ -47,7 +45,7 @@ if [[ -n "$UCCL_REF" ]]; then fi LOG_INFO_RANK0 "[hook system] Building uccl ep" -cd ep && TORCH_CUDA_ARCH_LIST="${GPU_ARCHS}" python3 setup.py build && cd .. +cd ep && python3 setup.py build && cd .. LOG_INFO_RANK0 "[hook system] Building uccl ep done" diff --git a/runner/helpers/hooks/05_using_uep.sh b/runner/helpers/hooks/05_using_uep.sh index f42935aab..089a2dbb3 100644 --- a/runner/helpers/hooks/05_using_uep.sh +++ b/runner/helpers/hooks/05_using_uep.sh @@ -29,6 +29,33 @@ if [ "$USING_UEP" == "1" ]; then export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP LOG_INFO "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" + + # network settings for UCCL + export UCCL_IB_GID_INDEX=${UCCL_IB_GID_INDEX:-$NCCL_IB_GID_INDEX} + export UCCL_IB_HCA=${UCCL_IB_HCA:-$NCCL_IB_HCA} + export UCCL_SOCKET_IFNAME=${UCCL_SOCKET_IFNAME:-$NCCL_SOCKET_IFNAME} + + # set low latency and normal inflight and bytes to avoid hang on AMD Pollara AI NIC and Broadcom Thor-2 + if [ "$USING_AINIC" == "1" ]; then + export UCCL_IB_MAX_INFLIGHT_NORMAL=${UCCL_IB_MAX_INFLIGHT_NORMAL:-1} + export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY=${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1} + export UCCL_IB_MAX_INFLIGHT_BYTES=${UCCL_IB_MAX_INFLIGHT_BYTES:-4194304} # 4MB + elif [ "$REBUILD_BNXT" == "1" ]; then # Broadcom Thor-2 + # FIXME(zhuang12): use `USING_BNXT` for Broadcom Thor-2 maybe better than `REBUILD_BNXT` + export UCCL_IB_MAX_INFLIGHT_NORMAL=${UCCL_IB_MAX_INFLIGHT_NORMAL:-1} + export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY=${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1} + export UCCL_IB_MAX_INFLIGHT_BYTES=${UCCL_IB_MAX_INFLIGHT_BYTES:-1572864} + fi + + + LOG_INFO "==========UCCL Network Settings==========" + LOG_INFO "UCCL_IB_GID_INDEX: $UCCL_IB_GID_INDEX" + LOG_INFO "UCCL_IB_HCA: $UCCL_IB_HCA" + LOG_INFO "UCCL_SOCKET_IFNAME: $UCCL_SOCKET_IFNAME" + LOG_INFO "UCCL_IB_MAX_INFLIGHT_NORMAL: $UCCL_IB_MAX_INFLIGHT_NORMAL" + LOG_INFO "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: $UCCL_IB_MAX_INFLIGHT_LOW_LATENCY" + LOG_INFO "UCCL_IB_MAX_INFLIGHT_BYTES: $UCCL_IB_MAX_INFLIGHT_BYTES" + LOG_INFO "" else export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO LOG_INFO "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" diff --git a/tests/trainer/test_megatron_trainer.py b/tests/trainer/test_megatron_trainer.py index 7a665690a..1cf239be5 100644 --- a/tests/trainer/test_megatron_trainer.py +++ b/tests/trainer/test_megatron_trainer.py @@ -329,7 +329,7 @@ def test_deepseekv2_lite_uep(self): self.__class__.__name__, "deepseekv2_lite_uep", exp_path="examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml", - env_override={"USING_UEP": "1"}, + env_override={"USING_UEP": "1", "REBUILD_UCCL": "1"}, extra_args=[ "--num_layers", "4", From f1241174ff5180b245212733b0425463c8821b60 Mon Sep 17 00:00:00 2001 From: zhuang12 Date: Mon, 9 Feb 2026 09:37:51 +0000 Subject: [PATCH 12/16] modify as sugguestion --- examples/run_pretrain.sh | 6 +++--- primus/modules/trainer/megatron/utils.py | 4 ++-- runner/helpers/hooks/05_using_uep.sh | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index aa2a34a57..0c0f2efa0 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -413,12 +413,12 @@ fi if [ "$USING_UEP" == "1" ]; then LOG_INFO "USING_UEP is enabled, checking required packages..." - if ! pip show uccl &>/dev/null || ! pip show deep_ep &>/dev/null; then + if ! python3 -m pip show uccl &>/dev/null || ! python3 -m pip show deep_ep &>/dev/null; then LOG_ERROR "uccl is not installed! Please use pre-installed primus image or set REBUILD_UCCL=1." exit 1 fi - LOG_INFO "uccl package is installed: $(pip show uccl | grep Version)" - LOG_INFO "deep_ep package is installed: $(pip show deep_ep | grep Version)" + LOG_INFO "uccl package is installed: $(python3 -m pip show uccl | grep Version)" + LOG_INFO "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)" if [ "$ENABLE_NUMA_BINDING" != "1" ]; then LOG_INFO "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly." diff --git a/primus/modules/trainer/megatron/utils.py b/primus/modules/trainer/megatron/utils.py index c28d4e212..47736333b 100644 --- a/primus/modules/trainer/megatron/utils.py +++ b/primus/modules/trainer/megatron/utils.py @@ -538,7 +538,7 @@ def validate_args_on_rocm(args): ), "DeepEP only supports float32 probs, please set `moe_router_dtype=fp32`" if ( args.expert_model_parallel_size >= 16 - and os.getenv("PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND") == "TURBO" + and os.getenv("PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND", "DEEP_EP") == "TURBO" ): - # Turbo DeepEP is not supported CUs > 32 when using internode dispatch/combine. + # Turbo DeepEP is not supported for CUs > 32 when using internode dispatch/combine. assert args.turbo_deepep_num_cu <= 32, "Set `turbo_deepep_num_cu<=32` when using ep_size >= 16." diff --git a/runner/helpers/hooks/05_using_uep.sh b/runner/helpers/hooks/05_using_uep.sh index 089a2dbb3..e45026383 100644 --- a/runner/helpers/hooks/05_using_uep.sh +++ b/runner/helpers/hooks/05_using_uep.sh @@ -16,12 +16,12 @@ if [ "$USING_UEP" == "1" ]; then LOG_INFO "USING_UEP is enabled, checking required packages..." - if ! pip show uccl &>/dev/null || ! pip show deep_ep &>/dev/null; then + if ! python3 -m pip show uccl &>/dev/null || ! python3 -m pip show deep_ep &>/dev/null; then LOG_ERROR "uccl is not installed! Please use pre-installed primus image or set REBUILD_UCCL=1." exit 1 fi - LOG_INFO "uccl package is installed: $(pip show uccl | grep Version)" - LOG_INFO "deep_ep package is installed: $(pip show deep_ep | grep Version)" + LOG_INFO "uccl package is installed: $(python3 -m pip show uccl | grep Version)" + LOG_INFO "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)" if [ "$ENABLE_NUMA_BINDING" != "1" ]; then LOG_WARN "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly." From c802da09035d4b6ebe6134218b1c4ef39e623b49 Mon Sep 17 00:00:00 2001 From: zhenhuang12 Date: Sat, 28 Feb 2026 07:23:25 +0000 Subject: [PATCH 13/16] nits --- examples/run_pretrain.sh | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index 0c0f2efa0..2c78542d6 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -386,14 +386,14 @@ fi # ----------------- Rebuild UCCL ----------------- export REBUILD_UCCL=${REBUILD_UCCL:-0} if [ "$REBUILD_UCCL" == "1" ]; then - LOG_INFO "Rebuilding UCCL from source..." + LOG_INFO_RANK0 "Rebuilding UCCL from source..." apt update && apt install -y rdma-core libibverbs-dev libnuma-dev libgoogle-glog-dev mkdir -p "/workspace/" cd "/workspace" || exit # Clean up old directory if exists to avoid git clone conflicts if [ -d "uccl" ]; then - LOG_INFO "Removing existing uccl directory..." + LOG_INFO_RANK0 "Removing existing uccl directory..." rm -rf uccl fi @@ -404,28 +404,28 @@ if [ "$REBUILD_UCCL" == "1" ]; then pip3 install --no-build-isolation . cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v cd "${PRIMUS_PATH}" || exit - LOG_INFO "Rebuilding UCCL from source done." + LOG_INFO_RANK0 "Rebuilding UCCL from source done." else - LOG_INFO "Skip UCCL rebuild. REBUILD_UCCL=$REBUILD_UCCL" + LOG_INFO_RANK0 "Skip UCCL rebuild. REBUILD_UCCL=$REBUILD_UCCL" fi # ----------------- Using UCCL-EP ----------------- if [ "$USING_UEP" == "1" ]; then - LOG_INFO "USING_UEP is enabled, checking required packages..." + LOG_INFO_RANK0 "USING_UEP is enabled, checking required packages..." if ! python3 -m pip show uccl &>/dev/null || ! python3 -m pip show deep_ep &>/dev/null; then LOG_ERROR "uccl is not installed! Please use pre-installed primus image or set REBUILD_UCCL=1." exit 1 fi - LOG_INFO "uccl package is installed: $(python3 -m pip show uccl | grep Version)" - LOG_INFO "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)" + LOG_INFO_RANK0 "uccl package is installed: $(python3 -m pip show uccl | grep Version)" + LOG_INFO_RANK0 "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)" if [ "$ENABLE_NUMA_BINDING" != "1" ]; then - LOG_INFO "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly." + LOG_INFO_RANK0 "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly." fi export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP - LOG_INFO "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" + LOG_INFO_RANK0 "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" # network settings for UCCL @@ -446,17 +446,17 @@ if [ "$USING_UEP" == "1" ]; then fi - LOG_INFO "==========UCCL Network Settings==========" - LOG_INFO "UCCL_IB_GID_INDEX: $UCCL_IB_GID_INDEX" - LOG_INFO "UCCL_IB_HCA: $UCCL_IB_HCA" - LOG_INFO "UCCL_SOCKET_IFNAME: $UCCL_SOCKET_IFNAME" - LOG_INFO "UCCL_IB_MAX_INFLIGHT_NORMAL: $UCCL_IB_MAX_INFLIGHT_NORMAL" - LOG_INFO "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: $UCCL_IB_MAX_INFLIGHT_LOW_LATENCY" - LOG_INFO "UCCL_IB_MAX_INFLIGHT_BYTES: $UCCL_IB_MAX_INFLIGHT_BYTES" - LOG_INFO "" + LOG_INFO_RANK0 "==========UCCL Network Settings==========" + LOG_INFO_RANK0 "UCCL_IB_GID_INDEX: $UCCL_IB_GID_INDEX" + LOG_INFO_RANK0 "UCCL_IB_HCA: $UCCL_IB_HCA" + LOG_INFO_RANK0 "UCCL_SOCKET_IFNAME: $UCCL_SOCKET_IFNAME" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_NORMAL: $UCCL_IB_MAX_INFLIGHT_NORMAL" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: $UCCL_IB_MAX_INFLIGHT_LOW_LATENCY" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_BYTES: $UCCL_IB_MAX_INFLIGHT_BYTES" + LOG_INFO_RANK0 "" else export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO - LOG_INFO "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" + LOG_INFO_RANK0 "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" fi # nvte debug envs From 144a1032ec57e0cd2480122e90dfe3c6c5efdd0d Mon Sep 17 00:00:00 2001 From: zhenhuang12 Date: Sat, 28 Feb 2026 07:26:33 +0000 Subject: [PATCH 14/16] nits --- runner/helpers/hooks/05_using_uep.sh | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/runner/helpers/hooks/05_using_uep.sh b/runner/helpers/hooks/05_using_uep.sh index e45026383..2bd384851 100644 --- a/runner/helpers/hooks/05_using_uep.sh +++ b/runner/helpers/hooks/05_using_uep.sh @@ -14,21 +14,21 @@ if [ "$USING_UEP" == "1" ]; then - LOG_INFO "USING_UEP is enabled, checking required packages..." + LOG_INFO_RANK0 "USING_UEP is enabled, checking required packages..." if ! python3 -m pip show uccl &>/dev/null || ! python3 -m pip show deep_ep &>/dev/null; then LOG_ERROR "uccl is not installed! Please use pre-installed primus image or set REBUILD_UCCL=1." exit 1 fi - LOG_INFO "uccl package is installed: $(python3 -m pip show uccl | grep Version)" - LOG_INFO "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)" + LOG_INFO_RANK0 "uccl package is installed: $(python3 -m pip show uccl | grep Version)" + LOG_INFO_RANK0 "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)" if [ "$ENABLE_NUMA_BINDING" != "1" ]; then LOG_WARN "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly." fi export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP - LOG_INFO "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" + LOG_INFO_RANK0 "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP" # network settings for UCCL export UCCL_IB_GID_INDEX=${UCCL_IB_GID_INDEX:-$NCCL_IB_GID_INDEX} @@ -48,15 +48,15 @@ if [ "$USING_UEP" == "1" ]; then fi - LOG_INFO "==========UCCL Network Settings==========" - LOG_INFO "UCCL_IB_GID_INDEX: $UCCL_IB_GID_INDEX" - LOG_INFO "UCCL_IB_HCA: $UCCL_IB_HCA" - LOG_INFO "UCCL_SOCKET_IFNAME: $UCCL_SOCKET_IFNAME" - LOG_INFO "UCCL_IB_MAX_INFLIGHT_NORMAL: $UCCL_IB_MAX_INFLIGHT_NORMAL" - LOG_INFO "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: $UCCL_IB_MAX_INFLIGHT_LOW_LATENCY" - LOG_INFO "UCCL_IB_MAX_INFLIGHT_BYTES: $UCCL_IB_MAX_INFLIGHT_BYTES" - LOG_INFO "" + LOG_INFO_RANK0 "==========UCCL Network Settings==========" + LOG_INFO_RANK0 "UCCL_IB_GID_INDEX: $UCCL_IB_GID_INDEX" + LOG_INFO_RANK0 "UCCL_IB_HCA: $UCCL_IB_HCA" + LOG_INFO_RANK0 "UCCL_SOCKET_IFNAME: $UCCL_SOCKET_IFNAME" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_NORMAL: $UCCL_IB_MAX_INFLIGHT_NORMAL" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: $UCCL_IB_MAX_INFLIGHT_LOW_LATENCY" + LOG_INFO_RANK0 "UCCL_IB_MAX_INFLIGHT_BYTES: $UCCL_IB_MAX_INFLIGHT_BYTES" + LOG_INFO_RANK0 "" else export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO - LOG_INFO "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" + LOG_INFO_RANK0 "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO" fi From 5d4b2b2c87d0b46ec360f5b913f7643c5f8eaa1a Mon Sep 17 00:00:00 2001 From: zhenhuang12 Date: Mon, 2 Mar 2026 02:12:56 +0000 Subject: [PATCH 15/16] nits --- .github/workflows/docker/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index 340b7553d..a1e419483 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -67,7 +67,8 @@ RUN if [ "$PRIMUS_TURBO_FRAMEWORK" != "JAX" ]; then \ git clone https://github.com/uccl-project/uccl.git && \ cd uccl && \ git checkout ${UCCL_COMMIT} && \ - cd ep && TORCH_CUDA_ARCH_LIST="gfx942,gfx950" python3 setup.py build && cd .. && \ + cd ep && \ + DISABLE_AGGRESSIVE_ATOMIC=0 TORCH_CUDA_ARCH_LIST="${HCC_AMDGPU_TARGET}" python3 setup.py build && cd .. && \ cp ep/build/**/*.so uccl && \ pip3 install --no-build-isolation . -v && \ cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v && \ From a114249c1eac65a5d2e2b43e0c3fcaa567fadfe9 Mon Sep 17 00:00:00 2001 From: zhenhuang12 Date: Mon, 2 Mar 2026 02:13:33 +0000 Subject: [PATCH 16/16] nits --- .github/workflows/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile index a1e419483..6c50591ab 100644 --- a/.github/workflows/docker/Dockerfile +++ b/.github/workflows/docker/Dockerfile @@ -68,7 +68,7 @@ RUN if [ "$PRIMUS_TURBO_FRAMEWORK" != "JAX" ]; then \ cd uccl && \ git checkout ${UCCL_COMMIT} && \ cd ep && \ - DISABLE_AGGRESSIVE_ATOMIC=0 TORCH_CUDA_ARCH_LIST="${HCC_AMDGPU_TARGET}" python3 setup.py build && cd .. && \ + DISABLE_AGGRESSIVE_ATOMIC=0 TORCH_CUDA_ARCH_LIST="gfx942,gfx950" python3 setup.py build && cd .. && \ cp ep/build/**/*.so uccl && \ pip3 install --no-build-isolation . -v && \ cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v && \