diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2057e6b67..40ef7643c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -175,192 +175,192 @@ jobs: # docker rmi tasimage/primus:${{env.IMAGE_TAG}}-jax echo "> build-docker success" - run-unittest-torch: - env: - PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/torch - needs: [code-lint] - runs-on: [primus-lm-cicd-torch-j8knc] - steps: - - run: echo "🎉 Begin Primus-Turbo Checkout." - - name: Set commit hash to env - run: echo "PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT}" >> $GITHUB_ENV - - name: Checkout Repo Primus-Turbo - uses: actions/checkout@v4 - with: - repository: AMD-AIG-AIMA/Primus-Turbo - submodules: "recursive" - path: Primus-Turbo - ref: ${{ env.PRIMUS_TURBO_COMMIT }} - - run: echo "Begin Primus-Turbo Install." - - name: Install Primus-Turbo - run: | - mv Primus-Turbo /tmp/ - echo "Primus-Turbo dir: /tmp/Primus-Turbo" - git config --global --add safe.directory /tmp/Primus-Turbo - cd /tmp/Primus-Turbo - start_time=$(date +%s) - echo "✅ [Pip install requirements] started at: $(date)" - mkdir -p ${PRIMUS_WORKDIR}/primus-cache - MAX_JOBS=128 pip install --cache-dir=${PRIMUS_WORKDIR}/primus-cache --no-build-isolation --no-clean -r requirements.txt - end_time=$(date +%s) - elapsed=$((end_time - start_time)) - echo "✅ [Pip install requirements] ended at: $(date)" - echo "⏱️ [Pip install requirements] Total elapsed time: ${elapsed} seconds" - start_time=$(date +%s) - echo "✅ [build primus-turbo] started at: $(date)" - pip3 install --no-build-isolation -e . -v - end_time=$(date +%s) - elapsed=$((end_time - start_time)) - echo "✅ [build primus-turbo] ended at: $(date)" - echo "⏱️ [build primus-turbo] Total elapsed time: ${elapsed} seconds" - - run: echo "🎉 Begin Primus Unit Test." 
- - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Show Environment Info - run: | - echo "Hostname: $(hostname)" - echo "PWD: $(pwd)" - echo "HOME: $HOME" - echo "GITHUB_WORKSPACE: $GITHUB_WORKSPACE" - echo "Runner Temp Dir: $RUNNER_TEMP" - echo "Runner Tool Cache: $RUNNER_TOOL_CACHE" - - name: Install Primus - run: | - pip install -r requirements.txt - - name: Set UT_LOG_PATH - run: | - ts="$(date +%Y%m%d-%H%M%S)" - commit_id="${GITHUB_SHA::7}" - if [[ "${{ github.event_name }}" == "pull_request" ]]; then - echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/pr-${{ github.event.pull_request.number }}-${ts}-${commit_id}" >> $GITHUB_ENV - elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then - echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/main-${ts}-${commit_id}" >> $GITHUB_ENV - elif [[ "${{ github.event_name }}" == "release" ]]; then - TAG_NAME="${{ github.ref }}" - TAG="${TAG_NAME#refs/tags/}" - echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/${TAG}-${ts}-${commit_id}" >> $GITHUB_ENV - else - echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/others-${ts}-${commit_id}" >> $GITHUB_ENV - fi - - name: Run CLI Shell Tests - run: | - echo "Running Primus CLI shell tests..." - bash ./tests/runner/run_all_tests.sh - - name: Run Primus Core Tests - run: | - echo "Running Primus Core tests..." - # Note: The tests `test_fp8_te_linear` and `test_te_linear` are temporarily skipped due to intermittent failures. 
- pytest --maxfail=1 -s ./tests/unit_tests/ \ - --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_fp8_te_linear \ - --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_te_linear - - name: Run Primus Model Tests -- Megatron-LM - env: - HF_TOKEN: ${{secrets.HF_TOKEN}} - run: | - echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}" - rm -rf "${{ env.UT_LOG_PATH }}" - mkdir -p "${{ env.UT_LOG_PATH }}" - MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ - pytest --maxfail=1 -s ./tests/trainer/test_megatron_trainer.py - - name: Run Primus Model Tests -- TorchTitan - env: - HF_TOKEN: ${{secrets.HF_TOKEN}} - run: | - echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}" - rm -rf "${{ env.UT_LOG_PATH }}" - mkdir -p "${{ env.UT_LOG_PATH }}" - MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ - pytest --maxfail=1 -s ./tests/trainer/test_torchtitan_trainer.py - - name: Clean - run: | - rm -rf ${PRIMUS_WORKDIR}/Primus-Turbo - rm -rf ${PRIMUS_WORKDIR}/Primus + # run-unittest-torch: + # env: + # PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/torch + # needs: [code-lint] + # runs-on: [primus-lm-cicd-torch-j8knc] + # steps: + # - run: echo "🎉 Begin Primus-Turbo Checkout." + # - name: Set commit hash to env + # run: echo "PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT}" >> $GITHUB_ENV + # - name: Checkout Repo Primus-Turbo + # uses: actions/checkout@v4 + # with: + # repository: AMD-AIG-AIMA/Primus-Turbo + # submodules: "recursive" + # path: Primus-Turbo + # ref: ${{ env.PRIMUS_TURBO_COMMIT }} + # - run: echo "Begin Primus-Turbo Install." 
+ # - name: Install Primus-Turbo + # run: | + # mv Primus-Turbo /tmp/ + # echo "Primus-Turbo dir: /tmp/Primus-Turbo" + # git config --global --add safe.directory /tmp/Primus-Turbo + # cd /tmp/Primus-Turbo + # start_time=$(date +%s) + # echo "✅ [Pip install requirements] started at: $(date)" + # mkdir -p ${PRIMUS_WORKDIR}/primus-cache + # MAX_JOBS=128 pip install --cache-dir=${PRIMUS_WORKDIR}/primus-cache --no-build-isolation --no-clean -r requirements.txt + # end_time=$(date +%s) + # elapsed=$((end_time - start_time)) + # echo "✅ [Pip install requirements] ended at: $(date)" + # echo "⏱️ [Pip install requirements] Total elapsed time: ${elapsed} seconds" + # start_time=$(date +%s) + # echo "✅ [build primus-turbo] started at: $(date)" + # pip3 install --no-build-isolation -e . -v + # end_time=$(date +%s) + # elapsed=$((end_time - start_time)) + # echo "✅ [build primus-turbo] ended at: $(date)" + # echo "⏱️ [build primus-turbo] Total elapsed time: ${elapsed} seconds" + # - run: echo "🎉 Begin Primus Unit Test." 
+ # - uses: actions/checkout@v4 + # with: + # submodules: recursive + # - name: Show Environment Info + # run: | + # echo "Hostname: $(hostname)" + # echo "PWD: $(pwd)" + # echo "HOME: $HOME" + # echo "GITHUB_WORKSPACE: $GITHUB_WORKSPACE" + # echo "Runner Temp Dir: $RUNNER_TEMP" + # echo "Runner Tool Cache: $RUNNER_TOOL_CACHE" + # - name: Install Primus + # run: | + # pip install -r requirements.txt + # - name: Set UT_LOG_PATH + # run: | + # ts="$(date +%Y%m%d-%H%M%S)" + # commit_id="${GITHUB_SHA::7}" + # if [[ "${{ github.event_name }}" == "pull_request" ]]; then + # echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/pr-${{ github.event.pull_request.number }}-${ts}-${commit_id}" >> $GITHUB_ENV + # elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then + # echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/main-${ts}-${commit_id}" >> $GITHUB_ENV + # elif [[ "${{ github.event_name }}" == "release" ]]; then + # TAG_NAME="${{ github.ref }}" + # TAG="${TAG_NAME#refs/tags/}" + # echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/${TAG}-${ts}-${commit_id}" >> $GITHUB_ENV + # else + # echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/others-${ts}-${commit_id}" >> $GITHUB_ENV + # fi + # - name: Run CLI Shell Tests + # run: | + # echo "Running Primus CLI shell tests..." + # bash ./tests/runner/run_all_tests.sh + # - name: Run Primus Core Tests + # run: | + # echo "Running Primus Core tests..." + # # Note: The tests `test_fp8_te_linear` and `test_te_linear` are temporarily skipped due to intermittent failures. 
+ # pytest --maxfail=1 -s ./tests/unit_tests/ \ + # --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_fp8_te_linear \ + # --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_te_linear + # - name: Run Primus Model Tests -- Megatron-LM + # env: + # HF_TOKEN: ${{secrets.HF_TOKEN}} + # run: | + # echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}" + # rm -rf "${{ env.UT_LOG_PATH }}" + # mkdir -p "${{ env.UT_LOG_PATH }}" + # MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ + # pytest --maxfail=1 -s ./tests/trainer/test_megatron_trainer.py + # - name: Run Primus Model Tests -- TorchTitan + # env: + # HF_TOKEN: ${{secrets.HF_TOKEN}} + # run: | + # echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}" + # rm -rf "${{ env.UT_LOG_PATH }}" + # mkdir -p "${{ env.UT_LOG_PATH }}" + # MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ + # pytest --maxfail=1 -s ./tests/trainer/test_torchtitan_trainer.py + # - name: Clean + # run: | + # rm -rf ${PRIMUS_WORKDIR}/Primus-Turbo + # rm -rf ${PRIMUS_WORKDIR}/Primus - run-unittest-jax: - env: - PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/jax - needs: [code-lint] - runs-on: [primus-lm-cicd-jax-8t8mh] - steps: - - run: echo "🎉 Begin Primus-Turbo Checkout." - - name: Set commit hash to env - run: echo "PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT}" >> $GITHUB_ENV - - name: Checkout Repo Primus-Turbo - uses: actions/checkout@v4 - with: - repository: AMD-AIG-AIMA/Primus-Turbo - submodules: "recursive" - path: Primus-Turbo - ref: ${{ env.PRIMUS_TURBO_COMMIT }} - - run: echo "Begin Primus-Turbo Install." 
- - name: Install Primus-Turbo - run: | - mv Primus-Turbo /tmp/ - echo "Primus-Turbo dir: /tmp/Primus-Turbo" - git config --global --add safe.directory /tmp/Primus-Turbo - cd /tmp/Primus-Turbo - start_time=$(date +%s) - echo "✅ [Pip install requirements] started at: $(date)" - mkdir -p ${PRIMUS_WORKDIR}/primus-cache - python3 -m pip install --upgrade pip setuptools - pip3 install --cache-dir=${PRIMUS_WORKDIR}/primus-cache --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm7.0 - MAX_JOBS=128 pip3 install --cache-dir=${PRIMUS_WORKDIR}/primus-cache --no-build-isolation --no-clean -r requirements.txt - end_time=$(date +%s) - elapsed=$((end_time - start_time)) - echo "✅ [Pip install requirements] ended at: $(date)" - echo "⏱️ [Pip install requirements] Total elapsed time: ${elapsed} seconds" - start_time=$(date +%s) - echo "✅ [build primus-turbo] started at: $(date)" - PRIMUS_TURBO_FRAMEWORK="JAX" pip3 install --no-build-isolation -e . -v - end_time=$(date +%s) - elapsed=$((end_time - start_time)) - echo "✅ [build primus-turbo] ended at: $(date)" - echo "⏱️ [build primus-turbo] Total elapsed time: ${elapsed} seconds" - - run: echo "🎉 Begin Primus Unit Test." 
- - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Show Environment Info - run: | - echo "Hostname: $(hostname)" - echo "PWD: $(pwd)" - echo "HOME: $HOME" - echo "GITHUB_WORKSPACE: $GITHUB_WORKSPACE" - echo "Runner Temp Dir: $RUNNER_TEMP" - echo "Runner Tool Cache: $RUNNER_TOOL_CACHE" - - name: Install Primus - run: | - pip install -r requirements-jax.txt - - name: Set UT_LOG_PATH - run: | - ts="$(date +%Y%m%d-%H%M%S)" - commit_id="${GITHUB_SHA::7}" - if [[ "${{ github.event_name }}" == "pull_request" ]]; then - echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/pr-${{ github.event.pull_request.number }}-${ts}-${commit_id}" >> $GITHUB_ENV - elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then - echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/main-${ts}-${commit_id}" >> $GITHUB_ENV - elif [[ "${{ github.event_name }}" == "release" ]]; then - TAG_NAME="${{ github.ref }}" - TAG="${TAG_NAME#refs/tags/}" - echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/${TAG}-${ts}-${commit_id}" >> $GITHUB_ENV - else - echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/others-${ts}-${commit_id}" >> $GITHUB_ENV - fi - - name: Run Shell Tests - run: | - echo "Running Primus CLI shell tests..." - bash ./tests/runner/run_all_tests.sh - - name: Run Unit Tests - env: - HF_TOKEN: ${{secrets.HF_TOKEN}} - run: | - echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}" - rm -rf "${{ env.UT_LOG_PATH }}" - mkdir -p "${{ env.UT_LOG_PATH }}" - MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ - JAX_SKIP_UT=1 python ./tests/run_unit_tests.py --jax - - name: Clean - run: | - rm -rf ${PRIMUS_WORKDIR}/Primus-Turbo - rm -rf ${PRIMUS_WORKDIR}/Primus + # run-unittest-jax: + # env: + # PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/jax + # needs: [code-lint] + # runs-on: [primus-lm-cicd-jax-8t8mh] + # steps: + # - run: echo "🎉 Begin Primus-Turbo Checkout." 
+ # - name: Set commit hash to env + # run: echo "PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT}" >> $GITHUB_ENV + # - name: Checkout Repo Primus-Turbo + # uses: actions/checkout@v4 + # with: + # repository: AMD-AIG-AIMA/Primus-Turbo + # submodules: "recursive" + # path: Primus-Turbo + # ref: ${{ env.PRIMUS_TURBO_COMMIT }} + # - run: echo "Begin Primus-Turbo Install." + # - name: Install Primus-Turbo + # run: | + # mv Primus-Turbo /tmp/ + # echo "Primus-Turbo dir: /tmp/Primus-Turbo" + # git config --global --add safe.directory /tmp/Primus-Turbo + # cd /tmp/Primus-Turbo + # start_time=$(date +%s) + # echo "✅ [Pip install requirements] started at: $(date)" + # mkdir -p ${PRIMUS_WORKDIR}/primus-cache + # python3 -m pip install --upgrade pip setuptools + # pip3 install --cache-dir=${PRIMUS_WORKDIR}/primus-cache --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm7.0 + # MAX_JOBS=128 pip3 install --cache-dir=${PRIMUS_WORKDIR}/primus-cache --no-build-isolation --no-clean -r requirements.txt + # end_time=$(date +%s) + # elapsed=$((end_time - start_time)) + # echo "✅ [Pip install requirements] ended at: $(date)" + # echo "⏱️ [Pip install requirements] Total elapsed time: ${elapsed} seconds" + # start_time=$(date +%s) + # echo "✅ [build primus-turbo] started at: $(date)" + # PRIMUS_TURBO_FRAMEWORK="JAX" pip3 install --no-build-isolation -e . -v + # end_time=$(date +%s) + # elapsed=$((end_time - start_time)) + # echo "✅ [build primus-turbo] ended at: $(date)" + # echo "⏱️ [build primus-turbo] Total elapsed time: ${elapsed} seconds" + # - run: echo "🎉 Begin Primus Unit Test." 
+ # - uses: actions/checkout@v4 + # with: + # submodules: recursive + # - name: Show Environment Info + # run: | + # echo "Hostname: $(hostname)" + # echo "PWD: $(pwd)" + # echo "HOME: $HOME" + # echo "GITHUB_WORKSPACE: $GITHUB_WORKSPACE" + # echo "Runner Temp Dir: $RUNNER_TEMP" + # echo "Runner Tool Cache: $RUNNER_TOOL_CACHE" + # - name: Install Primus + # run: | + # pip install -r requirements-jax.txt + # - name: Set UT_LOG_PATH + # run: | + # ts="$(date +%Y%m%d-%H%M%S)" + # commit_id="${GITHUB_SHA::7}" + # if [[ "${{ github.event_name }}" == "pull_request" ]]; then + # echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/pr-${{ github.event.pull_request.number }}-${ts}-${commit_id}" >> $GITHUB_ENV + # elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then + # echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/main-${ts}-${commit_id}" >> $GITHUB_ENV + # elif [[ "${{ github.event_name }}" == "release" ]]; then + # TAG_NAME="${{ github.ref }}" + # TAG="${TAG_NAME#refs/tags/}" + # echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/${TAG}-${ts}-${commit_id}" >> $GITHUB_ENV + # else + # echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/others-${ts}-${commit_id}" >> $GITHUB_ENV + # fi + # - name: Run Shell Tests + # run: | + # echo "Running Primus CLI shell tests..." 
+ # bash ./tests/runner/run_all_tests.sh + # - name: Run Unit Tests + # env: + # HF_TOKEN: ${{secrets.HF_TOKEN}} + # run: | + # echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}" + # rm -rf "${{ env.UT_LOG_PATH }}" + # mkdir -p "${{ env.UT_LOG_PATH }}" + # MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ + # JAX_SKIP_UT=1 python ./tests/run_unit_tests.py --jax + # - name: Clean + # run: | + # rm -rf ${PRIMUS_WORKDIR}/Primus-Turbo + # rm -rf ${PRIMUS_WORKDIR}/Primus diff --git a/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml b/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml index 51dd8affa..dd0c26cdf 100644 --- a/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml +++ b/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml @@ -40,7 +40,7 @@ modules: # parallel tensor_model_parallel_size: ${PRIMUS_TP:1} - pipeline_model_parallel_size: ${PRIMUS_PP:1} + pipeline_model_parallel_size: ${PRIMUS_PP:8} expert_model_parallel_size: ${PRIMUS_EP:8} overlap_grad_reduce: true overlap_param_gather: true @@ -71,6 +71,24 @@ modules: ckpt_format: torch eval_iters: 0 + # Turbo + enable_primus_turbo: true + use_turbo_attention: false + use_turbo_grouped_mlp: false + + # deepep + use_turbo_deepep: true + moe_shared_expert_overlap: false + moe_router_dtype: fp32 + + # 64 or 80 for ep8, 32 for ep16-64 is best practice + turbo_deepep_num_cu: 64 + turbo_deepep_use_comm_stream: false + + # sync-free moe support stage 1-2, 0 means not use sync-free moe + # stage 2 is recommended for better performance + turbo_sync_free_moe_stage: 1 + # Cross entropy flags # cross_entropy_fusion_impl: "te" # cross_entropy_loss_fusion: true diff --git a/examples/megatron/prepare.py b/examples/megatron/prepare.py index f7d7eddd2..464a55643 100644 --- a/examples/megatron/prepare.py +++ b/examples/megatron/prepare.py @@ -264,7 +264,9 @@ def build_megatron_helper(primus_path: Path, patch_args: Path, backend_path: str emerging_optimizers_path = primus_path / 
"third_party/Emerging-Optimizers" log_info(f"Building Emerging Optimizers in {emerging_optimizers_path}") - ret = subprocess.run(["pip", "install", "-e", str(emerging_optimizers_path)], check=True) + ret = subprocess.run( + ["pip", "install", "--no-build-isolation", "-e", str(emerging_optimizers_path)], check=True + ) if ret.returncode != 0: log_error_and_exit("Building Emerging Optimizers failed.") diff --git a/examples/run_local_pretrain.sh b/examples/run_local_pretrain.sh index 4cb36fdb3..688df1be2 100755 --- a/examples/run_local_pretrain.sh +++ b/examples/run_local_pretrain.sh @@ -113,7 +113,7 @@ echo "ENV_ARGS: ${ENV_ARGS[*]}" HOSTNAME=$(hostname) ARGS=("$@") -VOLUME_ARGS=(-v "$PRIMUS_PATH":"$PRIMUS_PATH" -v "$DATA_PATH":"$DATA_PATH") +VOLUME_ARGS=(-v "$PRIMUS_PATH":"$PRIMUS_PATH" -v "$DATA_PATH":"$DATA_PATH" -v "/shared_aig/c4:/shared_aig/c4") if [[ -f "$PATH_TO_BNXT_TAR_PACKAGE" ]]; then VOLUME_ARGS+=(-v "$PATH_TO_BNXT_TAR_PACKAGE":"$PATH_TO_BNXT_TAR_PACKAGE") fi @@ -134,10 +134,10 @@ export CLEAN_DOCKER_CONTAINER=${CLEAN_DOCKER_CONTAINER:-0} # ------------------ Optional Container Cleanup ------------------ docker_podman_proxy() { - if command -v podman &>/dev/null; then - podman "$@" - elif command -v docker &>/dev/null; then + if command -v docker &>/dev/null; then docker "$@" + elif command -v podman &>/dev/null; then + podman "$@" else echo "Neither Docker nor Podman found!" >&2 return 1 @@ -164,6 +164,13 @@ else echo "Node-${NODE_RANK}: Launching training container." fi +if ! docker_podman_proxy image inspect "$DOCKER_IMAGE" &>/dev/null; then + echo "Node-${NODE_RANK}: Image not found locally, pulling $DOCKER_IMAGE..." + docker_podman_proxy pull "$DOCKER_IMAGE" +else + echo "Node-${NODE_RANK}: Image $DOCKER_IMAGE already exists, skipping pull." 
+fi + # ------------------ Launch Training Container ------------------ docker_podman_proxy run --rm \ --env MASTER_ADDR \ diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index 016bc23db..5b3058b74 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -197,8 +197,10 @@ if [ "$USING_AINIC" == "1" ]; then export NCCL_IB_GID_INDEX=1 # export NCCL_IB_ROCE_VERSION_NUM=2 export NCCL_MAX_P2P_CHANNELS=56 - export NCCL_IB_TC=104 - export NCCL_IB_FIFO_TC=192 + # export NCCL_IB_TC=104 + # export NCCL_IB_FIFO_TC=192 + export NCCL_IB_TC=41 + export NCCL_IB_FIFO_TC=185 export NET_OPTIONAL_RECV_COMPLETION=1 export NCCL_IB_USE_INLINE=1 export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0 diff --git a/examples/run_slurm_pretrain.sh b/examples/run_slurm_pretrain.sh index 04da35a4d..0fd004526 100755 --- a/examples/run_slurm_pretrain.sh +++ b/examples/run_slurm_pretrain.sh @@ -38,10 +38,14 @@ export LOG_DIR=${LOG_DIR:-"./output"} LOG_FILE="${LOG_DIR}/log_slurm_pretrain.txt" mkdir -p "$LOG_DIR" + # --nodelist="uswslocpm2m-106-[273,297,310,319,687,732,836,892]" \ srun -N "${NNODES}" \ --exclusive \ --export ALL \ --ntasks-per-node=1 \ + --time="${SLURM_TIME:-07:00:00}" \ + --nodelist="${SLURM_NODELIST:-}" \ + --partition="${SLURM_PARTITION:-amd-aig}" \ --cpus-per-task="${CPUS_PER_TASK:-128}" \ bash -c " readarray -t node_array < <(scontrol show hostnames \"\$SLURM_JOB_NODELIST\") diff --git a/prepare_c4_data.sh b/prepare_c4_data.sh new file mode 100755 index 000000000..79aee27e5 --- /dev/null +++ b/prepare_c4_data.sh @@ -0,0 +1,144 @@ +#!/bin/bash +############################################################################### +# Prepare C4 English dataset for Megatron training with DeepSeek V3 +# +# This script: +# 1. Downloads C4-en data from HuggingFace (configurable amount) +# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/allenai/c4 +# cd c4 +# git lfs pull --include "en/*" +# 2. Converts to JSONL format +# 3. 
Tokenizes into Megatron .bin/.idx format using DeepSeekV3Tokenizer +# +# Usage: +# bash prepare_c4_data.sh [--num_shards N] [--data_dir /path/to/data] +# +# By default processes 200 pre-downloaded shards (override via NUM_SHARDS or --num_shards). +# Full C4-en has 1024 shards. Adjust --num_shards for more data. +############################################################################### + +set -e + +# ======================== Configuration ======================== +NUM_SHARDS=${NUM_SHARDS:-200} # Number of C4 shards to download (1-1024) +DATA_DIR=${DATA_DIR:-"/shared/c4"} +PRIMUS_PATH=${PRIMUS_PATH:-"/shared/john/Primus"} +TOKENIZER_TYPE="DeepSeekV3Tokenizer" +TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3" +WORKERS=${WORKERS:-$(nproc)} # Number of preprocessing workers +HF_TOKEN=${HF_TOKEN:-"your_hf_token"} # Set your HuggingFace token + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --num_shards) NUM_SHARDS="$2"; shift 2;; + --data_dir) DATA_DIR="$2"; shift 2;; + --workers) WORKERS="$2"; shift 2;; + *) echo "Unknown option: $1"; exit 1;; + esac +done + +# ======================== Paths ======================== +export RAW_DIR="${DATA_DIR}/en" # Pre-downloaded shards live here +export JSONL_DIR="${DATA_DIR}/jsonl" +export TOKENIZED_DIR="${DATA_DIR}/tokenized" +export TRAIN_OUTPUT_PREFIX="${TOKENIZED_DIR}/c4_en_train" +export NUM_SHARDS + +mkdir -p "$RAW_DIR" "$JSONL_DIR" "$TOKENIZED_DIR" + +echo "============================================" +echo "C4 English Data Preparation" +echo "============================================" +echo "NUM_SHARDS: ${NUM_SHARDS} (out of 1024 total)" +echo "DATA_DIR: ${DATA_DIR}" +echo "PRIMUS_PATH: ${PRIMUS_PATH}" +echo "TOKENIZER: ${TOKENIZER_TYPE} / ${TOKENIZER_MODEL}" +echo "WORKERS: ${WORKERS}" +echo "============================================" + +# ======================== Step 1: Merge shards into JSONL ======================== +echo "" +echo ">>> Step 1: Merging C4 English shards into JSONL (${NUM_SHARDS} shards)..." 
+echo " (Download skipped — using pre-downloaded shards in ${RAW_DIR})" + +JSONL_FILE="${JSONL_DIR}/c4_en_train.jsonl" + +if [ -f "${JSONL_FILE}" ]; then + echo "JSONL file already exists: ${JSONL_FILE}" + echo "Skipping merge. Delete it to re-merge." +else + # Verify shards exist + MISSING=0 + for i in $(seq 0 $((NUM_SHARDS - 1))); do + SHARD_NAME=$(printf "c4-train.%05d-of-01024.json.gz" "$i") + if [ ! -f "${RAW_DIR}/${SHARD_NAME}" ]; then + echo " WARNING: Missing shard ${SHARD_NAME}" + MISSING=$((MISSING + 1)) + fi + done + if [ "$MISSING" -gt 0 ]; then + echo "ERROR: ${MISSING} shard(s) missing in ${RAW_DIR}. Cannot proceed." + exit 1 + fi + + echo "Decompressing and merging shards into JSONL ..." + for i in $(seq 0 $((NUM_SHARDS - 1))); do + SHARD_NAME=$(printf "c4-train.%05d-of-01024.json.gz" "$i") + SHARD_PATH="${RAW_DIR}/${SHARD_NAME}" + echo " [${i}/${NUM_SHARDS}] Decompressing ${SHARD_NAME} ..." + zcat "${SHARD_PATH}" >> "${JSONL_FILE}" + done + + DOC_COUNT=$(wc -l < "${JSONL_FILE}") + echo "Done! Total documents: ${DOC_COUNT}" + echo "Saved to: ${JSONL_FILE}" +fi + +echo ">>> Step 1 complete." + +# ======================== Step 2: Tokenize ======================== +echo "" +echo ">>> Step 2: Tokenizing with ${TOKENIZER_TYPE}..." + +JSONL_FILE="${JSONL_DIR}/c4_en_train.jsonl" + +if [ -f "${TRAIN_OUTPUT_PREFIX}_text_document.bin" ] && [ -f "${TRAIN_OUTPUT_PREFIX}_text_document.idx" ]; then + echo "Tokenized files already exist:" + echo " ${TRAIN_OUTPUT_PREFIX}_text_document.bin" + echo " ${TRAIN_OUTPUT_PREFIX}_text_document.idx" + echo "Skipping tokenization. Delete them to re-tokenize." 
+else + # Need to set up Python path for Megatron imports + export PYTHONPATH="${PRIMUS_PATH}/third_party/Megatron-LM:${PRIMUS_PATH}:${PYTHONPATH:-}" + + python3 "${PRIMUS_PATH}/examples/megatron/preprocess_data.py" \ + --input "${JSONL_FILE}" \ + --tokenizer-type "${TOKENIZER_TYPE}" \ + --tokenizer-model "${TOKENIZER_MODEL}" \ + --output-prefix "${TRAIN_OUTPUT_PREFIX}" \ + --workers "${WORKERS}" \ + --append-eod \ + --partitions 1 + + echo ">>> Step 2 complete." +fi + +# ======================== Summary ======================== +echo "" +echo "============================================" +echo "Data preparation complete!" +echo "============================================" +echo "" +echo "Tokenized data files:" +ls -lh "${TOKENIZED_DIR}/" +echo "" +echo "To use this data for training, set in run_dsv3.sh:" +echo "" +echo " 1. Change: --mock_data True → --mock_data False" +echo " 2. Add env: export PRIMUS_TOKENIZED_DATA_PATH=${TRAIN_OUTPUT_PREFIX}_text_document" +echo "" +echo "Or pass directly via environment variable before running:" +echo " export PRIMUS_TOKENIZED_DATA_PATH=${TRAIN_OUTPUT_PREFIX}_text_document" +echo "" +echo "============================================" diff --git a/primus/backends/megatron/patches/training_log/print_rank_last_patches.py b/primus/backends/megatron/patches/training_log/print_rank_last_patches.py index 59167db9f..2986c2634 100644 --- a/primus/backends/megatron/patches/training_log/print_rank_last_patches.py +++ b/primus/backends/megatron/patches/training_log/print_rank_last_patches.py @@ -193,12 +193,26 @@ def inject( local_rank = torch.cuda.current_device() r_total, r_used, r_free = get_rocm_smi_mem_info(local_rank) r_ratio = r_used / r_total + + # get the max rocm_mem_usage + usage_tensor = torch.tensor([r_used], device="cuda", dtype=torch.float32) + world_size = torch.distributed.get_world_size() + gathered_usage = [torch.zeros_like(usage_tensor) for _ in range(world_size)] + torch.distributed.all_gather(gathered_usage, 
usage_tensor) + + rocm_mem_usages = [t.item() for t in gathered_usage] + max_usage = max(rocm_mem_usages) + max_rank = rocm_mem_usages.index(max_usage) + rocm_mem_str = ( f" | rocm mem usage/free/total/usage_ratio: " f"{r_used / 1024 ** 3:.2f}GB/" f"{r_free / 1024 ** 3:.2f}GB/" f"{r_total / 1024 ** 3:.2f}GB/" f"{r_ratio * 100:.2f}%" + f" | rank-{max_rank} rocm max mem usage/usage_ratio: " + f"{max_usage / 1024 ** 3:.2f}GB/" + f"{max_usage / r_total * 100:.2f}%" ) # Cache for reuse on non-sampled iterations self._last_rocm_mem_str = rocm_mem_str diff --git a/start_training_dsv2_lite.sh b/start_training_dsv2_lite.sh new file mode 100755 index 000000000..387b0d728 --- /dev/null +++ b/start_training_dsv2_lite.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +export HF_TOKEN="your_hf_token" # make it your own hf token +export WANDB_API_KEY="your_wandb_api_key" # make it your own wandb api key +export DOCKER_IMAGE="docker.io/tasimage/primus:pr-563-ainic" +#export SLURM_TREE_WIDTH=128 + +export NNODES=128 +export SLURM_TIME=07:00:00 +export SLURM_PARTITION=amd-aig + +# export NCCL_DEBUG=INFO +export USING_AINIC=1 +export NCCL_IB_HCA="ionic_0:1,ionic_2:1,ionic_3:1,ionic_4:1,ionic_5:1,ionic_7:1,ionic_8:1,ionic_9:1" +export GLOO_SOCKET_IFNAME=ens9np0 +export NCCL_SOCKET_IFNAME=ens9np0 +export CLEAN_DOCKER_CONTAINER=1 + +export MBS=12 +export GBS=$((96 * NNODES)) +export PROFILE=False +export TURBO_GROUPED_MLP=False +export TURBO_DEEPEEP=True +export LEGACY_GG=True +export PRIMUS_DETERMINISTIC=0 + +# export EXP=examples/megatron/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml +export EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml +export PRIMUS_TEAM=amd +export PRIMUS_USER=tas +export PRIMUS_EXP_NAME=dsv2_lite-pretrain-mbs_$MBS-gbs_$GBS-turbogg_$TURBO_GROUPED_MLP-turbodeepep_$TURBO_DEEPEEP-legacygg_$LEGACY_GG-profile_$PROFILE + +mkdir -p output/$PRIMUS_TEAM/$PRIMUS_USER/$PRIMUS_EXP_NAME +bash ./examples/run_slurm_pretrain.sh \ + --train_iters 10 \ + --disable_wandb 
True \
+    --disable_tensorboard True \
+    --micro_batch_size $MBS \
+    --global_batch_size $GBS \
+    --seq_length 4096 \
+    --max_position_embeddings 4096 \
+    --use_turbo_grouped_mlp $TURBO_GROUPED_MLP \
+    --use_turbo_deepep $TURBO_DEEPEEP \
+    --moe_use_legacy_grouped_gemm $LEGACY_GG \
+    --cross_entropy_fusion_impl "te" \
+    --cross_entropy_loss_fusion True \
+    --profile $PROFILE \
+    --use_pytorch_profiler $PROFILE \
+    --profile_step_end 7 \
+    --profile_step_start 6 \
+    2>&1 | tee output/$PRIMUS_TEAM/$PRIMUS_USER/$PRIMUS_EXP_NAME/log.txt
diff --git a/start_training_dsv3.sh b/start_training_dsv3.sh
new file mode 100755
index 000000000..28390f081
--- /dev/null
+++ b/start_training_dsv3.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+export HF_TOKEN="your_hf_token" # make it your own hf token
+export WANDB_API_KEY="your_wandb_api_key" # make it your own wandb api key
+export DOCKER_IMAGE="docker.io/tasimage/primus:pr-563-ainic"
+# export SLURM_TREE_WIDTH=128
+export NNODES=128 # number of Slurm nodes to request
+export TRAIN_ITERS=5000 # total training iterations (also drives the LR decay span below)
+export SLURM_TIME=48:00:00
+export SLURM_PARTITION=amd-aig
+# export SLURM_NODELIST="uswslocpm2m-106-[005,015,021,030-031,038-039,042,050,056-057,063,069,077,079-080,082,084-086,091-092,122,125,138,142,145,151,176,179-180,185,190,194,196-197,199,212-215,218,220-221,224-226,273,275,281,285,297,310,319,340,346,359-360,362,373,387,392,395,399,419,423,433,442,444-445,448-450,452,454,456-457,472-475,479-481,629,631,635,640,646,656,658,663-664,667,678,681,687,695,103,700,723,732,735,740-741,757,760-761,766,772,781,784,833,841-842,851,857,865,868,883,889,895,899-900,905,1066,1070,1177]"
+# export SLURM_NODELIST="uswslocpm2m-106-[005,015,021,030-031,038-039,042,050,056-057,063,069,077,079-080,082,084-086,091-092,122,125,138,142,145,151,176,179-180,185,190,194,196-197,199,212-215,218,220-221,224-226,273,275,281,285,297,310,319,340,346,359-360,362,373,387,392,395,399,419,423,433,442,444-445,448-450,452,454,456-457,472-475,479-481,629,631,635,640,646,656,658,663-664,667,678,681,687,695-696,700,723,732,735,740-741,757,760-761,766,772,781,784,833,841-842,851,857,865,868,883,889,895,899-900,905,1066,1070,1177]"
+# export SLURM_NODELIST="uswslocpm2m-106-[005,015,021,030-031,038-039,042,050,056-057,063,069,077,079-080,082,084-086,091-092,122,125,138,142,145,151,176,179-180,185,190,194,196-197,199,212-215,218,220-221,224-226,273,275,281,285,297,310,319,340,346,359-360,362,373,387,392,395,399,419,423,433,442,444-445,448-450,452,454,456-457,472-475,479-481,629,631,635,640,646,628,658,663-664,667,678,681,687,695,103,700,723,732,735,740-741,757,760-761,766,772,781,784,833,841-842,851,857,865,868,883,889,895,899-900,905,1066,1070,1177]"
+export SLURM_NODELIST="uswslocpm2m-106-[015,021,030-031,038-039,042,050,056-057,063,069,077,079-080,082,084-086,091-092,103,122,125,138,142,145,151,176,179-180,185,190,194,196-197,199,212-215,218,220-221,224-226,273,275,281,285,297,310,319,340,346,359,362,373,392,395,399,419,423,430,433,442,444-445,448-450,452,454,456-457,472-475,479-481,628-629,631,635,640,646,658,663-664,667,678,681,687,695,700,723,732,735,740-741,757,760-761,766,772,781,784,833,836,841-842,851,857,865,868,883,889,892,895,899-900,905,1066,1070,1177]"
+# export NCCL_DEBUG=INFO
+export USING_AINIC=1
+export NCCL_IB_HCA="ionic_0:1,ionic_2:1,ionic_3:1,ionic_4:1,ionic_5:1,ionic_7:1,ionic_8:1,ionic_9:1"
+export GLOO_SOCKET_IFNAME=ens9np0
+export NCCL_SOCKET_IFNAME=ens9np0
+export CLEAN_DOCKER_CONTAINER=1
+
+export MBS=2
+export GBS=$((128 * NNODES)) # global batch size scales with node count (128 per node)
+export PRIMUS_TOTAL_LAYERS=61 # DeepSeek-V3 has 61 decoder layers
+export PRIMUS_RECOMPUTE_LAYERS=2
+export PRIMUS_MOE_LAYER_FREQ=1
+export PRIMUS_PP=8
+export PRIMUS_EP=8
+export PRIMUS_VPP=2
+export PROFILE=False
+export TURBO_DEEPEEP=False
+export LEGACY_GG=True
+export PRIMUS_DETERMINISTIC=0
+# Enable NUMA binding for better memory locality (increase stability for large models)
+export ENABLE_NUMA_BINDING=1
+export HSA_KERNARG_POOL_SIZE=12582912 # 12 MiB kernarg pool
+# export SLURM_NODELIST="uswslocpm2m-106-[273,297,310,319,687,732,836,892]"
+# export EXP=examples/megatron/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml
+
+
+FEATURE_ARGS=()
+PIPELINE_ARGS=()
+if [ "$PRIMUS_VPP" -gt 1 ]; then
+    case "$PRIMUS_VPP" in
+    2)
+        if [ "$PRIMUS_PP" -eq 4 ]; then
+            # DeepSeek-V3 has 61 decoder layers. For PP4+VPP2 (8 pipeline chunks),
+            # use a balanced split: 8,8,8,8,8,7,7,7.
+            FEATURE_ARGS+=("--pipeline_model_parallel_layout" "'Et*8|t*8|t*8|t*8|t*8|t*7|t*7|t*7,L'")
+        elif [ "$PRIMUS_PP" -eq 8 ]; then
+            # DeepSeek-V3 has 61 decoder layers. For PP8+VPP2 (16 pipeline chunks),
+            # use a balanced split: 4x13 + 3x3 = 61 (13 stages with 4 layers, 3 stages with 3 layers).
+            FEATURE_ARGS+=("--pipeline_model_parallel_layout" "'Et*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*3|t*3|t*3,L'")
+        else
+            echo "Unsupported PRIMUS_PP=${PRIMUS_PP} for PRIMUS_VPP=2. Supported PP values: 4, 8." >&2
+            exit 1
+        fi
+        ;;
+    4)
+        # DeepSeek-V3 has 61 decoder layers. For PP4+VPP4 (16 pipeline chunks),
+        # use a balanced split: 4x13 + 3x3 = 61 (13 stages with 4 layers, 3 stages with 3 layers).
+        FEATURE_ARGS+=("--pipeline_model_parallel_layout" "'Et*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*4|t*3|t*3|t*3,L'")
+        ;;
+    *)
+        echo "Unsupported PRIMUS_VPP=${PRIMUS_VPP}. Supported values in this script: 1, 2, 4." >&2
+        exit 1
+        ;;
+    esac
+else
+    # No virtual pipeline: give the last PP stage an explicit layer count and
+    # spread the remaining layers evenly over the other (PP - 1) stages.
+    if [ -z "${DECODER_LAST_PIPELINE_NUM_LAYERS:-}" ]; then
+        if [ "$PRIMUS_PP" -eq 4 ]; then
+            DECODER_LAST_PIPELINE_NUM_LAYERS=13
+        elif [ "$PRIMUS_PP" -eq 8 ]; then
+            DECODER_LAST_PIPELINE_NUM_LAYERS=12
+        else
+            DECODER_LAST_PIPELINE_NUM_LAYERS=13
+        fi
+    fi
+    export DECODER_LAST_PIPELINE_NUM_LAYERS
+    MIDDLE_PP_SIZE=$((PRIMUS_PP - 1))
+    if [ "$MIDDLE_PP_SIZE" -le 0 ]; then
+        echo "Invalid PRIMUS_PP=${PRIMUS_PP}. PRIMUS_PP must be >= 2 when PRIMUS_VPP <= 1." >&2
+        exit 1
+    fi
+    MIDDLE_STAGE_LAYERS=$((PRIMUS_TOTAL_LAYERS - DECODER_LAST_PIPELINE_NUM_LAYERS))
+    if [ $((MIDDLE_STAGE_LAYERS % MIDDLE_PP_SIZE)) -ne 0 ]; then
+        echo "Invalid split: PRIMUS_TOTAL_LAYERS=${PRIMUS_TOTAL_LAYERS}, DECODER_LAST_PIPELINE_NUM_LAYERS=${DECODER_LAST_PIPELINE_NUM_LAYERS}, PRIMUS_PP=${PRIMUS_PP}. (PRIMUS_TOTAL_LAYERS - DECODER_LAST_PIPELINE_NUM_LAYERS) must be divisible by (PRIMUS_PP - 1)." >&2
+        exit 1
+    fi
+    PIPELINE_ARGS+=("--decoder_last_pipeline_num_layers" "$DECODER_LAST_PIPELINE_NUM_LAYERS")
+fi
+
+export EXP=examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml
+export PRIMUS_TEAM=amd
+export PRIMUS_USER=tas
+export PRIMUS_TOKENIZED_DATA_PATH=/shared_aig/c4/tokenized/c4_en_train_text_document # this is the tokenized data path for the training
+export PRIMUS_EXP_NAME=dsv3-pretrain-mbs_$MBS-gbs_$GBS-PP_$PRIMUS_PP-EP_$PRIMUS_EP-VPP_$PRIMUS_VPP-turbodeepep_$TURBO_DEEPEEP-legacygg_$LEGACY_GG-profile_$PROFILE
+
+#CKPT_DIR=output/$PRIMUS_TEAM/$PRIMUS_USER/$PRIMUS_EXP_NAME/checkpoints
+
+
+mkdir -p output/$PRIMUS_TEAM/$PRIMUS_USER/$PRIMUS_EXP_NAME
+# mkdir -p "$CKPT_DIR"
+bash ./examples/run_slurm_pretrain.sh \
+    --num_layers $PRIMUS_TOTAL_LAYERS \
+    --train_iters $TRAIN_ITERS \
+    --micro_batch_size $MBS \
+    --global_batch_size $GBS \
+    --use_turbo_deepep $TURBO_DEEPEEP \
+    --turbo_sync_free_moe_stage 0 \
+    --lr 2.2e-4 \
+    --min_lr 2.2e-5 \
+    --lr_warmup_iters 200 \
+    --lr_decay_iters $TRAIN_ITERS \
+    --lr_decay_style cosine \
+    --moe_use_legacy_grouped_gemm $LEGACY_GG \
+    --pipeline_model_parallel_size $PRIMUS_PP \
+    --expert_model_parallel_size $PRIMUS_EP \
+    "${PIPELINE_ARGS[@]}" \
+    "${FEATURE_ARGS[@]}" \
+    --cross_entropy_fusion_impl "te" \
+    --cross_entropy_loss_fusion True \
+    --recompute_num_layers $PRIMUS_RECOMPUTE_LAYERS \
+    --recompute_granularity full \
+    --recompute_method block \
+    --disable_last_saving True \
+    --moe_layer_freq $PRIMUS_MOE_LAYER_FREQ \
+    --mock_data False \
+    --manual_gc True \
+    --manual_gc_interval 1 \
+    --pp_warmup True \
+    --mtp_num_layers 0 \
+    --profile $PROFILE \
+    --use_pytorch_profiler $PROFILE \
+    --profile_step_end 7 \
+    --profile_step_start 6 \
+    --disable_wandb False \
+    --disable_tensorboard False \
+    2>&1 | tee output/$PRIMUS_TEAM/$PRIMUS_USER/$PRIMUS_EXP_NAME/log.txt
\ No newline at end of file