From 1ec487a4eb3d8416eca72dc7668a8de53eda523a Mon Sep 17 00:00:00 2001 From: JohnQinAMD Date: Sat, 7 Feb 2026 20:14:06 +0000 Subject: [PATCH 1/2] update config for deepseek models --- .../MI355X/deepseek_v2-BF16-pretrain.yaml | 1 + .../deepseek_v2_lite-BF16-pretrain.yaml | 8 ++-- .../MI355X/deepseek_v3-BF16-pretrain.yaml | 15 ++++++- examples/megatron/prepare.py | 2 +- examples/run_local_pretrain.sh | 8 ++-- examples/run_pretrain.sh | 11 ++--- examples/run_slurm_pretrain.sh | 15 ++++++- run_dsv2_lite.sh | 16 ++++++++ run_dsv3.sh | 40 +++++++++++++++++++ start_training_dsv2_lite.sh | 33 +++++++++++++++ start_training_dsv3.sh | 33 +++++++++++++++ 11 files changed, 166 insertions(+), 16 deletions(-) create mode 100644 run_dsv2_lite.sh create mode 100644 run_dsv3.sh create mode 100755 start_training_dsv2_lite.sh create mode 100755 start_training_dsv3.sh diff --git a/examples/megatron/configs/MI355X/deepseek_v2-BF16-pretrain.yaml b/examples/megatron/configs/MI355X/deepseek_v2-BF16-pretrain.yaml index 987b0d019..93da1774f 100644 --- a/examples/megatron/configs/MI355X/deepseek_v2-BF16-pretrain.yaml +++ b/examples/megatron/configs/MI355X/deepseek_v2-BF16-pretrain.yaml @@ -13,6 +13,7 @@ modules: overrides: # log wandb_project: "Primus_DeepSeek_Pretrain" + wandb_disable: false stderr_sink_level: DEBUG # debug diff --git a/examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml b/examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml index b706c159f..94005325c 100644 --- a/examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml +++ b/examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml @@ -13,6 +13,8 @@ modules: overrides: # log wandb_project: "Primus_DeepSeek_Pretrain" + disable_wandb: false + disable_tensorboard: false stderr_sink_level: DEBUG # debug @@ -52,7 +54,7 @@ modules: valid_data_path: null test_data_path: null - moe_use_legacy_grouped_gemm: false + moe_use_legacy_grouped_gemm: true # MLA 
multi_latent_attention: true @@ -71,12 +73,12 @@ modules: eval_iters: 0 # Turbo - enable_primus_turbo: true + enable_primus_turbo: false use_turbo_attention: false use_turbo_grouped_mlp: false # deepep - use_turbo_deepep: true + use_turbo_deepep: false moe_shared_expert_overlap: false moe_router_dtype: fp32 diff --git a/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml b/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml index 51dd8affa..67d6dc955 100644 --- a/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml +++ b/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml @@ -13,6 +13,8 @@ modules: overrides: # log wandb_project: "Primus_DeepSeek_Pretrain" + disable_wandb: false + disable_tensorboard: false stderr_sink_level: DEBUG # debug @@ -22,7 +24,7 @@ modules: # hyper parameters train_iters: 50 - micro_batch_size: 4 + micro_batch_size: 1 global_batch_size: 256 seq_length: ${PRIMUS_SEQ_LENGTH:4096} max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096} @@ -46,13 +48,18 @@ modules: overlap_param_gather: true gradient_accumulation_fusion: false + # recompute + recompute_granularity: full # full, selective + recompute_method: block # uniform, block + recompute_num_layers: 61 # int + # data mock_data: true train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:null} valid_data_path: null test_data_path: null - moe_use_legacy_grouped_gemm: true + moe_use_legacy_grouped_gemm: false # legacy grouped gemm must be disabled for dsv3; enabling it hangs the job # MLA multi_latent_attention: true @@ -71,6 +78,10 @@ modules: ckpt_format: torch eval_iters: 0 + # Turbo + enable_primus_turbo: false + use_turbo_attention: false + use_turbo_grouped_mlp: false # Cross entropy flags # cross_entropy_fusion_impl: "te" # cross_entropy_loss_fusion: true diff --git a/examples/megatron/prepare.py b/examples/megatron/prepare.py index f7d7eddd2..8fe7b067a 100644 --- a/examples/megatron/prepare.py +++ b/examples/megatron/prepare.py @@ -264,7 
+264,7 @@ def build_megatron_helper(primus_path: Path, patch_args: Path, backend_path: str emerging_optimizers_path = primus_path / "third_party/Emerging-Optimizers" log_info(f"Building Emerging Optimizers in {emerging_optimizers_path}") - ret = subprocess.run(["pip", "install", "-e", str(emerging_optimizers_path)], check=True) + ret = subprocess.run(["pip", "install", "--no-build-isolation", "-e", str(emerging_optimizers_path)], check=True) if ret.returncode != 0: log_error_and_exit("Building Emerging Optimizers failed.") diff --git a/examples/run_local_pretrain.sh b/examples/run_local_pretrain.sh index 4cb36fdb3..537f908a5 100755 --- a/examples/run_local_pretrain.sh +++ b/examples/run_local_pretrain.sh @@ -134,10 +134,10 @@ export CLEAN_DOCKER_CONTAINER=${CLEAN_DOCKER_CONTAINER:-0} # ------------------ Optional Container Cleanup ------------------ docker_podman_proxy() { - if command -v podman &>/dev/null; then - podman "$@" - elif command -v docker &>/dev/null; then + if command -v docker &>/dev/null; then docker "$@" + elif command -v podman &>/dev/null; then + podman "$@" else echo "Neither Docker nor Podman found!" >&2 return 1 @@ -163,7 +163,7 @@ if [[ "${SKIP_TRAIN:-0}" == "1" ]]; then else echo "Node-${NODE_RANK}: Launching training container." fi - +docker stop $(docker ps -aq) || true # ------------------ Launch Training Container ------------------ docker_podman_proxy run --rm \ --env MASTER_ADDR \ diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index dd23ec02d..04e9af842 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -178,7 +178,7 @@ if [ "$USING_AINIC" == "1" ]; then export ANP_HOME_DIR=${ANP_HOME_DIR:-"/opt/amd-anp"} export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/opt/rccl"} export MPI_HOME_DIR=${MPI_HOME_DIR:-"/opt/ompi"} - export NCCL_NET_PLUGIN=librccl-anp.so + # export NCCL_NET_PLUGIN=librccl-anp.so # this for anp version 1.1.0-5. 
LOG_INFO_RANK0 "Using AINIC" LOG_INFO_RANK0 "RCCL_HOME_DIR: $RCCL_HOME_DIR" @@ -189,8 +189,8 @@ if [ "$USING_AINIC" == "1" ]; then export NCCL_IB_GID_INDEX=1 # export NCCL_IB_ROCE_VERSION_NUM=2 export NCCL_MAX_P2P_CHANNELS=56 - export NCCL_IB_TC=104 - export NCCL_IB_FIFO_TC=192 + export NCCL_IB_TC=41 + export NCCL_IB_FIFO_TC=185 export NET_OPTIONAL_RECV_COMPLETION=1 export NCCL_IB_USE_INLINE=1 export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0 @@ -199,8 +199,9 @@ if [ "$USING_AINIC" == "1" ]; then export NCCL_IGNORE_CPU_AFFINITY=1 export NCCL_IB_QPS_PER_CONNECTION=1 - export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/lib:$LD_LIBRARY_PATH - + #export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/lib:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/install/lib:$LD_LIBRARY_PATH + export LD_PRELOAD=${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0 else export NCCL_IB_GID_INDEX=3 fi diff --git a/examples/run_slurm_pretrain.sh b/examples/run_slurm_pretrain.sh index 04da35a4d..bb4381304 100755 --- a/examples/run_slurm_pretrain.sh +++ b/examples/run_slurm_pretrain.sh @@ -45,15 +45,28 @@ srun -N "${NNODES}" \ --cpus-per-task="${CPUS_PER_TASK:-128}" \ bash -c " readarray -t node_array < <(scontrol show hostnames \"\$SLURM_JOB_NODELIST\") + # Get IP address of master node from ens9np0 interface + MASTER_NODE=\${node_array[0]} + if [ \"\$SLURM_NODEID\" = \"0\" ]; then + # We are on the master node, get IP directly + MASTER_IP=\$(ip addr show ens9np0 | grep 'inet ' | awk '{print \$2}' | cut -d/ -f1) + else + # Query the master node via ssh + MASTER_IP=\$(ssh \$MASTER_NODE \"ip addr show ens9np0 | grep 'inet ' | awk 
'{print \\\$2}' | cut -d/ -f1\") + fi if [ \"\$SLURM_NODEID\" = \"0\" ]; then echo \"========== Slurm cluster info ==========\" echo \"SLURM_NODELIST: \${node_array[*]}\" echo \"SLURM_NNODES: \${SLURM_NNODES}\" echo \"SLURM_GPUS_ON_NODE: \${SLURM_GPUS_ON_NODE}\" + echo \"MASTER_NODE: \$MASTER_NODE\" + echo \"MASTER_ADDR (IP): \$MASTER_IP\" echo \"\" fi - export MASTER_ADDR=\${node_array[0]} + export MASTER_ADDR=\${MASTER_IP} export MASTER_PORT=\${MASTER_PORT} + export GLOO_SOCKET_IFNAME=ens9np0 + export NCCL_SOCKET_IFNAME=ens9np0 export NNODES=\${SLURM_NNODES} export NODE_RANK=\${SLURM_PROCID} export GPUS_PER_NODE=\${SLURM_GPUS_ON_NODE} diff --git a/run_dsv2_lite.sh b/run_dsv2_lite.sh new file mode 100644 index 000000000..a9d861cd8 --- /dev/null +++ b/run_dsv2_lite.sh @@ -0,0 +1,16 @@ +#!/bin/bash +export HF_TOKEN="your_hf_token" # change it to your own hf token +export WANDB_API_KEY="your_wandb_api_key" # change it to your own wandb api key +export DOCKER_IMAGE=primus_kernel_benchmark:backup +export NNODES=4 +export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9 +export GLOBAL_BATCH_SIZE=$((96 * NNODES)) +export ANP_HOME_DIR=${ANP_HOME_DIR:-"/workspace/ainic/amd-anp"} +export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/workspace/ainic/rccl"} +export MPI_HOME_DIR=${MPI_HOME_DIR:-"/workspace/ainic/ompi-4.1.6"} +export EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml +export USING_AINIC=1 +export GLOO_SOCKET_IFNAME=ens9np0 +export NCCL_SOCKET_IFNAME=ens9np0 +export PRIMUS_DETERMINISTIC=0 +bash ./examples/run_slurm_pretrain.sh --global_batch_size $GLOBAL_BATCH_SIZE --train_iters 50 --debug \ No newline at end of file diff --git a/run_dsv3.sh b/run_dsv3.sh new file mode 100644 index 000000000..626fe4d2d --- /dev/null +++ b/run_dsv3.sh @@ -0,0 +1,40 @@ +#!/bin/bash +export HF_TOKEN="your_hf_token" # change it to your own hf token +export WANDB_API_KEY="your_wandb_api_key" # change it to your own wandb api key +export 
DOCKER_IMAGE=primus_kernel_benchmark:backup +export NNODES=8 +export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9 +export MICRO_BATCH_SIZE=1 +export GLOBAL_BATCH_SIZE=$((64 * NNODES)) +export ANP_HOME_DIR=${ANP_HOME_DIR:-"/workspace/ainic/amd-anp"} +export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/workspace/ainic/rccl"} +export MPI_HOME_DIR=${MPI_HOME_DIR:-"/workspace/ainic/ompi-4.1.6"} +export EXP=examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml +export USING_AINIC=1 +export PRIMUS_TP=1 +export PRIMUS_PP=8 +export PRIMUS_EP=8 +export PRIMUS_VPP=1 +export TOTAL_ITERS=50 +export PRIMUS_TOTAL_LAYERS=61 +export PRIMUS_RECOMPUTE_LAYERS=8 +export PRIMUS_MOE_LAYER_FREQ=1 +export GLOO_SOCKET_IFNAME=ens9np0 +export NCCL_SOCKET_IFNAME=ens9np0 +export PRIMUS_DETERMINISTIC=0 +bash ./examples/run_slurm_pretrain.sh \ +--mtp_num_layers 0 \ +--manual_gc True \ +--manual_gc_interval 1 \ +--pp_warmup True \ +--mock_data True \ +--decoder_last_pipeline_num_layers 5 \ +--micro_batch_size $MICRO_BATCH_SIZE --global_batch_size $GLOBAL_BATCH_SIZE --train_iters $TOTAL_ITERS \ +--tensor_model_parallel_size $PRIMUS_TP \ +--pipeline_model_parallel_size $PRIMUS_PP \ +--expert_model_parallel_size $PRIMUS_EP \ +--num_layers $PRIMUS_TOTAL_LAYERS --recompute_num_layers $PRIMUS_RECOMPUTE_LAYERS --moe_layer_freq $PRIMUS_MOE_LAYER_FREQ + +# --manual_gc True \ +# --manual_gc_interval 1 \ +# --pp_warmup True \ \ No newline at end of file diff --git a/start_training_dsv2_lite.sh b/start_training_dsv2_lite.sh new file mode 100755 index 000000000..85562bf7c --- /dev/null +++ b/start_training_dsv2_lite.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Start training with W&B loss logging on rank-0 + +echo "=== Starting DeepSeek V2 Lite Training ===" +echo "This script will:" +echo "1. Allocate 8 nodes using SLURM" +echo "2. Run training with W&B logging enabled on rank-0" +echo "" + +# Load Docker image if not already present +if ! 
docker images --format '{{.Repository}}:{{.Tag}}' | grep -q 'primus_kernel_benchmark:backup'; then + echo "Loading Docker image..." + docker load -i /data/john/primus_kernel_benchmark_backup.tar +else + echo "Docker image already loaded, skipping." +fi + +# Clean old output (optional - comment out if you want to keep old runs) +echo "Cleaning old output directory..." +rm -rf /data/john/Primus/output/amd/root/deepseek_v2_lite-pretrain/* 2>/dev/null + +# Allocate nodes and run training +echo "Allocating 8 nodes and starting training..." +salloc -N 4 \ + --exclude=GPU-73 \ + --ntasks-per-node=1 \ + --cpus-per-task=128 \ + --exclusive --mem=0 \ + --job-name=qyy_test \ + --time=12:00:00 \ + bash -c "cd /data/john/Primus && bash run_dsv2_lite.sh" + +echo "Training completed or allocation ended." diff --git a/start_training_dsv3.sh b/start_training_dsv3.sh new file mode 100755 index 000000000..f6134f171 --- /dev/null +++ b/start_training_dsv3.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Start training with W&B loss logging on rank-0 + +echo "=== Starting DeepSeek V3 Training ===" +echo "This script will:" +echo "1. Allocate 8 nodes using SLURM" +echo "2. Run training with W&B logging enabled on rank-0" +echo "" + +# Load Docker image if not already present +if ! docker images --format '{{.Repository}}:{{.Tag}}' | grep -q 'primus_kernel_benchmark:backup'; then + echo "Loading Docker image..." + docker load -i /data/john/primus_kernel_benchmark_backup.tar +else + echo "Docker image already loaded, skipping." +fi + +# Clean old output (optional - comment out if you want to keep old runs) +echo "Cleaning old output directory..." +sudo rm -rf /data/john/Primus/output/amd/root/deepseek_v3-pretrain/* 2>/dev/null + +# Allocate nodes and run training +echo "Allocating 8 nodes (excluding GPU-20,GPU-73) and starting training..." 
+salloc -N 8 \ + --exclude=GPU-73 \ + --ntasks-per-node=1 \ + --cpus-per-task=128 \ + --exclusive --mem=0 \ + --job-name=qyy_test \ + --time=12:00:00 \ + bash -c 'cd /data/john/Primus && bash run_dsv3.sh' + +echo "Training completed or allocation ended." From 9f70c5ae9b4434d7fe8afa0cf382619dcaac4471 Mon Sep 17 00:00:00 2001 From: JohnQinAMD Date: Mon, 9 Feb 2026 22:40:10 +0000 Subject: [PATCH 2/2] fix ssh issue for master address --- examples/run_local_pretrain.sh | 5 +- examples/run_slurm_pretrain.sh | 4 +- prepare_c4_data.sh | 144 +++++++++++++++++++++++++++++++++ run_dsv2_lite.sh | 7 +- run_dsv3.sh | 14 ++-- start_training_dsv2_lite.sh | 41 ++++++---- start_training_dsv3.sh | 44 ++++++---- 7 files changed, 214 insertions(+), 45 deletions(-) create mode 100644 prepare_c4_data.sh diff --git a/examples/run_local_pretrain.sh b/examples/run_local_pretrain.sh index 537f908a5..2a3e856aa 100755 --- a/examples/run_local_pretrain.sh +++ b/examples/run_local_pretrain.sh @@ -118,6 +118,9 @@ if [[ -f "$PATH_TO_BNXT_TAR_PACKAGE" ]]; then VOLUME_ARGS+=(-v "$PATH_TO_BNXT_TAR_PACKAGE":"$PATH_TO_BNXT_TAR_PACKAGE") fi +if [[ -n "${DOCKER_MOUNT_PATH:-}" ]]; then + VOLUME_ARGS+=(-v "${DOCKER_MOUNT_PATH}":"${DOCKER_MOUNT_PATH}") +fi # using ainic if [ "$USING_AINIC" == "1" ]; then ENV_ARGS+=("--env" "USING_AINIC") @@ -125,7 +128,7 @@ if [ "$USING_AINIC" == "1" ]; then ENV_ARGS+=("--env" "ANP_HOME_DIR") ENV_ARGS+=("--env" "MPI_HOME_DIR") - # VOLUME_ARGS+=(-v /mnt/shared:/mnt/shared) + # VOLUME_ARGS+=(-v /shared:/shared) # VOLUME_ARGS+=(-v /etc/libibverbs.d/:/etc/libibverbs.d:ro) # VOLUME_ARGS+=(-v /usr/lib/x86_64-linux-gnu/libibverbs/:/usr/lib/x86_64-linux-gnu/libibverbs/:ro) fi diff --git a/examples/run_slurm_pretrain.sh b/examples/run_slurm_pretrain.sh index bb4381304..c76604205 100755 --- a/examples/run_slurm_pretrain.sh +++ b/examples/run_slurm_pretrain.sh @@ -51,8 +51,8 @@ srun -N "${NNODES}" \ # We are on the master node, get IP directly MASTER_IP=\$(ip addr show ens9np0 | grep 
'inet ' | awk '{print \$2}' | cut -d/ -f1) else - # Query the master node via ssh - MASTER_IP=\$(ssh \$MASTER_NODE \"ip addr show ens9np0 | grep 'inet ' | awk '{print \\\$2}' | cut -d/ -f1\") + # Resolve master node IP via DNS (no SSH needed) + MASTER_IP=\$(getent hosts \$MASTER_NODE | awk '{print \$1}') fi if [ \"\$SLURM_NODEID\" = \"0\" ]; then echo \"========== Slurm cluster info ==========\" diff --git a/prepare_c4_data.sh b/prepare_c4_data.sh new file mode 100644 index 000000000..79aee27e5 --- /dev/null +++ b/prepare_c4_data.sh @@ -0,0 +1,144 @@ +#!/bin/bash +############################################################################### +# Prepare C4 English dataset for Megatron training with DeepSeek V3 +# +# This script: +# 1. Downloads C4-en data from HuggingFace (configurable amount) +# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/allenai/c4 +# cd c4 +# git lfs pull --include "en/*" +# 2. Converts to JSONL format +# 3. Tokenizes into Megatron .bin/.idx format using DeepSeekV3Tokenizer +# +# Usage: +# bash prepare_c4_data.sh [--num_shards N] [--data_dir /path/to/data] +# +# By default uses 200 shards (1 shard is ~350MB compressed, ~3M documents); pass --num_shards 1 for a quick test. +# Full C4-en has 1024 shards. Adjust --num_shards for more data. 
+############################################################################### + +set -e + +# ======================== Configuration ======================== +NUM_SHARDS=${NUM_SHARDS:-200} # Number of C4 shards to download (1-1024) +DATA_DIR=${DATA_DIR:-"/shared/c4"} +PRIMUS_PATH=${PRIMUS_PATH:-"/shared/john/Primus"} +TOKENIZER_TYPE="DeepSeekV3Tokenizer" +TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3" +WORKERS=${WORKERS:-$(nproc)} # Number of preprocessing workers +HF_TOKEN=${HF_TOKEN:-"your_hf_token"} # Set your HuggingFace token + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --num_shards) NUM_SHARDS="$2"; shift 2;; + --data_dir) DATA_DIR="$2"; shift 2;; + --workers) WORKERS="$2"; shift 2;; + *) echo "Unknown option: $1"; exit 1;; + esac +done + +# ======================== Paths ======================== +export RAW_DIR="${DATA_DIR}/en" # Pre-downloaded shards live here +export JSONL_DIR="${DATA_DIR}/jsonl" +export TOKENIZED_DIR="${DATA_DIR}/tokenized" +export TRAIN_OUTPUT_PREFIX="${TOKENIZED_DIR}/c4_en_train" +export NUM_SHARDS + +mkdir -p "$RAW_DIR" "$JSONL_DIR" "$TOKENIZED_DIR" + +echo "============================================" +echo "C4 English Data Preparation" +echo "============================================" +echo "NUM_SHARDS: ${NUM_SHARDS} (out of 1024 total)" +echo "DATA_DIR: ${DATA_DIR}" +echo "PRIMUS_PATH: ${PRIMUS_PATH}" +echo "TOKENIZER: ${TOKENIZER_TYPE} / ${TOKENIZER_MODEL}" +echo "WORKERS: ${WORKERS}" +echo "============================================" + +# ======================== Step 1: Merge shards into JSONL ======================== +echo "" +echo ">>> Step 1: Merging C4 English shards into JSONL (${NUM_SHARDS} shards)..." +echo " (Download skipped — using pre-downloaded shards in ${RAW_DIR})" + +JSONL_FILE="${JSONL_DIR}/c4_en_train.jsonl" + +if [ -f "${JSONL_FILE}" ]; then + echo "JSONL file already exists: ${JSONL_FILE}" + echo "Skipping merge. Delete it to re-merge." 
+else + # Verify shards exist + MISSING=0 + for i in $(seq 0 $((NUM_SHARDS - 1))); do + SHARD_NAME=$(printf "c4-train.%05d-of-01024.json.gz" "$i") + if [ ! -f "${RAW_DIR}/${SHARD_NAME}" ]; then + echo " WARNING: Missing shard ${SHARD_NAME}" + MISSING=$((MISSING + 1)) + fi + done + if [ "$MISSING" -gt 0 ]; then + echo "ERROR: ${MISSING} shard(s) missing in ${RAW_DIR}. Cannot proceed." + exit 1 + fi + + echo "Decompressing and merging shards into JSONL ..." + for i in $(seq 0 $((NUM_SHARDS - 1))); do + SHARD_NAME=$(printf "c4-train.%05d-of-01024.json.gz" "$i") + SHARD_PATH="${RAW_DIR}/${SHARD_NAME}" + echo " [${i}/${NUM_SHARDS}] Decompressing ${SHARD_NAME} ..." + zcat "${SHARD_PATH}" >> "${JSONL_FILE}" + done + + DOC_COUNT=$(wc -l < "${JSONL_FILE}") + echo "Done! Total documents: ${DOC_COUNT}" + echo "Saved to: ${JSONL_FILE}" +fi + +echo ">>> Step 1 complete." + +# ======================== Step 2: Tokenize ======================== +echo "" +echo ">>> Step 2: Tokenizing with ${TOKENIZER_TYPE}..." + +JSONL_FILE="${JSONL_DIR}/c4_en_train.jsonl" + +if [ -f "${TRAIN_OUTPUT_PREFIX}_text_document.bin" ] && [ -f "${TRAIN_OUTPUT_PREFIX}_text_document.idx" ]; then + echo "Tokenized files already exist:" + echo " ${TRAIN_OUTPUT_PREFIX}_text_document.bin" + echo " ${TRAIN_OUTPUT_PREFIX}_text_document.idx" + echo "Skipping tokenization. Delete them to re-tokenize." +else + # Need to set up Python path for Megatron imports + export PYTHONPATH="${PRIMUS_PATH}/third_party/Megatron-LM:${PRIMUS_PATH}:${PYTHONPATH:-}" + + python3 "${PRIMUS_PATH}/examples/megatron/preprocess_data.py" \ + --input "${JSONL_FILE}" \ + --tokenizer-type "${TOKENIZER_TYPE}" \ + --tokenizer-model "${TOKENIZER_MODEL}" \ + --output-prefix "${TRAIN_OUTPUT_PREFIX}" \ + --workers "${WORKERS}" \ + --append-eod \ + --partitions 1 + + echo ">>> Step 2 complete." 
+fi + +# ======================== Summary ======================== +echo "" +echo "============================================" +echo "Data preparation complete!" +echo "============================================" +echo "" +echo "Tokenized data files:" +ls -lh "${TOKENIZED_DIR}/" +echo "" +echo "To use this data for training, set in run_dsv3.sh:" +echo "" +echo " 1. Change: --mock_data True → --mock_data False" +echo " 2. Add env: export PRIMUS_TOKENIZED_DATA_PATH=${TRAIN_OUTPUT_PREFIX}_text_document" +echo "" +echo "Or pass directly via environment variable before running:" +echo " export PRIMUS_TOKENIZED_DATA_PATH=${TRAIN_OUTPUT_PREFIX}_text_document" +echo "" +echo "============================================" diff --git a/run_dsv2_lite.sh b/run_dsv2_lite.sh index a9d861cd8..46a203a93 100644 --- a/run_dsv2_lite.sh +++ b/run_dsv2_lite.sh @@ -1,8 +1,7 @@ #!/bin/bash -export HF_TOKEN="your_hf_token" # change it to your own hf token -export WANDB_API_KEY="your_wandb_api_key" # change it to your own wandb api key -export DOCKER_IMAGE=primus_kernel_benchmark:backup -export NNODES=4 +export HF_TOKEN="your_hf_token" # make it your own hf token +export WANDB_API_KEY="your_wandb_api_key" # make it your own wandb api key +export DOCKER_IMAGE=john132/tas:primus-25.9-ainic-56 export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9 export GLOBAL_BATCH_SIZE=$((96 * NNODES)) export ANP_HOME_DIR=${ANP_HOME_DIR:-"/workspace/ainic/amd-anp"} diff --git a/run_dsv3.sh b/run_dsv3.sh index 626fe4d2d..5df3b9a54 100644 --- a/run_dsv3.sh +++ b/run_dsv3.sh @@ -1,8 +1,7 @@ #!/bin/bash -export HF_TOKEN="your_hf_token" # change it to your own hf token -export WANDB_API_KEY="your_wandb_api_key" # change it to your own wandb api key -export DOCKER_IMAGE=primus_kernel_benchmark:backup -export NNODES=8 +export HF_TOKEN="your_hf_token" # make it your own hf token +export WANDB_API_KEY="your_wandb_api_key" # make it your own wandb api key +export 
DOCKER_IMAGE=john132/tas:primus-25.9-ainic-56 export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9 export MICRO_BATCH_SIZE=1 export GLOBAL_BATCH_SIZE=$((64 * NNODES)) @@ -15,19 +14,22 @@ export PRIMUS_TP=1 export PRIMUS_PP=8 export PRIMUS_EP=8 export PRIMUS_VPP=1 -export TOTAL_ITERS=50 +export TOTAL_ITERS=50000 export PRIMUS_TOTAL_LAYERS=61 export PRIMUS_RECOMPUTE_LAYERS=8 export PRIMUS_MOE_LAYER_FREQ=1 export GLOO_SOCKET_IFNAME=ens9np0 export NCCL_SOCKET_IFNAME=ens9np0 export PRIMUS_DETERMINISTIC=0 +export DOCKER_MOUNT_PATH=/shared # mount path for the docker container; the training data lives under this path +# export DATA_PATH=/shared/c4/data +export PRIMUS_TOKENIZED_DATA_PATH=/shared/c4/tokenized/c4_en_train_text_document # this is the tokenized data path for the training bash ./examples/run_slurm_pretrain.sh \ --mtp_num_layers 0 \ --manual_gc True \ --manual_gc_interval 1 \ --pp_warmup True \ ---mock_data True \ +--mock_data False \ --decoder_last_pipeline_num_layers 5 \ --micro_batch_size $MICRO_BATCH_SIZE --global_batch_size $GLOBAL_BATCH_SIZE --train_iters $TOTAL_ITERS \ --tensor_model_parallel_size $PRIMUS_TP \ diff --git a/start_training_dsv2_lite.sh b/start_training_dsv2_lite.sh index 85562bf7c..1e594c979 100755 --- a/start_training_dsv2_lite.sh +++ b/start_training_dsv2_lite.sh @@ -1,33 +1,42 @@ #!/bin/bash # Start training with W&B loss logging on rank-0 - +export NNODES=8 echo "=== Starting DeepSeek V2 Lite Training ===" echo "This script will:" -echo "1. Allocate 8 nodes using SLURM" +echo "1. Allocate ${NNODES} nodes using SLURM" echo "2. Run training with W&B logging enabled on rank-0" echo "" -# Load Docker image if not already present -if ! docker images --format '{{.Repository}}:{{.Tag}}' | grep -q 'primus_kernel_benchmark:backup'; then - echo "Loading Docker image..." - docker load -i /data/john/primus_kernel_benchmark_backup.tar -else - echo "Docker image already loaded, skipping." 
-fi - # Clean old output (optional - comment out if you want to keep old runs) echo "Cleaning old output directory..." -rm -rf /data/john/Primus/output/amd/root/deepseek_v2_lite-pretrain/* 2>/dev/null +rm -rf output/amd/root/deepseek_v2_lite-pretrain/* 2>/dev/null # Allocate nodes and run training -echo "Allocating 8 nodes and starting training..." -salloc -N 4 \ - --exclude=GPU-73 \ +echo "Allocating ${NNODES} nodes and starting training..." +salloc -N ${NNODES} \ --ntasks-per-node=1 \ --cpus-per-task=128 \ --exclusive --mem=0 \ - --job-name=qyy_test \ + --job-name=dsv2_lite_test \ --time=12:00:00 \ - bash -c "cd /data/john/Primus && bash run_dsv2_lite.sh" + --partition=amd-slc \ + bash -c ' + echo "Loading Docker image on all nodes..." + srun --ntasks-per-node=1 bash -c " + if ! docker images --format \"{{.Repository}}:{{.Tag}}\" | grep -q \"john132/tas:primus-25.9-ainic-56\"; then + if [ -f /shared/primus-25.9-ainic-56.tar ]; then + echo \"[\$(hostname)] Loading Docker image from tar...\" + docker load -i /shared/primus-25.9-ainic-56.tar + else + echo \"[\$(hostname)] Tar file not found, pulling Docker image...\" + docker pull john132/tas:primus-25.9-ainic-56 + fi + else + echo \"[\$(hostname)] Docker image already loaded, skipping.\" + fi + " + echo "Docker image loaded on all nodes. Starting training..." + bash run_dsv2_lite.sh + ' echo "Training completed or allocation ended." diff --git a/start_training_dsv3.sh b/start_training_dsv3.sh index f6134f171..5f62830f0 100755 --- a/start_training_dsv3.sh +++ b/start_training_dsv3.sh @@ -1,33 +1,45 @@ #!/bin/bash # Start training with W&B loss logging on rank-0 +export NNODES=24 # modify the number of nodes here + echo "=== Starting DeepSeek V3 Training ===" echo "This script will:" -echo "1. Allocate 8 nodes using SLURM" +echo "1. Allocate ${NNODES} nodes using SLURM" echo "2. Run training with W&B logging enabled on rank-0" echo "" -# Load Docker image if not already present -if ! 
docker images --format '{{.Repository}}:{{.Tag}}' | grep -q 'primus_kernel_benchmark:backup'; then - echo "Loading Docker image..." - docker load -i /data/john/primus_kernel_benchmark_backup.tar -else - echo "Docker image already loaded, skipping." -fi - # Clean old output (optional - comment out if you want to keep old runs) echo "Cleaning old output directory..." -sudo rm -rf /data/john/Primus/output/amd/root/deepseek_v3-pretrain/* 2>/dev/null +rm -rf output/amd/root/deepseek_v3-pretrain/* 2>/dev/null # Allocate nodes and run training -echo "Allocating 8 nodes (excluding GPU-20,GPU-73) and starting training..." -salloc -N 8 \ - --exclude=GPU-73 \ +echo "Allo cating ${NNODES} nodes and starting training..." +salloc -N ${NNODES} \ --ntasks-per-node=1 \ --cpus-per-task=128 \ --exclusive --mem=0 \ - --job-name=qyy_test \ - --time=12:00:00 \ - bash -c 'cd /data/john/Primus && bash run_dsv3.sh' + --job-name=dsv3_test \ + --time=100-00:00:00 \ + --partition=amd-slc \ + bash -c ' + echo "Loading Docker image on all nodes..." + srun --ntasks-per-node=1 bash -c " + if ! docker images --format \"{{.Repository}}:{{.Tag}}\" | grep -q \"john132/tas:primus-25.9-ainic-56\"; then + if [ -f /shared/primus-25.9-ainic-56.tar ]; then + echo \"[\$(hostname)] Loading Docker image from tar...\" + docker load -i /shared/primus-25.9-ainic-56.tar + else + echo \"[\$(hostname)] Tar file not found, pulling Docker image...\" + docker pull john132/tas:primus-25.9-ainic-56 + fi + else + echo \"[\$(hostname)] Docker image already loaded, skipping.\" + fi + " + echo "Docker image loaded on all nodes. Starting training..." + bash run_dsv3.sh + ' echo "Training completed or allocation ended." +