diff --git a/examples/megatron/configs/MI355X/deepseek_v2-BF16-pretrain.yaml b/examples/megatron/configs/MI355X/deepseek_v2-BF16-pretrain.yaml index 987b0d019..93da1774f 100644 --- a/examples/megatron/configs/MI355X/deepseek_v2-BF16-pretrain.yaml +++ b/examples/megatron/configs/MI355X/deepseek_v2-BF16-pretrain.yaml @@ -13,6 +13,7 @@ modules: overrides: # log wandb_project: "Primus_DeepSeek_Pretrain" + wandb_disable: false stderr_sink_level: DEBUG # debug diff --git a/examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml b/examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml index b706c159f..94005325c 100644 --- a/examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml +++ b/examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml @@ -13,6 +13,8 @@ modules: overrides: # log wandb_project: "Primus_DeepSeek_Pretrain" + disable_wandb: false + disable_tensorboard: false stderr_sink_level: DEBUG # debug @@ -52,7 +54,7 @@ modules: valid_data_path: null test_data_path: null - moe_use_legacy_grouped_gemm: false + moe_use_legacy_grouped_gemm: true # MLA multi_latent_attention: true @@ -71,12 +73,12 @@ modules: eval_iters: 0 # Turbo - enable_primus_turbo: true + enable_primus_turbo: false use_turbo_attention: false use_turbo_grouped_mlp: false # deepep - use_turbo_deepep: true + use_turbo_deepep: false moe_shared_expert_overlap: false moe_router_dtype: fp32 diff --git a/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml b/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml index 51dd8affa..67d6dc955 100644 --- a/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml +++ b/examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml @@ -13,6 +13,8 @@ modules: overrides: # log wandb_project: "Primus_DeepSeek_Pretrain" + disable_wandb: false + disable_tensorboard: false stderr_sink_level: DEBUG # debug @@ -22,7 +24,7 @@ modules: # hyper parameters train_iters: 50 - micro_batch_size: 4 + 
micro_batch_size: 1 global_batch_size: 256 seq_length: ${PRIMUS_SEQ_LENGTH:4096} max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096} @@ -46,13 +48,18 @@ modules: overlap_param_gather: true gradient_accumulation_fusion: false + # recompute + recompute_granularity: full # full, selective + recompute_method: block # uniform, block + recompute_num_layers: 61 # int + # data mock_data: true train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:null} valid_data_path: null test_data_path: null - moe_use_legacy_grouped_gemm: true + moe_use_legacy_grouped_gemm: false # need to disable legacy grouped gemm for dsv3 as it will hang the job # MLA multi_latent_attention: true @@ -71,6 +78,10 @@ modules: ckpt_format: torch eval_iters: 0 + # Turbo + enable_primus_turbo: false + use_turbo_attention: false + use_turbo_grouped_mlp: false # Cross entropy flags # cross_entropy_fusion_impl: "te" # cross_entropy_loss_fusion: true diff --git a/examples/megatron/prepare.py b/examples/megatron/prepare.py index f7d7eddd2..8fe7b067a 100644 --- a/examples/megatron/prepare.py +++ b/examples/megatron/prepare.py @@ -264,7 +264,7 @@ def build_megatron_helper(primus_path: Path, patch_args: Path, backend_path: str emerging_optimizers_path = primus_path / "third_party/Emerging-Optimizers" log_info(f"Building Emerging Optimizers in {emerging_optimizers_path}") - ret = subprocess.run(["pip", "install", "-e", str(emerging_optimizers_path)], check=True) + ret = subprocess.run(["pip", "install", "--no-build-isolation", "-e", str(emerging_optimizers_path)], check=True) if ret.returncode != 0: log_error_and_exit("Building Emerging Optimizers failed.") diff --git a/examples/run_local_pretrain.sh b/examples/run_local_pretrain.sh index 4cb36fdb3..2a3e856aa 100755 --- a/examples/run_local_pretrain.sh +++ b/examples/run_local_pretrain.sh @@ -118,6 +118,9 @@ if [[ -f "$PATH_TO_BNXT_TAR_PACKAGE" ]]; then VOLUME_ARGS+=(-v "$PATH_TO_BNXT_TAR_PACKAGE":"$PATH_TO_BNXT_TAR_PACKAGE") fi +if [[ -n 
"${DOCKER_MOUNT_PATH:-}" ]]; then + VOLUME_ARGS+=(-v "${DOCKER_MOUNT_PATH}":"${DOCKER_MOUNT_PATH}") +fi # using ainic if [ "$USING_AINIC" == "1" ]; then ENV_ARGS+=("--env" "USING_AINIC") @@ -125,7 +128,7 @@ if [ "$USING_AINIC" == "1" ]; then ENV_ARGS+=("--env" "ANP_HOME_DIR") ENV_ARGS+=("--env" "MPI_HOME_DIR") - # VOLUME_ARGS+=(-v /mnt/shared:/mnt/shared) + # VOLUME_ARGS+=(-v /shared:/shared) # VOLUME_ARGS+=(-v /etc/libibverbs.d/:/etc/libibverbs.d:ro) # VOLUME_ARGS+=(-v /usr/lib/x86_64-linux-gnu/libibverbs/:/usr/lib/x86_64-linux-gnu/libibverbs/:ro) fi @@ -134,10 +137,10 @@ export CLEAN_DOCKER_CONTAINER=${CLEAN_DOCKER_CONTAINER:-0} # ------------------ Optional Container Cleanup ------------------ docker_podman_proxy() { - if command -v podman &>/dev/null; then - podman "$@" - elif command -v docker &>/dev/null; then + if command -v docker &>/dev/null; then docker "$@" + elif command -v podman &>/dev/null; then + podman "$@" else echo "Neither Docker nor Podman found!" >&2 return 1 @@ -163,7 +166,7 @@ if [[ "${SKIP_TRAIN:-0}" == "1" ]]; then else echo "Node-${NODE_RANK}: Launching training container." fi - +docker stop $(docker ps -aq) || true # ------------------ Launch Training Container ------------------ docker_podman_proxy run --rm \ --env MASTER_ADDR \ diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index dd23ec02d..04e9af842 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -178,7 +178,7 @@ if [ "$USING_AINIC" == "1" ]; then export ANP_HOME_DIR=${ANP_HOME_DIR:-"/opt/amd-anp"} export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/opt/rccl"} export MPI_HOME_DIR=${MPI_HOME_DIR:-"/opt/ompi"} - export NCCL_NET_PLUGIN=librccl-anp.so + # export NCCL_NET_PLUGIN=librccl-anp.so # this for anp version 1.1.0-5. 
LOG_INFO_RANK0 "Using AINIC" LOG_INFO_RANK0 "RCCL_HOME_DIR: $RCCL_HOME_DIR" @@ -189,8 +189,8 @@ if [ "$USING_AINIC" == "1" ]; then export NCCL_IB_GID_INDEX=1 # export NCCL_IB_ROCE_VERSION_NUM=2 export NCCL_MAX_P2P_CHANNELS=56 - export NCCL_IB_TC=104 - export NCCL_IB_FIFO_TC=192 + export NCCL_IB_TC=41 + export NCCL_IB_FIFO_TC=185 export NET_OPTIONAL_RECV_COMPLETION=1 export NCCL_IB_USE_INLINE=1 export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0 @@ -199,8 +199,9 @@ if [ "$USING_AINIC" == "1" ]; then export NCCL_IGNORE_CPU_AFFINITY=1 export NCCL_IB_QPS_PER_CONNECTION=1 - export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/lib:$LD_LIBRARY_PATH - + #export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/lib:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/install/lib:$LD_LIBRARY_PATH + export LD_PRELOAD=${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0 else export NCCL_IB_GID_INDEX=3 fi diff --git a/examples/run_slurm_pretrain.sh b/examples/run_slurm_pretrain.sh index 04da35a4d..c76604205 100755 --- a/examples/run_slurm_pretrain.sh +++ b/examples/run_slurm_pretrain.sh @@ -45,15 +45,28 @@ srun -N "${NNODES}" \ --cpus-per-task="${CPUS_PER_TASK:-128}" \ bash -c " readarray -t node_array < <(scontrol show hostnames \"\$SLURM_JOB_NODELIST\") + # Get IP address of master node from ens9np0 interface + MASTER_NODE=\${node_array[0]} + if [ \"\$SLURM_NODEID\" = \"0\" ]; then + # We are on the master node, get IP directly + MASTER_IP=\$(ip addr show ens9np0 | grep 'inet ' | awk '{print \$2}' | cut -d/ -f1) + else + # Resolve master node IP via DNS (no SSH needed) + MASTER_IP=\$(getent hosts \$MASTER_NODE | awk '{print \$1}') 
+ fi if [ \"\$SLURM_NODEID\" = \"0\" ]; then echo \"========== Slurm cluster info ==========\" echo \"SLURM_NODELIST: \${node_array[*]}\" echo \"SLURM_NNODES: \${SLURM_NNODES}\" echo \"SLURM_GPUS_ON_NODE: \${SLURM_GPUS_ON_NODE}\" + echo \"MASTER_NODE: \$MASTER_NODE\" + echo \"MASTER_ADDR (IP): \$MASTER_IP\" echo \"\" fi - export MASTER_ADDR=\${node_array[0]} + export MASTER_ADDR=\${MASTER_IP} export MASTER_PORT=\${MASTER_PORT} + export GLOO_SOCKET_IFNAME=ens9np0 + export NCCL_SOCKET_IFNAME=ens9np0 export NNODES=\${SLURM_NNODES} export NODE_RANK=\${SLURM_PROCID} export GPUS_PER_NODE=\${SLURM_GPUS_ON_NODE} diff --git a/prepare_c4_data.sh b/prepare_c4_data.sh new file mode 100644 index 000000000..79aee27e5 --- /dev/null +++ b/prepare_c4_data.sh @@ -0,0 +1,144 @@ +#!/bin/bash +############################################################################### +# Prepare C4 English dataset for Megatron training with DeepSeek V3 +# +# This script: +# 1. Downloads C4-en data from HuggingFace (configurable amount) +# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/allenai/c4 +# cd c4 +# git lfs pull --include "en/*" +# 2. Converts to JSONL format +# 3. Tokenizes into Megatron .bin/.idx format using DeepSeekV3Tokenizer +# +# Usage: +# bash prepare_c4_data.sh [--num_shards N] [--data_dir /path/to/data] +# +# By default downloads 1 shard (~350MB compressed, ~3M documents) for testing. +# Full C4-en has 1024 shards. Adjust --num_shards for more data. 
+############################################################################### + +set -e + +# ======================== Configuration ======================== +NUM_SHARDS=${NUM_SHARDS:-200} # Number of C4 shards to download (1-1024) +DATA_DIR=${DATA_DIR:-"/shared/c4"} +PRIMUS_PATH=${PRIMUS_PATH:-"/shared/john/Primus"} +TOKENIZER_TYPE="DeepSeekV3Tokenizer" +TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3" +WORKERS=${WORKERS:-$(nproc)} # Number of preprocessing workers +HF_TOKEN=${HF_TOKEN:-"your_hf_token"} # Set your HuggingFace token + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --num_shards) NUM_SHARDS="$2"; shift 2;; + --data_dir) DATA_DIR="$2"; shift 2;; + --workers) WORKERS="$2"; shift 2;; + *) echo "Unknown option: $1"; exit 1;; + esac +done + +# ======================== Paths ======================== +export RAW_DIR="${DATA_DIR}/en" # Pre-downloaded shards live here +export JSONL_DIR="${DATA_DIR}/jsonl" +export TOKENIZED_DIR="${DATA_DIR}/tokenized" +export TRAIN_OUTPUT_PREFIX="${TOKENIZED_DIR}/c4_en_train" +export NUM_SHARDS + +mkdir -p "$RAW_DIR" "$JSONL_DIR" "$TOKENIZED_DIR" + +echo "============================================" +echo "C4 English Data Preparation" +echo "============================================" +echo "NUM_SHARDS: ${NUM_SHARDS} (out of 1024 total)" +echo "DATA_DIR: ${DATA_DIR}" +echo "PRIMUS_PATH: ${PRIMUS_PATH}" +echo "TOKENIZER: ${TOKENIZER_TYPE} / ${TOKENIZER_MODEL}" +echo "WORKERS: ${WORKERS}" +echo "============================================" + +# ======================== Step 1: Merge shards into JSONL ======================== +echo "" +echo ">>> Step 1: Merging C4 English shards into JSONL (${NUM_SHARDS} shards)..." +echo " (Download skipped — using pre-downloaded shards in ${RAW_DIR})" + +JSONL_FILE="${JSONL_DIR}/c4_en_train.jsonl" + +if [ -f "${JSONL_FILE}" ]; then + echo "JSONL file already exists: ${JSONL_FILE}" + echo "Skipping merge. Delete it to re-merge." 
+else + # Verify shards exist + MISSING=0 + for i in $(seq 0 $((NUM_SHARDS - 1))); do + SHARD_NAME=$(printf "c4-train.%05d-of-01024.json.gz" "$i") + if [ ! -f "${RAW_DIR}/${SHARD_NAME}" ]; then + echo " WARNING: Missing shard ${SHARD_NAME}" + MISSING=$((MISSING + 1)) + fi + done + if [ "$MISSING" -gt 0 ]; then + echo "ERROR: ${MISSING} shard(s) missing in ${RAW_DIR}. Cannot proceed." + exit 1 + fi + + echo "Decompressing and merging shards into JSONL ..." + for i in $(seq 0 $((NUM_SHARDS - 1))); do + SHARD_NAME=$(printf "c4-train.%05d-of-01024.json.gz" "$i") + SHARD_PATH="${RAW_DIR}/${SHARD_NAME}" + echo " [${i}/${NUM_SHARDS}] Decompressing ${SHARD_NAME} ..." + zcat "${SHARD_PATH}" >> "${JSONL_FILE}" + done + + DOC_COUNT=$(wc -l < "${JSONL_FILE}") + echo "Done! Total documents: ${DOC_COUNT}" + echo "Saved to: ${JSONL_FILE}" +fi + +echo ">>> Step 1 complete." + +# ======================== Step 2: Tokenize ======================== +echo "" +echo ">>> Step 2: Tokenizing with ${TOKENIZER_TYPE}..." + +JSONL_FILE="${JSONL_DIR}/c4_en_train.jsonl" + +if [ -f "${TRAIN_OUTPUT_PREFIX}_text_document.bin" ] && [ -f "${TRAIN_OUTPUT_PREFIX}_text_document.idx" ]; then + echo "Tokenized files already exist:" + echo " ${TRAIN_OUTPUT_PREFIX}_text_document.bin" + echo " ${TRAIN_OUTPUT_PREFIX}_text_document.idx" + echo "Skipping tokenization. Delete them to re-tokenize." +else + # Need to set up Python path for Megatron imports + export PYTHONPATH="${PRIMUS_PATH}/third_party/Megatron-LM:${PRIMUS_PATH}:${PYTHONPATH:-}" + + python3 "${PRIMUS_PATH}/examples/megatron/preprocess_data.py" \ + --input "${JSONL_FILE}" \ + --tokenizer-type "${TOKENIZER_TYPE}" \ + --tokenizer-model "${TOKENIZER_MODEL}" \ + --output-prefix "${TRAIN_OUTPUT_PREFIX}" \ + --workers "${WORKERS}" \ + --append-eod \ + --partitions 1 + + echo ">>> Step 2 complete." 
+fi + +# ======================== Summary ======================== +echo "" +echo "============================================" +echo "Data preparation complete!" +echo "============================================" +echo "" +echo "Tokenized data files:" +ls -lh "${TOKENIZED_DIR}/" +echo "" +echo "To use this data for training, set in run_dsv3.sh:" +echo "" +echo " 1. Change: --mock_data True → --mock_data False" +echo " 2. Add env: export PRIMUS_TOKENIZED_DATA_PATH=${TRAIN_OUTPUT_PREFIX}_text_document" +echo "" +echo "Or pass directly via environment variable before running:" +echo " export PRIMUS_TOKENIZED_DATA_PATH=${TRAIN_OUTPUT_PREFIX}_text_document" +echo "" +echo "============================================" diff --git a/run_dsv2_lite.sh b/run_dsv2_lite.sh new file mode 100644 index 000000000..46a203a93 --- /dev/null +++ b/run_dsv2_lite.sh @@ -0,0 +1,15 @@ +#!/bin/bash +export HF_TOKEN="your_hf_token" # make it your own hf token +export WANDB_API_KEY="your_wandb_api_key" # make it your own wandb api key +export DOCKER_IMAGE=john132/tas:primus-25.9-ainic-56 +export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9 +export GLOBAL_BATCH_SIZE=$((96 * NNODES)) +export ANP_HOME_DIR=${ANP_HOME_DIR:-"/workspace/ainic/amd-anp"} +export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/workspace/ainic/rccl"} +export MPI_HOME_DIR=${MPI_HOME_DIR:-"/workspace/ainic/ompi-4.1.6"} +export EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml +export USING_AINIC=1 +export GLOO_SOCKET_IFNAME=ens9np0 +export NCCL_SOCKET_IFNAME=ens9np0 +export PRIMUS_DETERMINISTIC=0 +bash ./examples/run_slurm_pretrain.sh --global_batch_size $GLOBAL_BATCH_SIZE --train_iters 50 --debug \ No newline at end of file diff --git a/run_dsv3.sh b/run_dsv3.sh new file mode 100644 index 000000000..5df3b9a54 --- /dev/null +++ b/run_dsv3.sh @@ -0,0 +1,42 @@ +#!/bin/bash +export HF_TOKEN="your_hf_token" # make it your own hf token +export WANDB_API_KEY="your_wandb_api_key" # 
make it your own wandb api key +export DOCKER_IMAGE=john132/tas:primus-25.9-ainic-56 +export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9 +export MICRO_BATCH_SIZE=1 +export GLOBAL_BATCH_SIZE=$((64 * NNODES)) +export ANP_HOME_DIR=${ANP_HOME_DIR:-"/workspace/ainic/amd-anp"} +export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/workspace/ainic/rccl"} +export MPI_HOME_DIR=${MPI_HOME_DIR:-"/workspace/ainic/ompi-4.1.6"} +export EXP=examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml +export USING_AINIC=1 +export PRIMUS_TP=1 +export PRIMUS_PP=8 +export PRIMUS_EP=8 +export PRIMUS_VPP=1 +export TOTAL_ITERS=50000 +export PRIMUS_TOTAL_LAYERS=61 +export PRIMUS_RECOMPUTE_LAYERS=8 +export PRIMUS_MOE_LAYER_FREQ=1 +export GLOO_SOCKET_IFNAME=ens9np0 +export NCCL_SOCKET_IFNAME=ens9np0 +export PRIMUS_DETERMINISTIC=0 +export DOCKER_MOUNT_PATH=/shared # this is the mount path for the docker container, we put the data path here +# export DATA_PATH=/shared/c4/data +export PRIMUS_TOKENIZED_DATA_PATH=/shared/c4/tokenized/c4_en_train_text_document # this is the tokenized data path for the training +bash ./examples/run_slurm_pretrain.sh \ +--mtp_num_layers 0 \ +--manual_gc True \ +--manual_gc_interval 1 \ +--pp_warmup True \ +--mock_data False \ +--decoder_last_pipeline_num_layers 5 \ +--micro_batch_size $MICRO_BATCH_SIZE --global_batch_size $GLOBAL_BATCH_SIZE --train_iters $TOTAL_ITERS \ +--tensor_model_parallel_size $PRIMUS_TP \ +--pipeline_model_parallel_size $PRIMUS_PP \ +--expert_model_parallel_size $PRIMUS_EP \ +--num_layers $PRIMUS_TOTAL_LAYERS --recompute_num_layers $PRIMUS_RECOMPUTE_LAYERS --moe_layer_freq $PRIMUS_MOE_LAYER_FREQ + +# --manual_gc True \ +# --manual_gc_interval 1 \ +# --pp_warmup True \ \ No newline at end of file diff --git a/start_training_dsv2_lite.sh b/start_training_dsv2_lite.sh new file mode 100755 index 000000000..1e594c979 --- /dev/null +++ b/start_training_dsv2_lite.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Start training with W&B 
loss logging on rank-0 +export NNODES=8 +echo "=== Starting DeepSeek V2 Lite Training ===" +echo "This script will:" +echo "1. Allocate ${NNODES} nodes using SLURM" +echo "2. Run training with W&B logging enabled on rank-0" +echo "" + +# Clean old output (optional - comment out if you want to keep old runs) +echo "Cleaning old output directory..." +rm -rf output/amd/root/deepseek_v2_lite-pretrain/* 2>/dev/null + +# Allocate nodes and run training +echo "Allocating ${NNODES} nodes and starting training..." +salloc -N ${NNODES} \ + --ntasks-per-node=1 \ + --cpus-per-task=128 \ + --exclusive --mem=0 \ + --job-name=dsv2_lite_test \ + --time=12:00:00 \ + --partition=amd-slc \ + bash -c ' + echo "Loading Docker image on all nodes..." + srun --ntasks-per-node=1 bash -c " + if ! docker images --format \"{{.Repository}}:{{.Tag}}\" | grep -q \"john132/tas:primus-25.9-ainic-56\"; then + if [ -f /shared/primus-25.9-ainic-56.tar ]; then + echo \"[\$(hostname)] Loading Docker image from tar...\" + docker load -i /shared/primus-25.9-ainic-56.tar + else + echo \"[\$(hostname)] Tar file not found, pulling Docker image...\" + docker pull john132/tas:primus-25.9-ainic-56 + fi + else + echo \"[\$(hostname)] Docker image already loaded, skipping.\" + fi + " + echo "Docker image loaded on all nodes. Starting training..." + bash run_dsv2_lite.sh + ' + +echo "Training completed or allocation ended." diff --git a/start_training_dsv3.sh b/start_training_dsv3.sh new file mode 100755 index 000000000..5f62830f0 --- /dev/null +++ b/start_training_dsv3.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Start training with W&B loss logging on rank-0 + +export NNODES=24 # modify the number of nodes here + +echo "=== Starting DeepSeek V3 Training ===" +echo "This script will:" +echo "1. Allocate ${NNODES} nodes using SLURM" +echo "2. Run training with W&B logging enabled on rank-0" +echo "" + +# Clean old output (optional - comment out if you want to keep old runs) +echo "Cleaning old output directory..." 
+rm -rf output/amd/root/deepseek_v3-pretrain/* 2>/dev/null + +# Allocate nodes and run training +echo "Allocating ${NNODES} nodes and starting training..." +salloc -N ${NNODES} \ + --ntasks-per-node=1 \ + --cpus-per-task=128 \ + --exclusive --mem=0 \ + --job-name=dsv3_test \ + --time=100-00:00:00 \ + --partition=amd-slc \ + bash -c ' + echo "Loading Docker image on all nodes..." + srun --ntasks-per-node=1 bash -c " + if ! docker images --format \"{{.Repository}}:{{.Tag}}\" | grep -q \"john132/tas:primus-25.9-ainic-56\"; then + if [ -f /shared/primus-25.9-ainic-56.tar ]; then + echo \"[\$(hostname)] Loading Docker image from tar...\" + docker load -i /shared/primus-25.9-ainic-56.tar + else + echo \"[\$(hostname)] Tar file not found, pulling Docker image...\" + docker pull john132/tas:primus-25.9-ainic-56 + fi + else + echo \"[\$(hostname)] Docker image already loaded, skipping.\" + fi + " + echo "Docker image loaded on all nodes. Starting training..." + bash run_dsv3.sh + ' + +echo "Training completed or allocation ended." +