-
Notifications
You must be signed in to change notification settings - Fork 27
[Draft] Dev/355 test #569
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[Draft] Dev/355 test #569
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -118,14 +118,17 @@ if [[ -f "$PATH_TO_BNXT_TAR_PACKAGE" ]]; then | |||
| VOLUME_ARGS+=(-v "$PATH_TO_BNXT_TAR_PACKAGE":"$PATH_TO_BNXT_TAR_PACKAGE") | ||||
| fi | ||||
|
|
||||
| if [[ -n "${DOCKER_MOUNT_PATH:-}" ]]; then | ||||
| VOLUME_ARGS+=(-v "${DOCKER_MOUNT_PATH}":"${DOCKER_MOUNT_PATH}") | ||||
| fi | ||||
| # using ainic | ||||
| if [ "$USING_AINIC" == "1" ]; then | ||||
| ENV_ARGS+=("--env" "USING_AINIC") | ||||
| ENV_ARGS+=("--env" "RCCL_HOME_DIR") | ||||
| ENV_ARGS+=("--env" "ANP_HOME_DIR") | ||||
| ENV_ARGS+=("--env" "MPI_HOME_DIR") | ||||
|
|
||||
| # VOLUME_ARGS+=(-v /mnt/shared:/mnt/shared) | ||||
| # VOLUME_ARGS+=(-v /shared:/shared) | ||||
| # VOLUME_ARGS+=(-v /etc/libibverbs.d/:/etc/libibverbs.d:ro) | ||||
| # VOLUME_ARGS+=(-v /usr/lib/x86_64-linux-gnu/libibverbs/:/usr/lib/x86_64-linux-gnu/libibverbs/:ro) | ||||
| fi | ||||
|
|
@@ -134,10 +137,10 @@ export CLEAN_DOCKER_CONTAINER=${CLEAN_DOCKER_CONTAINER:-0} | |||
|
|
||||
| # ------------------ Optional Container Cleanup ------------------ | ||||
| docker_podman_proxy() { | ||||
| if command -v podman &>/dev/null; then | ||||
| podman "$@" | ||||
| elif command -v docker &>/dev/null; then | ||||
| if command -v docker &>/dev/null; then | ||||
| docker "$@" | ||||
| elif command -v podman &>/dev/null; then | ||||
| podman "$@" | ||||
| else | ||||
| echo "Neither Docker nor Podman found!" >&2 | ||||
| return 1 | ||||
|
|
@@ -163,7 +166,7 @@ if [[ "${SKIP_TRAIN:-0}" == "1" ]]; then | |||
| else | ||||
| echo "Node-${NODE_RANK}: Launching training container." | ||||
| fi | ||||
|
|
||||
| docker stop $(docker ps -aq) || true | ||||
|
||||
| docker stop $(docker ps -aq) || true |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -178,7 +178,7 @@ if [ "$USING_AINIC" == "1" ]; then | |||||
| export ANP_HOME_DIR=${ANP_HOME_DIR:-"/opt/amd-anp"} | ||||||
| export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/opt/rccl"} | ||||||
| export MPI_HOME_DIR=${MPI_HOME_DIR:-"/opt/ompi"} | ||||||
| export NCCL_NET_PLUGIN=librccl-anp.so | ||||||
| # export NCCL_NET_PLUGIN=librccl-anp.so # this for anp version 1.1.0-5. | ||||||
|
|
||||||
| LOG_INFO_RANK0 "Using AINIC" | ||||||
| LOG_INFO_RANK0 "RCCL_HOME_DIR: $RCCL_HOME_DIR" | ||||||
|
|
@@ -189,8 +189,8 @@ if [ "$USING_AINIC" == "1" ]; then | |||||
| export NCCL_IB_GID_INDEX=1 | ||||||
| # export NCCL_IB_ROCE_VERSION_NUM=2 | ||||||
| export NCCL_MAX_P2P_CHANNELS=56 | ||||||
| export NCCL_IB_TC=104 | ||||||
| export NCCL_IB_FIFO_TC=192 | ||||||
| export NCCL_IB_TC=41 | ||||||
| export NCCL_IB_FIFO_TC=185 | ||||||
| export NET_OPTIONAL_RECV_COMPLETION=1 | ||||||
| export NCCL_IB_USE_INLINE=1 | ||||||
| export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0 | ||||||
|
|
@@ -199,8 +199,9 @@ if [ "$USING_AINIC" == "1" ]; then | |||||
| export NCCL_IGNORE_CPU_AFFINITY=1 | ||||||
| export NCCL_IB_QPS_PER_CONNECTION=1 | ||||||
|
|
||||||
| export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/lib:$LD_LIBRARY_PATH | ||||||
|
|
||||||
| #export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/lib:$LD_LIBRARY_PATH | ||||||
| export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/install/lib:$LD_LIBRARY_PATH | ||||||
| export LD_PRELOAD=${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0 | ||||||
|
||||||
| export LD_PRELOAD=${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0 | |
| export LD_PRELOAD=${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0${LD_PRELOAD:+:$LD_PRELOAD} |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,15 +45,28 @@ srun -N "${NNODES}" \ | |
| --cpus-per-task="${CPUS_PER_TASK:-128}" \ | ||
| bash -c " | ||
| readarray -t node_array < <(scontrol show hostnames \"\$SLURM_JOB_NODELIST\") | ||
| # Get IP address of master node from ens9np0 interface | ||
| MASTER_NODE=\${node_array[0]} | ||
| if [ \"\$SLURM_NODEID\" = \"0\" ]; then | ||
| # We are on the master node, get IP directly | ||
| MASTER_IP=\$(ip addr show ens9np0 | grep 'inet ' | awk '{print \$2}' | cut -d/ -f1) | ||
| else | ||
| # Resolve master node IP via DNS (no SSH needed) | ||
| MASTER_IP=\$(getent hosts \$MASTER_NODE | awk '{print \$1}') | ||
| fi | ||
|
Comment on lines
+48
to
+56
|
||
| if [ \"\$SLURM_NODEID\" = \"0\" ]; then | ||
| echo \"========== Slurm cluster info ==========\" | ||
| echo \"SLURM_NODELIST: \${node_array[*]}\" | ||
| echo \"SLURM_NNODES: \${SLURM_NNODES}\" | ||
| echo \"SLURM_GPUS_ON_NODE: \${SLURM_GPUS_ON_NODE}\" | ||
| echo \"MASTER_NODE: \$MASTER_NODE\" | ||
| echo \"MASTER_ADDR (IP): \$MASTER_IP\" | ||
| echo \"\" | ||
| fi | ||
| export MASTER_ADDR=\${node_array[0]} | ||
| export MASTER_ADDR=\${MASTER_IP} | ||
| export MASTER_PORT=\${MASTER_PORT} | ||
| export GLOO_SOCKET_IFNAME=ens9np0 | ||
| export NCCL_SOCKET_IFNAME=ens9np0 | ||
|
Comment on lines
+68
to
+69
|
||
| export NNODES=\${SLURM_NNODES} | ||
| export NODE_RANK=\${SLURM_PROCID} | ||
| export GPUS_PER_NODE=\${SLURM_GPUS_ON_NODE} | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,144 @@ | ||||||||||||
| #!/bin/bash | ||||||||||||
| ############################################################################### | ||||||||||||
| # Prepare C4 English dataset for Megatron training with DeepSeek V3 | ||||||||||||
| # | ||||||||||||
| # This script: | ||||||||||||
| # 1. Downloads C4-en data from HuggingFace (configurable amount) | ||||||||||||
| # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/allenai/c4 | ||||||||||||
| # cd c4 | ||||||||||||
| # git lfs pull --include "en/*" | ||||||||||||
| # 2. Converts to JSONL format | ||||||||||||
| # 3. Tokenizes into Megatron .bin/.idx format using DeepSeekV3Tokenizer | ||||||||||||
| # | ||||||||||||
| # Usage: | ||||||||||||
| # bash prepare_c4_data.sh [--num_shards N] [--data_dir /path/to/data] | ||||||||||||
| # | ||||||||||||
| # By default downloads 1 shard (~350MB compressed, ~3M documents) for testing. | ||||||||||||
| # Full C4-en has 1024 shards. Adjust --num_shards for more data. | ||||||||||||
| ############################################################################### | ||||||||||||
|
|
||||||||||||
| set -e | ||||||||||||
|
|
||||||||||||
| # ======================== Configuration ======================== | ||||||||||||
| NUM_SHARDS=${NUM_SHARDS:-200} # Number of C4 shards to download (1-1024) | ||||||||||||
| DATA_DIR=${DATA_DIR:-"/shared/c4"} | ||||||||||||
|
Comment on lines
+16
to
+24
|
||||||||||||
| PRIMUS_PATH=${PRIMUS_PATH:-"/shared/john/Primus"} | ||||||||||||
|
||||||||||||
| PRIMUS_PATH=${PRIMUS_PATH:-"/shared/john/Primus"} | |
| # Determine repository root relative to this script if PRIMUS_PATH is not provided | |
| SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" | |
| REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." && pwd)" | |
| PRIMUS_PATH=${PRIMUS_PATH:-"${REPO_ROOT}"} |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,15 @@ | ||||||||||||||||||||||||||||||
| #!/bin/bash | ||||||||||||||||||||||||||||||
| export HF_TOKEN="your_hf_token" # make it your own hf token | ||||||||||||||||||||||||||||||
| export WANDB_API_KEY="your_wandb_api_key" # make it your own wandb api key | ||||||||||||||||||||||||||||||
|
Comment on lines
+2
to
+3
|
||||||||||||||||||||||||||||||
| export HF_TOKEN="your_hf_token" # make it your own hf token | |
| export WANDB_API_KEY="your_wandb_api_key" # make it your own wandb api key | |
| if [ -z "${HF_TOKEN:-}" ]; then | |
| echo "Error: HF_TOKEN environment variable is not set." >&2 | |
| exit 1 | |
| fi | |
| if [ -z "${WANDB_API_KEY:-}" ]; then | |
| echo "Error: WANDB_API_KEY environment variable is not set." >&2 | |
| exit 1 | |
| fi | |
| export HF_TOKEN | |
| export WANDB_API_KEY |
Copilot
AI
Feb 27, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`GLOBAL_BATCH_SIZE` is computed from `NNODES`, but this script neither sets a default for `NNODES` nor validates it. If `run_dsv2_lite.sh` is executed directly, the arithmetic expansion will fail or produce an unexpected value. Consider setting `NNODES=${NNODES:-1}` (or erroring if unset) before using it.
| export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9 | |
| export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9 | |
| NNODES=${NNODES:-1} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The config uses `wandb_disable`, but the Megatron W&B patch code reads `disable_wandb` (and the rest of the configs follow that naming). As-is, this key will be ignored and W&B enable/disable behavior will not match the config. Please rename to `disable_wandb` for consistency and correctness.