Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ modules:
overrides:
# log
wandb_project: "Primus_DeepSeek_Pretrain"
wandb_disable: false
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The config uses wandb_disable, but the Megatron W&B patch code reads disable_wandb (and the rest of the configs follow that naming). As-is, this key will be ignored and W&B enable/disable behavior will not match the config. Please rename to disable_wandb for consistency and correctness.

Suggested change
wandb_disable: false
disable_wandb: false

Copilot uses AI. Check for mistakes.
stderr_sink_level: DEBUG

# debug
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ modules:
overrides:
# log
wandb_project: "Primus_DeepSeek_Pretrain"
disable_wandb: false
disable_tensorboard: false
stderr_sink_level: DEBUG

# debug
Expand Down Expand Up @@ -52,7 +54,7 @@ modules:
valid_data_path: null
test_data_path: null

moe_use_legacy_grouped_gemm: false
moe_use_legacy_grouped_gemm: true
# MLA
multi_latent_attention: true

Expand All @@ -71,12 +73,12 @@ modules:
eval_iters: 0

# Turbo
enable_primus_turbo: true
enable_primus_turbo: false
use_turbo_attention: false
use_turbo_grouped_mlp: false

# deepep
use_turbo_deepep: true
use_turbo_deepep: false
moe_shared_expert_overlap: false
moe_router_dtype: fp32

Expand Down
15 changes: 13 additions & 2 deletions examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ modules:
overrides:
# log
wandb_project: "Primus_DeepSeek_Pretrain"
disable_wandb: false
disable_tensorboard: false
stderr_sink_level: DEBUG

# debug
Expand All @@ -22,7 +24,7 @@ modules:

# hyper parameters
train_iters: 50
micro_batch_size: 4
micro_batch_size: 1
global_batch_size: 256
seq_length: ${PRIMUS_SEQ_LENGTH:4096}
max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096}
Expand All @@ -46,13 +48,18 @@ modules:
overlap_param_gather: true
gradient_accumulation_fusion: false

# recompute
recompute_granularity: full # full, selective
recompute_method: block # uniform, block
recompute_num_layers: 61 # int

# data
mock_data: true
train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:null}
valid_data_path: null
test_data_path: null

moe_use_legacy_grouped_gemm: true
moe_use_legacy_grouped_gemm: false # need to disable legacy grouped gemm for dsv3 as it will hang the job
# MLA
multi_latent_attention: true

Expand All @@ -71,6 +78,10 @@ modules:
ckpt_format: torch
eval_iters: 0

# Turbo
enable_primus_turbo: false
use_turbo_attention: false
use_turbo_grouped_mlp: false
# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
2 changes: 1 addition & 1 deletion examples/megatron/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def build_megatron_helper(primus_path: Path, patch_args: Path, backend_path: str

emerging_optimizers_path = primus_path / "third_party/Emerging-Optimizers"
log_info(f"Building Emerging Optimizers in {emerging_optimizers_path}")
ret = subprocess.run(["pip", "install", "-e", str(emerging_optimizers_path)], check=True)
ret = subprocess.run(["pip", "install", "--no-build-isolation", "-e", str(emerging_optimizers_path)], check=True)
if ret.returncode != 0:
log_error_and_exit("Building Emerging Optimizers failed.")

Expand Down
13 changes: 8 additions & 5 deletions examples/run_local_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,17 @@ if [[ -f "$PATH_TO_BNXT_TAR_PACKAGE" ]]; then
VOLUME_ARGS+=(-v "$PATH_TO_BNXT_TAR_PACKAGE":"$PATH_TO_BNXT_TAR_PACKAGE")
fi

if [[ -n "${DOCKER_MOUNT_PATH:-}" ]]; then
VOLUME_ARGS+=(-v "${DOCKER_MOUNT_PATH}":"${DOCKER_MOUNT_PATH}")
fi
# using ainic
if [ "$USING_AINIC" == "1" ]; then
ENV_ARGS+=("--env" "USING_AINIC")
ENV_ARGS+=("--env" "RCCL_HOME_DIR")
ENV_ARGS+=("--env" "ANP_HOME_DIR")
ENV_ARGS+=("--env" "MPI_HOME_DIR")

# VOLUME_ARGS+=(-v /mnt/shared:/mnt/shared)
# VOLUME_ARGS+=(-v /shared:/shared)
# VOLUME_ARGS+=(-v /etc/libibverbs.d/:/etc/libibverbs.d:ro)
# VOLUME_ARGS+=(-v /usr/lib/x86_64-linux-gnu/libibverbs/:/usr/lib/x86_64-linux-gnu/libibverbs/:ro)
fi
Expand All @@ -134,10 +137,10 @@ export CLEAN_DOCKER_CONTAINER=${CLEAN_DOCKER_CONTAINER:-0}

# ------------------ Optional Container Cleanup ------------------
docker_podman_proxy() {
if command -v podman &>/dev/null; then
podman "$@"
elif command -v docker &>/dev/null; then
if command -v docker &>/dev/null; then
docker "$@"
elif command -v podman &>/dev/null; then
podman "$@"
else
echo "Neither Docker nor Podman found!" >&2
return 1
Expand All @@ -163,7 +166,7 @@ if [[ "${SKIP_TRAIN:-0}" == "1" ]]; then
else
echo "Node-${NODE_RANK}: Launching training container."
fi

docker stop $(docker ps -aq) || true
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

docker stop $(docker ps -aq) will stop all Docker containers on the node, including unrelated workloads, and bypasses the existing CLEAN_DOCKER_CONTAINER logic (and the docker_podman_proxy abstraction). This is highly disruptive in shared environments; please remove it or restrict cleanup to the specific training container(s) started by this script.

Suggested change
docker stop $(docker ps -aq) || true

Copilot uses AI. Check for mistakes.
# ------------------ Launch Training Container ------------------
docker_podman_proxy run --rm \
--env MASTER_ADDR \
Expand Down
11 changes: 6 additions & 5 deletions examples/run_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ if [ "$USING_AINIC" == "1" ]; then
export ANP_HOME_DIR=${ANP_HOME_DIR:-"/opt/amd-anp"}
export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/opt/rccl"}
export MPI_HOME_DIR=${MPI_HOME_DIR:-"/opt/ompi"}
export NCCL_NET_PLUGIN=librccl-anp.so
# export NCCL_NET_PLUGIN=librccl-anp.so # this for anp version 1.1.0-5.

LOG_INFO_RANK0 "Using AINIC"
LOG_INFO_RANK0 "RCCL_HOME_DIR: $RCCL_HOME_DIR"
Expand All @@ -189,8 +189,8 @@ if [ "$USING_AINIC" == "1" ]; then
export NCCL_IB_GID_INDEX=1
# export NCCL_IB_ROCE_VERSION_NUM=2
export NCCL_MAX_P2P_CHANNELS=56
export NCCL_IB_TC=104
export NCCL_IB_FIFO_TC=192
export NCCL_IB_TC=41
export NCCL_IB_FIFO_TC=185
export NET_OPTIONAL_RECV_COMPLETION=1
export NCCL_IB_USE_INLINE=1
export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0
Expand All @@ -199,8 +199,9 @@ if [ "$USING_AINIC" == "1" ]; then
export NCCL_IGNORE_CPU_AFFINITY=1
export NCCL_IB_QPS_PER_CONNECTION=1

export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/lib:$LD_LIBRARY_PATH

#export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/install/lib:$LD_LIBRARY_PATH
export LD_PRELOAD=${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sets LD_PRELOAD without preserving any existing LD_PRELOAD from the environment. Overwriting it can break other required preloads (profilers, sanitizers, etc.). Consider appending/prepending while keeping the current value (or only setting it if unset).

Suggested change
export LD_PRELOAD=${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0
export LD_PRELOAD=${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0${LD_PRELOAD:+:$LD_PRELOAD}

Copilot uses AI. Check for mistakes.
else
export NCCL_IB_GID_INDEX=3
fi
Expand Down
15 changes: 14 additions & 1 deletion examples/run_slurm_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,28 @@ srun -N "${NNODES}" \
--cpus-per-task="${CPUS_PER_TASK:-128}" \
bash -c "
readarray -t node_array < <(scontrol show hostnames \"\$SLURM_JOB_NODELIST\")
# Get IP address of master node from ens9np0 interface
MASTER_NODE=\${node_array[0]}
if [ \"\$SLURM_NODEID\" = \"0\" ]; then
# We are on the master node, get IP directly
MASTER_IP=\$(ip addr show ens9np0 | grep 'inet ' | awk '{print \$2}' | cut -d/ -f1)
else
# Resolve master node IP via DNS (no SSH needed)
MASTER_IP=\$(getent hosts \$MASTER_NODE | awk '{print \$1}')
fi
Comment on lines +48 to +56
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MASTER_ADDR is now derived from ip addr show ens9np0 / getent hosts, but there is no validation that ens9np0 exists or that either command returns an IP. If MASTER_IP is empty, exporting MASTER_ADDR will break rendezvous. Consider making the interface configurable (or reusing the existing get_ip_interface helper) and failing fast when MASTER_IP cannot be determined, with a fallback to the hostname if appropriate.

Copilot uses AI. Check for mistakes.
if [ \"\$SLURM_NODEID\" = \"0\" ]; then
echo \"========== Slurm cluster info ==========\"
echo \"SLURM_NODELIST: \${node_array[*]}\"
echo \"SLURM_NNODES: \${SLURM_NNODES}\"
echo \"SLURM_GPUS_ON_NODE: \${SLURM_GPUS_ON_NODE}\"
echo \"MASTER_NODE: \$MASTER_NODE\"
echo \"MASTER_ADDR (IP): \$MASTER_IP\"
echo \"\"
fi
export MASTER_ADDR=\${node_array[0]}
export MASTER_ADDR=\${MASTER_IP}
export MASTER_PORT=\${MASTER_PORT}
export GLOO_SOCKET_IFNAME=ens9np0
export NCCL_SOCKET_IFNAME=ens9np0
Comment on lines +68 to +69
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This hard-codes GLOO_SOCKET_IFNAME/NCCL_SOCKET_IFNAME to ens9np0, which overrides the dynamic detection already implemented in examples/run_pretrain.sh and may break on systems that use a different interface name. Prefer only setting these if they are unset, or pass the intended interface via an env var (e.g., IP_INTERFACE) rather than forcing a specific device name.

Copilot uses AI. Check for mistakes.
export NNODES=\${SLURM_NNODES}
export NODE_RANK=\${SLURM_PROCID}
export GPUS_PER_NODE=\${SLURM_GPUS_ON_NODE}
Expand Down
144 changes: 144 additions & 0 deletions prepare_c4_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/bin/bash
###############################################################################
# Prepare C4 English dataset for Megatron training with DeepSeek V3
#
# This script:
# 1. Downloads C4-en data from HuggingFace (configurable amount)
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/allenai/c4
# cd c4
# git lfs pull --include "en/*"
# 2. Converts to JSONL format
# 3. Tokenizes into Megatron .bin/.idx format using DeepSeekV3Tokenizer
#
# Usage:
# bash prepare_c4_data.sh [--num_shards N] [--data_dir /path/to/data]
#
# By default processes 200 shards (override with --num_shards or NUM_SHARDS).
# Full C4-en has 1024 shards. Adjust --num_shards for more data.
###############################################################################

set -e

# ======================== Configuration ========================
NUM_SHARDS=${NUM_SHARDS:-200} # Number of C4 shards to download (1-1024)
DATA_DIR=${DATA_DIR:-"/shared/c4"}
Comment on lines +16 to +24
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The header/usage comment says the default is to download/prepare 1 shard for testing, but the script currently defaults NUM_SHARDS to 200 and also states that download is skipped in favor of pre-downloaded shards. Please update the documentation and/or defaults so the described behavior matches what the script actually does (and so the default runtime/data volume is reasonable).

Copilot uses AI. Check for mistakes.
PRIMUS_PATH=${PRIMUS_PATH:-"/shared/john/Primus"}
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default PRIMUS_PATH is set to an absolute, user-specific location (/shared/john/Primus). This makes the script non-portable out of the box. Consider defaulting PRIMUS_PATH to the repo root relative to the script location (or requiring it to be provided explicitly).

Suggested change
PRIMUS_PATH=${PRIMUS_PATH:-"/shared/john/Primus"}
# Determine repository root relative to this script if PRIMUS_PATH is not provided
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." && pwd)"
PRIMUS_PATH=${PRIMUS_PATH:-"${REPO_ROOT}"}

Copilot uses AI. Check for mistakes.
TOKENIZER_TYPE="DeepSeekV3Tokenizer"
TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3"
WORKERS=${WORKERS:-$(nproc)} # Number of preprocessing workers
HF_TOKEN=${HF_TOKEN:-"your_hf_token"} # Set your HuggingFace token

# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--num_shards) NUM_SHARDS="$2"; shift 2;;
--data_dir) DATA_DIR="$2"; shift 2;;
--workers) WORKERS="$2"; shift 2;;
*) echo "Unknown option: $1"; exit 1;;
esac
done

# ======================== Paths ========================
export RAW_DIR="${DATA_DIR}/en" # Pre-downloaded shards live here
export JSONL_DIR="${DATA_DIR}/jsonl"
export TOKENIZED_DIR="${DATA_DIR}/tokenized"
export TRAIN_OUTPUT_PREFIX="${TOKENIZED_DIR}/c4_en_train"
export NUM_SHARDS

mkdir -p "$RAW_DIR" "$JSONL_DIR" "$TOKENIZED_DIR"

echo "============================================"
echo "C4 English Data Preparation"
echo "============================================"
echo "NUM_SHARDS: ${NUM_SHARDS} (out of 1024 total)"
echo "DATA_DIR: ${DATA_DIR}"
echo "PRIMUS_PATH: ${PRIMUS_PATH}"
echo "TOKENIZER: ${TOKENIZER_TYPE} / ${TOKENIZER_MODEL}"
echo "WORKERS: ${WORKERS}"
echo "============================================"

# ======================== Step 1: Merge shards into JSONL ========================
echo ""
echo ">>> Step 1: Merging C4 English shards into JSONL (${NUM_SHARDS} shards)..."
echo " (Download skipped — using pre-downloaded shards in ${RAW_DIR})"

JSONL_FILE="${JSONL_DIR}/c4_en_train.jsonl"

if [ -f "${JSONL_FILE}" ]; then
echo "JSONL file already exists: ${JSONL_FILE}"
echo "Skipping merge. Delete it to re-merge."
else
# Verify shards exist
MISSING=0
for i in $(seq 0 $((NUM_SHARDS - 1))); do
SHARD_NAME=$(printf "c4-train.%05d-of-01024.json.gz" "$i")
if [ ! -f "${RAW_DIR}/${SHARD_NAME}" ]; then
echo " WARNING: Missing shard ${SHARD_NAME}"
MISSING=$((MISSING + 1))
fi
done
if [ "$MISSING" -gt 0 ]; then
echo "ERROR: ${MISSING} shard(s) missing in ${RAW_DIR}. Cannot proceed."
exit 1
fi

echo "Decompressing and merging shards into JSONL ..."
for i in $(seq 0 $((NUM_SHARDS - 1))); do
SHARD_NAME=$(printf "c4-train.%05d-of-01024.json.gz" "$i")
SHARD_PATH="${RAW_DIR}/${SHARD_NAME}"
echo " [${i}/${NUM_SHARDS}] Decompressing ${SHARD_NAME} ..."
zcat "${SHARD_PATH}" >> "${JSONL_FILE}"
done

DOC_COUNT=$(wc -l < "${JSONL_FILE}")
echo "Done! Total documents: ${DOC_COUNT}"
echo "Saved to: ${JSONL_FILE}"
fi

echo ">>> Step 1 complete."

# ======================== Step 2: Tokenize ========================
echo ""
echo ">>> Step 2: Tokenizing with ${TOKENIZER_TYPE}..."

JSONL_FILE="${JSONL_DIR}/c4_en_train.jsonl"

if [ -f "${TRAIN_OUTPUT_PREFIX}_text_document.bin" ] && [ -f "${TRAIN_OUTPUT_PREFIX}_text_document.idx" ]; then
echo "Tokenized files already exist:"
echo " ${TRAIN_OUTPUT_PREFIX}_text_document.bin"
echo " ${TRAIN_OUTPUT_PREFIX}_text_document.idx"
echo "Skipping tokenization. Delete them to re-tokenize."
else
# Need to set up Python path for Megatron imports
export PYTHONPATH="${PRIMUS_PATH}/third_party/Megatron-LM:${PRIMUS_PATH}:${PYTHONPATH:-}"

python3 "${PRIMUS_PATH}/examples/megatron/preprocess_data.py" \
--input "${JSONL_FILE}" \
--tokenizer-type "${TOKENIZER_TYPE}" \
--tokenizer-model "${TOKENIZER_MODEL}" \
--output-prefix "${TRAIN_OUTPUT_PREFIX}" \
--workers "${WORKERS}" \
--append-eod \
--partitions 1

echo ">>> Step 2 complete."
fi

# ======================== Summary ========================
echo ""
echo "============================================"
echo "Data preparation complete!"
echo "============================================"
echo ""
echo "Tokenized data files:"
ls -lh "${TOKENIZED_DIR}/"
echo ""
echo "To use this data for training, set in run_dsv3.sh:"
echo ""
echo " 1. Change: --mock_data True → --mock_data False"
echo " 2. Add env: export PRIMUS_TOKENIZED_DATA_PATH=${TRAIN_OUTPUT_PREFIX}_text_document"
echo ""
echo "Or pass directly via environment variable before running:"
echo " export PRIMUS_TOKENIZED_DATA_PATH=${TRAIN_OUTPUT_PREFIX}_text_document"
echo ""
echo "============================================"
15 changes: 15 additions & 0 deletions run_dsv2_lite.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
export HF_TOKEN="your_hf_token" # make it your own hf token
export WANDB_API_KEY="your_wandb_api_key" # make it your own wandb api key
Comment on lines +2 to +3
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This script exports HF_TOKEN/WANDB_API_KEY as literal placeholder strings, which will overwrite any real credentials already set in the environment and can lead to accidental runs with invalid auth. Prefer the existing repo pattern of defaulting from the current env (e.g., HF_TOKEN=${HF_TOKEN:-...}) or validating they are set and failing fast.

Suggested change
export HF_TOKEN="your_hf_token" # make it your own hf token
export WANDB_API_KEY="your_wandb_api_key" # make it your own wandb api key
if [ -z "${HF_TOKEN:-}" ]; then
echo "Error: HF_TOKEN environment variable is not set." >&2
exit 1
fi
if [ -z "${WANDB_API_KEY:-}" ]; then
echo "Error: WANDB_API_KEY environment variable is not set." >&2
exit 1
fi
export HF_TOKEN
export WANDB_API_KEY

Copilot uses AI. Check for mistakes.
export DOCKER_IMAGE=john132/tas:primus-25.9-ainic-56
export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GLOBAL_BATCH_SIZE is computed from NNODES, but this script does not set a default/validation for NNODES. If run_dsv2_lite.sh is executed directly, the arithmetic expansion will fail or produce an unexpected value. Consider setting NNODES=${NNODES:-1} (or erroring if unset) before using it.

Suggested change
export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9
export NCCL_IB_HCA=ionic_0,ionic_2,ionic_3,ionic_4,ionic_5,ionic_7,ionic_8,ionic_9
NNODES=${NNODES:-1}

Copilot uses AI. Check for mistakes.
export GLOBAL_BATCH_SIZE=$((96 * NNODES))
export ANP_HOME_DIR=${ANP_HOME_DIR:-"/workspace/ainic/amd-anp"}
export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/workspace/ainic/rccl"}
export MPI_HOME_DIR=${MPI_HOME_DIR:-"/workspace/ainic/ompi-4.1.6"}
export EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml
export USING_AINIC=1
export GLOO_SOCKET_IFNAME=ens9np0
export NCCL_SOCKET_IFNAME=ens9np0
export PRIMUS_DETERMINISTIC=0
bash ./examples/run_slurm_pretrain.sh --global_batch_size $GLOBAL_BATCH_SIZE --train_iters 50 --debug
Loading