tinyGPT/train-gpt-example.sh at main · Germinari1/tinyGPT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
#SBATCH --account=<SLURM_ACCOUNT>
#SBATCH --gres=gpu:L40S:8
#SBATCH --time=5-00:00:00
#SBATCH --job-name=gpt2_openwebtext_train
#SBATCH --output=<PROJECT_ROOT>/logs/train-gpt2-%j.out
#SBATCH --error=<PROJECT_ROOT>/logs/train-gpt2-%j.err
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=<EMAIL>

# SLURM batch job: prepare the OpenWebText dataset, then train GPT-2 (124M)
# on 8 L40S GPUs with torchrun.
#
# Template usage: replace every <...> placeholder before submitting. The
# #SBATCH lines above are read by the scheduler, not by bash, so shell
# variables are not expanded there and they must be edited by hand. The
# remaining placeholders are collected in the variables directly below.
#
# Exit status: non-zero if environment setup, dataset preparation, or training
# fails, so the scheduler records the job as failed (and --mail-type=FAIL
# fires) instead of reporting success unconditionally.

PROJECT_ROOT="<PROJECT_ROOT>"
USER_DATA_DIR="<USER_DATA_DIR>"
CONDA_ENV_NAME="<CONDA_ENV_NAME>"

# Print an error message to stderr and abort the job with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Stop the background GPU monitor, if one was started. Registered as an EXIT
# trap so it also runs when the script aborts early.
stop_gpu_monitor() {
  if [[ -n "${MONITOR_PID:-}" ]]; then
    # The monitor may already be gone; that is not an error.
    kill "$MONITOR_PID" 2>/dev/null || true
  fi
}
trap stop_gpu_monitor EXIT

# All relative paths below (data/, src/, config/) are resolved from here.
cd "$PROJECT_ROOT" || die "cannot cd to project root: $PROJECT_ROOT"

# Load conda
# Strict mode (set -eu) is deliberately not enabled: failures are checked
# explicitly instead, which keeps the sourced conda/module code unaffected.
module load anaconda/3 || die "failed to load module anaconda/3"
conda_base=$(conda info --base) || die "conda info --base failed"
source "${conda_base}/etc/profile.d/conda.sh" \
  || die "cannot source conda.sh from ${conda_base}"
conda activate "$CONDA_ENV_NAME" \
  || die "cannot activate conda env: $CONDA_ENV_NAME"

# Set up paths
# Only append the previous PYTHONPATH when it is non-empty, so no stray
# trailing ':' ends up in the value.
export PYTHONPATH="${PWD}${PYTHONPATH:+:${PYTHONPATH}}"
export CUDA_HOME="$CONDA_PREFIX"

# Redirect caches to user data directory
export HF_HOME="${USER_DATA_DIR}/huggingface_cache"
export TRITON_CACHE_DIR="${USER_DATA_DIR}/triton_cache"
export TORCH_HOME="${USER_DATA_DIR}/torch_cache"
mkdir -p -- "$HF_HOME" "$TRITON_CACHE_DIR" "$TORCH_HOME" \
  || die "cannot create cache directories under $USER_DATA_DIR"

# Set output directory in user data directory
OUTPUT_DIR="${USER_DATA_DIR}/gpt2-openwebtext-out"
mkdir -p -- "$OUTPUT_DIR" || die "cannot create output directory: $OUTPUT_DIR"

echo " Job Information "
echo "SLURM_JOB_ID: $SLURM_JOB_ID"
echo "SLURM_GPUS_ON_NODE: $SLURM_GPUS_ON_NODE"
echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "Output directory: $OUTPUT_DIR"
echo "Training will reproduce GPT-2 (124M) on OpenWebText"

echo ""
echo " GPU Hardware Check "
# Informational only; a failure here does not abort the job.
nvidia-smi

echo ""
echo " PyTorch CUDA Check "
# Fail early if torch cannot even be imported in the activated environment.
python3 - <<'PY' || die "PyTorch CUDA check failed"
import torch
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
print(f'CUDA version: {torch.version.cuda}')
print(f'Number of GPUs: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    print(f'GPU {i}: {torch.cuda.get_device_name(i)}')
PY

echo ""
echo " Step 1: Preparing OpenWebText dataset "
echo "This will download and tokenize the OpenWebText dataset..."
echo "This may take some time on first run."
# Training is pointless without the prepared dataset, so stop on failure.
python data/openwebtext/prepare.py || die "dataset preparation failed"

echo ""
echo " Step 2: Starting distributed training "
echo "Training GPT-2 (124M) with 8 L40S GPUs using DDP"

# Start GPU monitoring in background
# One sample every 300 s of memory (m) and utilization (u) metrics. No sample
# cap (-c) is given: a cap of 1000 samples would end after ~83 h, short of the
# 5-day time limit. The monitor is stopped by the EXIT trap instead.
nvidia-smi dmon -s mu -d 300 \
  > "${PROJECT_ROOT}/logs/gpu-monitor-gpt2-${SLURM_JOB_ID}.log" 2>&1 &
MONITOR_PID=$!

# Run distributed training with torchrun
# Using 8 GPUs for distributed data parallel training
torchrun --standalone --nproc_per_node=8 src/train.py \
  config/train_gpt2.py \
  --out_dir="$OUTPUT_DIR"
train_status=$?

# Stop GPU monitoring
stop_gpu_monitor

# Propagate the trainer's exit status so a failed run is not reported as
# a successful job.
if [[ "$train_status" -ne 0 ]]; then
  printf 'ERROR: training failed with exit status %s\n' "$train_status" >&2
  exit "$train_status"
fi

echo ""
echo " Training complete! "
echo "Checkpoints saved to: $OUTPUT_DIR"