10 changes: 8 additions & 2 deletions large_language_model_pretraining/nemo/callbacks.py
@@ -159,6 +159,7 @@ def __init__(
micro_batch_size,
sequence_length,
init_global_step,
eval_every,
configs={}
):
mllogger.event(key=constants.CACHE_CLEAR, value=True)
@@ -169,6 +170,7 @@
self.gbs = global_batch_size
self.mbs = micro_batch_size
self.seq_len = sequence_length
self.eval_every = eval_every

self.is_target_reached = False
self.status = constants.ABORTED
@@ -185,7 +187,6 @@ def set_success_status(self):
def on_train_epoch_start(self, trainer, pl_module):
mllogger.start(key=constants.EPOCH_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
mllogger.start(key=constants.BLOCK_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})

return super().on_train_epoch_start(trainer, pl_module)

@rank_zero_only
@@ -201,9 +202,14 @@ def on_train_end(self, trainer, pl_module):
return super().on_train_end(trainer, pl_module)

@rank_zero_only
def on_validation_start(self, trainer, pl_module):
def log_eval_start(self, trainer, pl_module):
mllogger.end(key=constants.BLOCK_STOP, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
mllogger.start(key=constants.EVAL_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})

def on_validation_start(self, trainer, pl_module):
trainer.val_check_interval = self.eval_every
trainer.val_check_batch = self.eval_every
self.log_eval_start(trainer, pl_module)
return super().on_validation_start(trainer, pl_module)

def on_validation_end(self, trainer, pl_module):
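The callbacks.py change above implements a delayed first evaluation: pretrain_llama31.py builds the trainer with val_check_interval set to the first-eval point, and on_validation_start then rewrites the interval to the recurring cadence. A minimal sketch of that pattern, stripped of the MLPerf logging (the DelayedEvalCallback name is illustrative, and the lightning.pytorch import path is an assumption; older stacks import pytorch_lightning instead):

    from lightning.pytorch import Callback

    class DelayedEvalCallback(Callback):
        def __init__(self, eval_every: int):
            # Steady-state validation interval, expressed in training batches.
            self.eval_every = eval_every

        def on_validation_start(self, trainer, pl_module):
            # The trainer was constructed with val_check_interval = start_eval_at,
            # so the first validation only fires after that many batches.
            # From this point on, validate every `eval_every` batches instead.
            trainer.val_check_interval = self.eval_every
            trainer.val_check_batch = self.eval_every
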
39 changes: 24 additions & 15 deletions large_language_model_pretraining/nemo/config.sh
@@ -12,43 +12,49 @@
# See the License for the specific language governing permissions and
# limitations under the License.

TODAY_DATE="$(date +'%y%m%d')"
SUFFIX="gbs4608"
EXP_DIR="${TODAY_DATE}/${SUFFIX}"

export TAG="20250630"

# SSH: username that connects to the remote cluster
export USER=""
export USER="michalm"
# SSH: remote cluster URL
export HOST=""
export HOST="cw-dfw-cs-001-login-01.nvidia.com"
# Slurm: account for job submission
export ACCOUNT=""
export ACCOUNT="coreai_mlperf_training"
# Slurm: partition for job submission
export PARTITION=""
export PARTITION="batch"
# Slurm: job time limit, defaults to 4 hours
export TIME="04:00:00"
# Slurm: --nodes arguments, default to use 288 nodes
export NNODES=288
# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
export GPUS_PER_NODE=8
# Slurm: max job retries for transient job failures, defaults to retry 3 times
export MAX_RETRIES=3
export MAX_RETRIES=0

# Folder mapping:
# Output directory that holds logs, any path that you like.
export JOB_DIR=""
export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
# Image / container path, either local cache file or remote URL
export IMAGE=""
export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
# Dataset: C4 dataset location that contains the dataset after preprocessing
# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
export PREPROCESSED_PATH=""
export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
# Dataset: Numpy index working directory, contains shuffled dataset
# This path must be able to hold >400GB data
export TMP_NPY_INDEX=""
export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
# Dataset: Tokenizer path
# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
export TOKENIZER_PATH=""
export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"

# Model: checkpoint and tokenizer path
# This is the checkpoint that we want to start with.
# Each checkpoint should be a folder containing two sub-folders: context and weights.
# And we need to pass this folder's path (the folder containing context and weights) here.
export MODEL_CKPT=""
export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
# Model: Continual checkpoint directory to write and resume
# This is the directory to hold all intermediate checkpoints.
# Once a run is complete and we specify to save checkpoints,
@@ -57,9 +63,9 @@ export MODEL_CKPT=""
# Inside this directory, there should be a `checkpoint` directory that holds context and weights
# which is the "actual checkpoint".
# Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
export CONTINUAL_CKPT=""
export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
# Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
export USE_CKPT=0
export USE_CKPT=1
# Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
# If set to 1, then checkpoint resuming code will not try to load the optimizer states.
export FROM_HF=1
@@ -71,13 +77,15 @@ export SAVE_CKPT=0
# Model: size, to choose from 8b, 70b, 405b
export SIZE="405b"
# Dataloader: Global batch size
export GBS=1152
export GBS=4608
# Dataloader: Micro batch size
export MBS=1
# Dataloader: Max run N batches, optional
# If an empty string is provided (""), then the training will continue until time limit
# If we want to save a checkpoint, then this value must be set
export MAX_STEPS=""
export START_EVAL_AT="368640"
export EVAL_EVERY="18432"

# Experiment: starting steps
# This is the starting "offset" step from the checkpoint.
@@ -92,4 +100,5 @@ export NPAR=1
# Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
# To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
export SEEDS=""
export SEEDS="14932"
unset SEEDS
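For orientation, pretrain_llama31.py turns both of the new sequence-count knobs into batch counts with math.ceil(value / GBS). A quick sanity check of the cadence implied by this config (illustrative arithmetic, not part of the repo):

    import math

    gbs = 4608                              # GBS above
    first_eval = math.ceil(368640 / gbs)    # START_EVAL_AT -> 80 batches
    eval_interval = math.ceil(18432 / gbs)  # EVAL_EVERY    -> 4 batches
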
104 changes: 104 additions & 0 deletions large_language_model_pretraining/nemo/config_8b.sh
@@ -0,0 +1,104 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

TODAY_DATE="$(date +'%y%m%d')"
SUFFIX="gbs1152"
EXP_DIR="${TODAY_DATE}/${SUFFIX}"

export TAG="20250629"


# SSH: username that connects to the remote cluster
export USER="michalm"
# SSH: remote cluster URL
export HOST="cw-dfw-cs-001-login-01.nvidia.com"
# Slurm: account for job submission
export ACCOUNT="coreai_mlperf_training"
# Slurm: partition for job submission
export PARTITION="batch"
# Slurm: job time limit, defaults to 4 hours
export TIME="01:00:00"
# Slurm: --nodes arguments, default to use 288 nodes
export NNODES=8
# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
export GPUS_PER_NODE=8
# Slurm: max job retries for transient job failures, defaults to retry 3 times
export MAX_RETRIES=0

# Folder mapping:
# Output directory that holds logs, any path that you like.
export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
# Image / container path, either local cache file or remote URL
export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
# Dataset: C4 dataset location that contains the dataset after preprocessing
# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
# Dataset: Numpy index working directory, contains shuffled dataset
# This path must be able to hold >400GB data
export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
# Dataset: Tokenizer path
# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"

# Model: checkpoint and tokenizer path
# This is the checkpoint that we want to start with.
# Each checkpoint should be a folder containing two sub-folders: context and weights.
# And we need to pass this folder's path (the folder containing context and weights) here.
export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
# Model: Continual checkpoint directory to write and resume
# This is the directory to hold all intermediate checkpoints.
# Once a run is complete and we specify to save checkpoints,
# we should see a checkpoint written in this folder
# with name `checkpoint-par-x-y-steps`
# Inside this directory, there should be a `checkpoint` directory that holds context and weights
# which is the "actual checkpoint".
# Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
# Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
export USE_CKPT=0
# Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
# If set to 1, then checkpoint resuming code will not try to load the optimizer states.
export FROM_HF=1
# Model: Whether we want to save a checkpoint. Must be 1 if NPAR > 1. If 1, then we save a checkpoint at the end.
export SAVE_CKPT=0


# Training Configs:
# Model: size, to choose from 8b, 70b, 405b
export SIZE="8b"
# Dataloader: Global batch size
export GBS=1152
# Dataloader: Micro batch size
export MBS=1
# Dataloader: Max run N batches, optional
# If an empty string is provided (""), then the training will continue until time limit
# If we want to save a checkpoint, then this value must be set
export MAX_STEPS="400"
export EVAL_EVERY="11520" # skip 5 first evals
export START_EVAL_AT="23040"

# Experiment: starting steps
# This is the starting "offset" step from the checkpoint.
# For instance, if you are resuming from a checkpoint folder `checkpoint-par-0-20-steps/checkpoint`,
# which means that the model is trained for 20 steps to generate the checkpoint,
# then the value 20 is needed here.
export START_STEPS="0"
# Experiment manager: Number of experiments to launch
export NEXP=1
# Experiment manager: how many consecutive jobs we want for each experiment
export NPAR=1
# Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
# To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
export SEEDS="14932"
11 changes: 9 additions & 2 deletions large_language_model_pretraining/nemo/pretrain_llama31.py
@@ -25,6 +25,8 @@
from nemo.collections.llm.gpt.data import build_pretraining_datamodule
from callbacks import PreemptiveStop, MLPerfCallback, MetricsLogger

print("USING LOCAL COPY")

def slurm_executor(
user: str,
host: str,
@@ -93,6 +95,7 @@ def get_pretrain(
nnodes: int,
ngpus_per_node: int,
data_module: run.Config,
start_eval_at: Optional[int]=None,
eval_every: Optional[int]=None,
eval_batches: Optional[int]=None,
) -> run.Partial:
@@ -180,7 +183,7 @@ def get_pretrain(
pretrain.trainer.max_steps = math.ceil(max_tokens / 8192 / gbs)

pretrain.data = data_module
pretrain.trainer.val_check_interval = eval_every
pretrain.trainer.val_check_interval = start_eval_at
pretrain.trainer.limit_val_batches = eval_batches
pretrain.trainer.limit_test_batches = eval_batches

@@ -300,7 +303,8 @@ def get_parser() -> argparse.ArgumentParser:

data_group.add_argument("--gbs", type=int, default=1152, help="Global batch size, should be divisible by PP")
data_group.add_argument("--mbs", type=int, default=1, help="Micro batch size")
data_group.add_argument("--eval_every", type=int, default=46080, help="Evaluate at least every N training sequences")
data_group.add_argument("--start_eval_at", type=int, default=262144, help="Start evaluating at N training sequences")
data_group.add_argument("--eval_every", type=int, default=16384, help="Evaluate at least every N training sequences")
data_group.add_argument("--eval_tokens", type=int, default=5760, help="Evaluate using at least N evaluation sequences")
data_group.add_argument('--max_steps', type=int, default=None, help="Maximum number of steps that each experiment partition will train on. None means no restriction on max steps. ")
data_group.add_argument("--use_full_dataset", action="store_true", help="If set, then we use the full dataset, instead of the last 256/1024 shards")
@@ -352,6 +356,7 @@ def get_parser() -> argparse.ArgumentParser:
use_full_dataset=args.use_full_dataset,
)

start_eval_at = math.ceil(args.start_eval_at / args.gbs)
eval_every_n_batches = math.ceil(args.eval_every / (args.gbs))
eval_batches = math.ceil(args.eval_tokens / (args.gbs))

@@ -360,6 +365,7 @@ def get_parser() -> argparse.ArgumentParser:
nnodes=args.nodes,
ngpus_per_node=args.gpus_per_node,
data_module=data,
start_eval_at=start_eval_at,
eval_every=eval_every_n_batches,
eval_batches=eval_batches,
)
@@ -497,6 +503,7 @@ def get_parser() -> argparse.ArgumentParser:
micro_batch_size=args.mbs,
sequence_length=8192,
init_global_step=start_step,
eval_every=eval_every_n_batches,
configs=configs,
),
]
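Putting the pretrain_llama31.py pieces together: the CLI takes --start_eval_at and --eval_every in training sequences, converts both to batches, hands the first-eval point to the trainer, and hands the recurring cadence to the MLPerf callback. A sketch with the new argparse defaults (the trailing comments summarize the wiring shown in the diff; they are not extra executable code):

    import math

    gbs = 1152                    # --gbs default
    start_eval_at_seqs = 262144   # --start_eval_at default (training sequences)
    eval_every_seqs = 16384       # --eval_every default (training sequences)

    start_eval_at = math.ceil(start_eval_at_seqs / gbs)       # 228 batches
    eval_every_n_batches = math.ceil(eval_every_seqs / gbs)    # 15 batches

    # pretrain.trainer.val_check_interval = start_eval_at      -> delays the first eval
    # MLPerfCallback(eval_every=eval_every_n_batches, ...)     -> restores the recurring
    #                                                             cadence once the first eval starts
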
2 changes: 2 additions & 0 deletions large_language_model_pretraining/nemo/run_llama31.sh
@@ -139,4 +139,6 @@ python3 pretrain_llama31.py \
--step_time_atol $STEP_TIME_ATOL \
--ckpt_start_step $START_STEPS \
--max_retries $MAX_RETRIES \
--eval_every $EVAL_EVERY \
--start_eval_at $START_EVAL_AT \
$CMD_SUFFIX