diff --git a/large_language_model_pretraining/nemo/callbacks.py b/large_language_model_pretraining/nemo/callbacks.py
index 7d4ad2439..99ab4d219 100644
--- a/large_language_model_pretraining/nemo/callbacks.py
+++ b/large_language_model_pretraining/nemo/callbacks.py
@@ -159,6 +159,7 @@ def __init__(
         micro_batch_size,
         sequence_length,
         init_global_step,
+        eval_every,
         configs={}
     ):
         mllogger.event(key=constants.CACHE_CLEAR, value=True)
@@ -169,6 +170,7 @@ def __init__(
         self.gbs = global_batch_size
         self.mbs = micro_batch_size
         self.seq_len = sequence_length
+        self.eval_every = eval_every
 
         self.is_target_reached = False
         self.status = constants.ABORTED
@@ -185,7 +187,6 @@ def set_success_status(self):
     def on_train_epoch_start(self, trainer, pl_module):
         mllogger.start(key=constants.EPOCH_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
         mllogger.start(key=constants.BLOCK_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
-
         return super().on_train_epoch_start(trainer, pl_module)
 
     @rank_zero_only
@@ -201,9 +202,14 @@ def on_train_end(self, trainer, pl_module):
         return super().on_train_end(trainer, pl_module)
 
     @rank_zero_only
-    def on_validation_start(self, trainer, pl_module):
+    def log_eval_start(self, trainer, pl_module):
         mllogger.end(key=constants.BLOCK_STOP, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
         mllogger.start(key=constants.EVAL_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
+
+    def on_validation_start(self, trainer, pl_module):
+        trainer.val_check_interval = self.eval_every
+        trainer.val_check_batch = self.eval_every
+        self.log_eval_start(trainer, pl_module)
         return super().on_validation_start(trainer, pl_module)
 
     def on_validation_end(self, trainer, pl_module):
diff --git a/large_language_model_pretraining/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh
index 5df2a222b..d3e96ada5 100644
--- a/large_language_model_pretraining/nemo/config.sh
+++ b/large_language_model_pretraining/nemo/config.sh
@@ -12,14 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+TODAY_DATE="$(date +'%y%m%d')"
+SUFFIX="gbs4608"
+EXP_DIR="${TODAY_DATE}/${SUFFIX}"
+
+export TAG="20250630"
+
 # SSH: username that connects to the remote cluster
-export USER=""
+export USER="michalm"
 # SSH: remote cluster URL
-export HOST=""
+export HOST="cw-dfw-cs-001-login-01.nvidia.com"
 # Slurm: account for job submission
-export ACCOUNT=""
+export ACCOUNT="coreai_mlperf_training"
 # Slurm: partition for job submission
-export PARTITION=""
+export PARTITION="batch"
 # Slurm: job time limit, defaults to 4 hours
 export TIME="04:00:00"
 # Slurm: --nodes arguments, default to use 288 nodes
@@ -27,28 +33,28 @@ export NNODES=288
 # Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
 export GPUS_PER_NODE=8
 # Slurm: max job retries for transient job failures, defaults to retry 3 times
-export MAX_RETRIES=3
+export MAX_RETRIES=0
 
 # Folder mapping:
 # Output directory that holds logs, any path that you like.
-export JOB_DIR=""
+export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
 # Image / container path, either local cache file or remote URL
-export IMAGE=""
+export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
 # Dataset: C4 dataset location that contains the dataset after preprocessing
 # This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
-export PREPROCESSED_PATH=""
+export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
 # Dataset: Numpy index working directory, contains shuffled dataset
 # This path must be able to hold >400GB data
-export TMP_NPY_INDEX=""
+export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
 # Dataset: Tokenizer path
 # This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
-export TOKENIZER_PATH=""
+export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
 
 # Model: checkpoint and tokenizer path
 # This is the checkpoint that we want to start with.
 # Each checkpoint should be a folder containing two sub-folders: context and weights.
 # And we need to pass this folder's path (the folder containing context and weights) here.
-export MODEL_CKPT=""
+export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
 # Model: Continual checkpoint directory to write and resume
 # This is the directory to hold all intermediate checkpoints.
 # Once a run is complete and we specify to save checkpoints,
@@ -57,9 +63,9 @@ export MODEL_CKPT=""
 # Inside this directory, there should be a `checkpoint` directory that holds context and weights
 # which is the "actual checkpoint".
 # Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
-export CONTINUAL_CKPT=""
+export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
 # Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
-export USE_CKPT=0
+export USE_CKPT=1
 # Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
 # If set to 1, then checkpoint resuming code will not try to load the optimizer states.
 export FROM_HF=1
@@ -71,13 +77,15 @@ export SAVE_CKPT=0
 # Model: size, to choose from 8b, 70b, 405b
 export SIZE="405b"
 # Dataloader: Global batch size
-export GBS=1152
+export GBS=4608
 # Dataloader: Micro batch size
 export MBS=1
 # Dataloader: Max run N batches, optional
 # If an empty string is provided (""), then the training will continue until time limit
 # If we want to save a checkpoint, then this value must be set
 export MAX_STEPS=""
+export START_EVAL_AT="368640"
+export EVAL_EVERY="18432"
 
 # Experiment: starting steps
 # This is the starting "offset" step from the checkpoint.
@@ -92,4 +100,5 @@ export NPAR=1
 # Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
 # The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
 # To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
-export SEEDS=""
\ No newline at end of file
+export SEEDS="14932"
+unset SEEDS
diff --git a/large_language_model_pretraining/nemo/config_8b.sh b/large_language_model_pretraining/nemo/config_8b.sh
new file mode 100644
index 000000000..43d94396e
--- /dev/null
+++ b/large_language_model_pretraining/nemo/config_8b.sh
@@ -0,0 +1,104 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+TODAY_DATE="$(date +'%y%m%d')"
+SUFFIX="gbs1152"
+EXP_DIR="${TODAY_DATE}/${SUFFIX}"
+
+export TAG="20250629"
+
+
+# SSH: username that connects to the remote cluster
+export USER="michalm"
+# SSH: remote cluster URL
+export HOST="cw-dfw-cs-001-login-01.nvidia.com"
+# Slurm: account for job submission
+export ACCOUNT="coreai_mlperf_training"
+# Slurm: partition for job submission
+export PARTITION="batch"
+# Slurm: job time limit, defaults to 4 hours
+export TIME="01:00:00"
+# Slurm: --nodes arguments, default to use 288 nodes
+export NNODES=8
+# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
+export GPUS_PER_NODE=8
+# Slurm: max job retries for transient job failures, defaults to retry 3 times
+export MAX_RETRIES=0
+
+# Folder mapping:
+# Output directory that holds logs, any path that you like.
+export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
+# Image / container path, either local cache file or remote URL
+export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
+# Dataset: C4 dataset location that contains the dataset after preprocessing
+# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
+export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
+# Dataset: Numpy index working directory, contains shuffled dataset
+# This path must be able to hold >400GB data
+export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
+# Dataset: Tokenizer path
+# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
+export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
+
+# Model: checkpoint and tokenizer path
+# This is the checkpoint that we want to start with.
+# Each checkpoint should be a folder containing two sub-folders: context and weights.
+# And we need to pass this folder's path (the folder containing context and weights) here.
+export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
+# Model: Continual checkpoint directory to write and resume
+# This is the directory to hold all intermediate checkpoints.
+# Once a run is complete and we specify to save checkpoints,
+# we should see a checkpoint written in this folder
+# with name `checkpoint-par-x-y-steps`
+# Inside this directory, there should be a `checkpoint` directory that holds context and weights
+# which is the "actual checkpoint".
+# Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
+export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
+# Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
+export USE_CKPT=0
+# Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
+# If set to 1, then checkpoint resuming code will not try to load the optimizer states.
+export FROM_HF=1
+# Model: Whether we want to save a checkpoint. Must be 1 if NPAR > 1. If 1, then we save a checkpoint at the end.
+export SAVE_CKPT=0
+
+
+# Training Configs:
+# Model: size, to choose from 8b, 70b, 405b
+export SIZE="8b"
+# Dataloader: Global batch size
+export GBS=1152
+# Dataloader: Micro batch size
+export MBS=1
+# Dataloader: Max run N batches, optional
+# If an empty string is provided (""), then the training will continue until time limit
+# If we want to save a checkpoint, then this value must be set
+export MAX_STEPS="400"
+export EVAL_EVERY="11520" # skip 5 first evals
+export START_EVAL_AT="23040"
+
+# Experiment: starting steps
+# This is the starting "offset" step from the checkpoint.
+# For instance, if you are resuming from a checkpoint folder `checkpoint-par-0-20-steps/checkpoint`,
+# which means that the model is trained for 20 steps to generate the checkpoint,
+# then the value 20 is needed here.
+export START_STEPS="0"
+# Experiment manager: Number of experiments to launch
+export NEXP=1
+# Experiment manager: how many consecutive jobs we want for each experiment
+export NPAR=1
+# Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
+# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
+# To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
+export SEEDS="14932"
diff --git a/large_language_model_pretraining/nemo/pretrain_llama31.py b/large_language_model_pretraining/nemo/pretrain_llama31.py
index 891b22176..7f3765804 100644
--- a/large_language_model_pretraining/nemo/pretrain_llama31.py
+++ b/large_language_model_pretraining/nemo/pretrain_llama31.py
@@ -25,6 +25,8 @@
 from nemo.collections.llm.gpt.data import build_pretraining_datamodule
 from callbacks import PreemptiveStop, MLPerfCallback, MetricsLogger
 
+print("USING LOCAL COPY")
+
 def slurm_executor(
     user: str,
     host: str,
@@ -93,6 +95,7 @@ def get_pretrain(
     nnodes: int,
     ngpus_per_node: int,
     data_module: run.Config,
+    start_eval_at: Optional[int]=None,
     eval_every: Optional[int]=None,
     eval_batches: Optional[int]=None,
 ) -> run.Partial:
@@ -180,7 +183,7 @@ def get_pretrain(
     pretrain.trainer.max_steps = math.ceil(max_tokens / 8192 / gbs)
 
     pretrain.data = data_module
-    pretrain.trainer.val_check_interval = eval_every
+    pretrain.trainer.val_check_interval = start_eval_at
     pretrain.trainer.limit_val_batches = eval_batches
     pretrain.trainer.limit_test_batches = eval_batches
 
@@ -300,7 +303,8 @@ def get_parser() -> argparse.ArgumentParser:
 
     data_group.add_argument("--gbs", type=int, default=1152, help="Global batch size, should be divisible by PP")
     data_group.add_argument("--mbs", type=int, default=1, help="Micro batch size")
-    data_group.add_argument("--eval_every", type=int, default=46080, help="Evaluate at least every N training sequences")
+    data_group.add_argument("--start_eval_at", type=int, default=262144, help="Start evaluating at N training sequences")
+    data_group.add_argument("--eval_every", type=int, default=16384, help="Evaluate at least every N training sequences")
     data_group.add_argument("--eval_tokens", type=int, default=5760, help="Evaluate using at least N evaluation sequences")
     data_group.add_argument('--max_steps', type=int, default=None, help="Maximum number of steps that each experiment partition will train on. None means no restriction on max steps. ")
     data_group.add_argument("--use_full_dataset", action="store_true", help="If set, then we use the full dataset, instead of the last 256/1024 shards")
@@ -352,6 +356,7 @@ def get_parser() -> argparse.ArgumentParser:
         use_full_dataset=args.use_full_dataset,
     )
 
+    start_eval_at = math.ceil(args.start_eval_at / args.gbs)
     eval_every_n_batches = math.ceil(args.eval_every / (args.gbs))
     eval_batches = math.ceil(args.eval_tokens / (args.gbs))
 
@@ -360,6 +365,7 @@ def get_parser() -> argparse.ArgumentParser:
         nnodes=args.nodes,
         ngpus_per_node=args.gpus_per_node,
         data_module=data,
+        start_eval_at=start_eval_at,
         eval_every=eval_every_n_batches,
         eval_batches=eval_batches,
     )
@@ -497,6 +503,7 @@ def get_parser() -> argparse.ArgumentParser:
             micro_batch_size=args.mbs,
             sequence_length=8192,
             init_global_step=start_step,
+            eval_every=eval_every_n_batches,
             configs=configs,
         ),
     ]
diff --git a/large_language_model_pretraining/nemo/run_llama31.sh b/large_language_model_pretraining/nemo/run_llama31.sh
index cc8d91021..f1bdfd99a 100644
--- a/large_language_model_pretraining/nemo/run_llama31.sh
+++ b/large_language_model_pretraining/nemo/run_llama31.sh
@@ -139,4 +139,6 @@ python3 pretrain_llama31.py \
 --step_time_atol $STEP_TIME_ATOL \
 --ckpt_start_step $START_STEPS \
 --max_retries $MAX_RETRIES \
+--eval_every $EVAL_EVERY \
+--start_eval_at $START_EVAL_AT \
 $CMD_SUFFIX
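
Note on the eval-scheduling change above: the patch splits the old single --eval_every knob into two sequence-count flags. pretrain_llama31.py converts --start_eval_at and --eval_every from sequences to global batches with math.ceil(n / gbs), launches the trainer with val_check_interval set to the converted start_eval_at (so the first validation is deferred), and MLPerfCallback.on_validation_start then rewrites trainer.val_check_interval and trainer.val_check_batch to the converted eval_every, so later validations run at the shorter interval. The snippet below is a minimal sketch of that arithmetic only (the helper name eval_schedule_in_batches is illustrative and not part of the patch), using the values from the two configs:

import math

def eval_schedule_in_batches(start_eval_at: int, eval_every: int, gbs: int):
    """Mirror the math.ceil sequence->batch conversions added in pretrain_llama31.py."""
    first_eval = math.ceil(start_eval_at / gbs)  # used as the initial trainer.val_check_interval
    interval = math.ceil(eval_every / gbs)       # applied by the callback from the first validation on
    return first_eval, interval

# config.sh (405b): GBS=4608, START_EVAL_AT=368640, EVAL_EVERY=18432
print(eval_schedule_in_batches(368640, 18432, 4608))  # -> (80, 4)

# config_8b.sh: GBS=1152, START_EVAL_AT=23040, EVAL_EVERY=11520
print(eval_schedule_in_batches(23040, 11520, 1152))   # -> (20, 10)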