From daea3228fdbba08b0d38ed48d16aac890f3ec029 Mon Sep 17 00:00:00 2001
From: michalm
Date: Thu, 19 Jun 2025 07:17:26 +0000
Subject: [PATCH 01/11] change eval sched

---
 .../nemo/config.sh           |  39 ++++---
 .../nemo/config_8b.sh        | 102 ++++++++++++++++++
 .../nemo/pretrain_llama31.py |   2 +-
 .../nemo/run_llama31.sh      |   1 +
 4 files changed, 127 insertions(+), 17 deletions(-)
 create mode 100644 large_language_model_pretraining/nemo/config_8b.sh

diff --git a/large_language_model_pretraining/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh
index 5df2a222b..b5f865f43 100644
--- a/large_language_model_pretraining/nemo/config.sh
+++ b/large_language_model_pretraining/nemo/config.sh
@@ -12,43 +12,49 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+TODAY_DATE="$(date +'%y%m%d')"
+SUFFIX="gbs1152"
+EXP_DIR="${TODAY_DATE}/${SUFFIX}"
+
+export TAG="20250617"
+
 # SSH: username that connects to the remote cluster
-export USER=""
+export USER="michalm"
 # SSH: remote cluster URL
-export HOST=""
+export HOST="cw-dfw-cs-001-login-01.nvidia.com"
 # Slurm: account for job submission
-export ACCOUNT=""
+export ACCOUNT="coreai_mlperf_training"
 # Slurm: partition for job submission
-export PARTITION=""
+export PARTITION="batch"
 # Slurm: job time limit, defaults to 4 hours
 export TIME="04:00:00"
 # Slurm: --nodes arguments, default to use 288 nodes
-export NNODES=288
+export NNODES=144
 # Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
 export GPUS_PER_NODE=8
 # Slurm: max job retries for transient job failures, defaults to retry 3 times
-export MAX_RETRIES=3
+export MAX_RETRIES=0
 
 # Folder mapping:
 # Output directory that holds logs, any path that you like.
-export JOB_DIR=""
+export JOB_DIR="/lustre/fsw/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
 # Image / container path, either local cache file or remote URL
-export IMAGE=""
+export IMAGE="/lustre/fsw/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
 # Dataset: C4 dataset location that contains the dataset after preprocessing
 # This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
-export PREPROCESSED_PATH=""
+export PREPROCESSED_PATH="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
 # Dataset: Numpy index working directory, contains shuffled dataset
 # This path must be able to hold >400GB data
-export TMP_NPY_INDEX=""
+export TMP_NPY_INDEX="/lustre/fsw/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
 # Dataset: Tokenizer path
 # This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
-export TOKENIZER_PATH=""
+export TOKENIZER_PATH="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
 
 # Model: checkpoint and tokenizer path
 # This is the checkpoint that we want to start with.
 # Each checkpoint should be a folder containing two sub-folders: context and weights.
 # And we need to pass this folder's path (the folder containing context and weights) here.
-export MODEL_CKPT=""
+export MODEL_CKPT="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
 # Model: Continual checkpoint directory to write and resume
 # This is the directory to hold all intermediate checkpoints.
 # Once a run is complete and we specify to save checkpoints,
@@ -57,9 +63,9 @@ export MODEL_CKPT=""
 # Inside this directory, there should be a `checkpoint` directory that holds context and weights
 # which is the "actual checkpoint".
 # Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
-export CONTINUAL_CKPT=""
+export CONTINUAL_CKPT="/lustre/fsw/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
 # Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
-export USE_CKPT=0
+export USE_CKPT=1
 # Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
 # If set to 1, then checkpoint resuming code will not try to load the optimizer states.
 export FROM_HF=1
@@ -77,7 +83,8 @@ export MBS=1
 # Dataloader: Max run N batches, optional
 # If an empty string is provided (""), then the training will continue until time limit
 # If we want to save a checkpoint, then this value must be set
-export MAX_STEPS=""
+export MAX_STEPS="400"
+export EVAL_EVERY="230400" # skip 5 first evals
 
 # Experiment: starting steps
 # This is the starting "offset" step from the checkpoint.
@@ -92,4 +99,4 @@ export NPAR=1
 # Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
 # The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
 # To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
-export SEEDS=""
\ No newline at end of file
+export SEEDS="14932"
diff --git a/large_language_model_pretraining/nemo/config_8b.sh b/large_language_model_pretraining/nemo/config_8b.sh
new file mode 100644
index 000000000..7817fa8bd
--- /dev/null
+++ b/large_language_model_pretraining/nemo/config_8b.sh
@@ -0,0 +1,102 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+TODAY_DATE="$(date +'%y%m%d')"
+SUFFIX="gbs896"
+EXP_DIR="${TODAY_DATE}/${SUFFIX}"
+
+export TAG="20250619"
+
+# SSH: username that connects to the remote cluster
+export USER="michalm"
+# SSH: remote cluster URL
+export HOST="cw-dfw-cs-001-login-01.nvidia.com"
+# Slurm: account for job submission
+export ACCOUNT="coreai_mlperf_training"
+# Slurm: partition for job submission
+export PARTITION="batch"
+# Slurm: job time limit, defaults to 4 hours
+export TIME="01:00:00"
+# Slurm: --nodes arguments, default to use 288 nodes
+export NNODES=8
+# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
+export GPUS_PER_NODE=8
+# Slurm: max job retries for transient job failures, defaults to retry 3 times
+export MAX_RETRIES=0
+
+# Folder mapping:
+# Output directory that holds logs, any path that you like.
+export JOB_DIR="/lustre/fsw/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
+# Image / container path, either local cache file or remote URL
+export IMAGE="/lustre/fsw/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
+# Dataset: C4 dataset location that contains the dataset after preprocessing
+# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
+export PREPROCESSED_PATH="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
+# Dataset: Numpy index working directory, contains shuffled dataset
+# This path must be able to hold >400GB data
+export TMP_NPY_INDEX="/lustre/fsw/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
+# Dataset: Tokenizer path
+# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
+export TOKENIZER_PATH="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
+
+# Model: checkpoint and tokenizer path
+# This is the checkpoint that we want to start with.
+# Each checkpoint should be a folder containing two sub-folders: context and weights.
+# And we need to pass this folder's path (the folder containing context and weights) here.
+export MODEL_CKPT="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
+# Model: Continual checkpoint directory to write and resume
+# This is the directory to hold all intermediate checkpoints.
+# Once a run is complete and we specify to save checkpoints,
+# we should see a checkpoint written in this folder
+# with name `checkpoint-par-x-y-steps`
+# Inside this directory, there should be a `checkpoint` directory that holds context and weights
+# which is the "actual checkpoint".
+# Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
+export CONTINUAL_CKPT="/lustre/fsw/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
+# Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
+export USE_CKPT=0
+# Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
+# If set to 1, then checkpoint resuming code will not try to load the optimizer states.
+export FROM_HF=1
+# Model: Whether we want to save a checkpoint. Must be 1 if NPAR > 1. If 1, then we save a checkpoint at the end.
+export SAVE_CKPT=0
+
+
+# Training Configs:
+# Model: size, to choose from 8b, 70b, 405b
+export SIZE="8b"
+# Dataloader: Global batch size
+export GBS=896
+# Dataloader: Micro batch size
+export MBS=1
+# Dataloader: Max run N batches, optional
+# If an empty string is provided (""), then the training will continue until time limit
+# If we want to save a checkpoint, then this value must be set
+export MAX_STEPS="400"
+export EVAL_EVERY="230400" # skip 5 first evals
+
+# Experiment: starting steps
+# This is the starting "offset" step from the checkpoint.
+# For instance, if you are resuming from a checkpoint folder `checkpoint-par-0-20-steps/checkpoint`,
+# which means that the model is trained for 20 steps to generate the checkpoint,
+# then the value 20 is needed here.
+export START_STEPS="0"
+# Experiment manager: Number of experiments to launch
+export NEXP=1
+# Experiment manager: how many consecutive jobs we want for each experiment
+export NPAR=1
+# Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
+# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
+# To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
+export SEEDS="14932"
diff --git a/large_language_model_pretraining/nemo/pretrain_llama31.py b/large_language_model_pretraining/nemo/pretrain_llama31.py
index 65df8a8a9..891b22176 100644
--- a/large_language_model_pretraining/nemo/pretrain_llama31.py
+++ b/large_language_model_pretraining/nemo/pretrain_llama31.py
@@ -75,7 +75,7 @@ def slurm_executor(
         mem="0",
         exclusive=True,
         gres="gpu:8",
-        packager=run.GitArchivePackager(),
+        packager=run.GitArchivePackager(subpath="large_language_model_pretraining/nemo", ref="HEAD"),
         dependencies=dependencies,
     )
 
diff --git a/large_language_model_pretraining/nemo/run_llama31.sh b/large_language_model_pretraining/nemo/run_llama31.sh
index cc8d91021..d8e7bc695 100644
--- a/large_language_model_pretraining/nemo/run_llama31.sh
+++ b/large_language_model_pretraining/nemo/run_llama31.sh
@@ -139,4 +139,5 @@ python3 pretrain_llama31.py \
 --step_time_atol $STEP_TIME_ATOL \
 --ckpt_start_step $START_STEPS \
 --max_retries $MAX_RETRIES \
+--eval_every $EVAL_EVERY \
 $CMD_SUFFIX

From e45d39aad59b49ad48b9b82b629be46344fa30b3 Mon Sep 17 00:00:00 2001
From: michalm
Date: Thu, 19 Jun 2025 19:17:43 +0000
Subject: [PATCH 02/11] change eval sched

---
 large_language_model_pretraining/nemo/callbacks.py | 9 ++++++++-
 .../nemo/pretrain_llama31.py                       | 2 ++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/large_language_model_pretraining/nemo/callbacks.py b/large_language_model_pretraining/nemo/callbacks.py
index 7d4ad2439..ea41458c9 100644
--- a/large_language_model_pretraining/nemo/callbacks.py
+++ b/large_language_model_pretraining/nemo/callbacks.py
@@ -201,9 +201,16 @@ def on_train_end(self, trainer, pl_module):
         return super().on_train_end(trainer, pl_module)
 
     @rank_zero_only
-    def on_validation_start(self, trainer, pl_module):
+    def log_eval_start(self, trainer, pl_module):
         mllogger.end(key=constants.BLOCK_STOP, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
         mllogger.start(key=constants.EVAL_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
+
+    def on_validation_start(self, trainer, pl_module):
+        print("changing eval freq")
+        trainer.val_check_interval = 20
+        trainer.val_check_batch = 20
+        self.log_eval_start(trainer, pl_module)
+
         return super().on_validation_start(trainer, pl_module)
 
     def on_validation_end(self, trainer, pl_module):
diff --git a/large_language_model_pretraining/nemo/pretrain_llama31.py b/large_language_model_pretraining/nemo/pretrain_llama31.py
index 891b22176..843d67417 100644
--- a/large_language_model_pretraining/nemo/pretrain_llama31.py
+++ b/large_language_model_pretraining/nemo/pretrain_llama31.py
@@ -25,6 +25,8 @@
 from nemo.collections.llm.gpt.data import build_pretraining_datamodule
 from callbacks import PreemptiveStop, MLPerfCallback, MetricsLogger
 
+print("USING LOCAL COPY")
+
 def slurm_executor(
     user: str,
     host: str,

From 01aeed0f5fbef973de1927ffee88e65ab28f5667 Mon Sep 17 00:00:00 2001
From: michalm
Date: Fri, 20 Jun 2025 07:43:00 +0000
Subject: [PATCH 03/11] properly plumb eval sched

---
 large_language_model_pretraining/nemo/callbacks.py | 9 ++++-----
 large_language_model_pretraining/nemo/config.sh    | 1 +
 .../nemo/pretrain_llama31.py                       | 9 +++++++--
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/large_language_model_pretraining/nemo/callbacks.py b/large_language_model_pretraining/nemo/callbacks.py
index ea41458c9..99ab4d219 100644
--- a/large_language_model_pretraining/nemo/callbacks.py
+++ b/large_language_model_pretraining/nemo/callbacks.py
@@ -159,6 +159,7 @@ def __init__(
         micro_batch_size,
         sequence_length,
         init_global_step,
+        eval_every,
         configs={}
     ):
         mllogger.event(key=constants.CACHE_CLEAR, value=True)
@@ -169,6 +170,7 @@ def __init__(
         self.gbs = global_batch_size
         self.mbs = micro_batch_size
         self.seq_len = sequence_length
+        self.eval_every = eval_every
 
         self.is_target_reached = False
         self.status = constants.ABORTED
@@ -185,7 +187,6 @@ def set_success_status(self):
     def on_train_epoch_start(self, trainer, pl_module):
         mllogger.start(key=constants.EPOCH_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
         mllogger.start(key=constants.BLOCK_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
-
         return super().on_train_epoch_start(trainer, pl_module)
 
     @rank_zero_only
@@ -206,11 +207,9 @@ def log_eval_start(self, trainer, pl_module):
         mllogger.start(key=constants.EVAL_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
 
     def on_validation_start(self, trainer, pl_module):
-        print("changing eval freq")
-        trainer.val_check_interval = 20
-        trainer.val_check_batch = 20
+        trainer.val_check_interval = self.eval_every
+        trainer.val_check_batch = self.eval_every
         self.log_eval_start(trainer, pl_module)
-
         return super().on_validation_start(trainer, pl_module)
 
     def on_validation_end(self, trainer, pl_module):
diff --git a/large_language_model_pretraining/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh
index b5f865f43..7e324530f 100644
--- a/large_language_model_pretraining/nemo/config.sh
+++ b/large_language_model_pretraining/nemo/config.sh
@@ -84,6 +84,7 @@ export MBS=1
 # If an empty string is provided (""), then the training will continue until time limit
 # If we want to save a checkpoint, then this value must be set
 export MAX_STEPS="400"
+export START_EVAL_AT="230400"
 export EVAL_EVERY="230400" # skip 5 first evals
 
 # Experiment: starting steps
 # This is the starting "offset" step from the checkpoint.
diff --git a/large_language_model_pretraining/nemo/pretrain_llama31.py b/large_language_model_pretraining/nemo/pretrain_llama31.py
index 843d67417..7f3765804 100644
--- a/large_language_model_pretraining/nemo/pretrain_llama31.py
+++ b/large_language_model_pretraining/nemo/pretrain_llama31.py
@@ -95,6 +95,7 @@ def get_pretrain(
     nnodes: int,
     ngpus_per_node: int,
     data_module: run.Config,
+    start_eval_at: Optional[int]=None,
     eval_every: Optional[int]=None,
     eval_batches: Optional[int]=None,
 ) -> run.Partial:
@@ -182,7 +183,7 @@ def get_pretrain(
     pretrain.trainer.max_steps = math.ceil(max_tokens / 8192 / gbs)
 
     pretrain.data = data_module
-    pretrain.trainer.val_check_interval = eval_every
+    pretrain.trainer.val_check_interval = start_eval_at
     pretrain.trainer.limit_val_batches = eval_batches
     pretrain.trainer.limit_test_batches = eval_batches
 
@@ -302,7 +303,8 @@ def get_parser() -> argparse.ArgumentParser:
     data_group.add_argument("--gbs", type=int, default=1152, help="Global batch size, should be divisible by PP")
     data_group.add_argument("--mbs", type=int, default=1, help="Micro batch size")
-    data_group.add_argument("--eval_every", type=int, default=46080, help="Evaluate at least every N training sequences")
+    data_group.add_argument("--start_eval_at", type=int, default=262144, help="Start evaluating at N training sequences")
+    data_group.add_argument("--eval_every", type=int, default=16384, help="Evaluate at least every N training sequences")
     data_group.add_argument("--eval_tokens", type=int, default=5760, help="Evaluate using at least N evaluation sequences")
     data_group.add_argument('--max_steps', type=int, default=None, help="Maximum number of steps that each experiment partition will train on. None means no restriction on max steps. ")
     data_group.add_argument("--use_full_dataset", action="store_true", help="If set, then we use the full dataset, instead of the last 256/1024 shards")
@@ -354,6 +356,7 @@ def get_parser() -> argparse.ArgumentParser:
         use_full_dataset=args.use_full_dataset,
     )
 
+    start_eval_at = math.ceil(args.start_eval_at / args.gbs)
     eval_every_n_batches = math.ceil(args.eval_every / (args.gbs))
     eval_batches = math.ceil(args.eval_tokens / (args.gbs))
 
@@ -362,6 +365,7 @@ def get_parser() -> argparse.ArgumentParser:
         nnodes=args.nodes,
         ngpus_per_node=args.gpus_per_node,
         data_module=data,
+        start_eval_at=start_eval_at,
         eval_every=eval_every_n_batches,
         eval_batches=eval_batches,
     )
@@ -499,6 +503,7 @@ def get_parser() -> argparse.ArgumentParser:
             micro_batch_size=args.mbs,
             sequence_length=8192,
             init_global_step=start_step,
+            eval_every=eval_every_n_batches,
             configs=configs,
         ),
     ]

From 965b1f7108af2de332ba16a83445abe262ac28ae Mon Sep 17 00:00:00 2001
From: michalm
Date: Fri, 20 Jun 2025 07:45:36 +0000
Subject: [PATCH 04/11] properly plumb eval sched

---
 large_language_model_pretraining/nemo/config.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/large_language_model_pretraining/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh
index 7e324530f..691c7437f 100644
--- a/large_language_model_pretraining/nemo/config.sh
+++ b/large_language_model_pretraining/nemo/config.sh
@@ -84,8 +84,8 @@ export MBS=1
 # If an empty string is provided (""), then the training will continue until time limit
 # If we want to save a checkpoint, then this value must be set
 export MAX_STEPS="400"
-export START_EVAL_AT="230400"
-export EVAL_EVERY="230400" # skip 5 first evals
+export START_EVAL_AT="23040"
+export EVAL_EVERY="11520" # skip 5 first evals
 
 # Experiment: starting steps
 # This is the starting "offset" step from the checkpoint.
From 57bce47e819bcb0e6af5ffb50fa65f35a073d0fb Mon Sep 17 00:00:00 2001
From: michalm
Date: Fri, 20 Jun 2025 17:17:06 +0000
Subject: [PATCH 05/11] add start eval at

---
 large_language_model_pretraining/nemo/run_llama31.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/large_language_model_pretraining/nemo/run_llama31.sh b/large_language_model_pretraining/nemo/run_llama31.sh
index d8e7bc695..f1bdfd99a 100644
--- a/large_language_model_pretraining/nemo/run_llama31.sh
+++ b/large_language_model_pretraining/nemo/run_llama31.sh
@@ -140,4 +140,5 @@ python3 pretrain_llama31.py \
 --ckpt_start_step $START_STEPS \
 --max_retries $MAX_RETRIES \
 --eval_every $EVAL_EVERY \
+--start_eval_at $START_EVAL_AT \
 $CMD_SUFFIX

From 736c3efbc7547bca2685445c4553473d6c6f0d27 Mon Sep 17 00:00:00 2001
From: michalm
Date: Fri, 20 Jun 2025 17:29:41 +0000
Subject: [PATCH 06/11] add start eval at

---
 large_language_model_pretraining/nemo/config_8b.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/large_language_model_pretraining/nemo/config_8b.sh b/large_language_model_pretraining/nemo/config_8b.sh
index 7817fa8bd..856fcb5c4 100644
--- a/large_language_model_pretraining/nemo/config_8b.sh
+++ b/large_language_model_pretraining/nemo/config_8b.sh
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 TODAY_DATE="$(date +'%y%m%d')"
-SUFFIX="gbs896"
+SUFFIX="gbs1152"
 EXP_DIR="${TODAY_DATE}/${SUFFIX}"
 
-export TAG="20250619"
+export TAG="20250620"
 
 # SSH: username that connects to the remote cluster
 export USER="michalm"
@@ -77,14 +77,15 @@ export SAVE_CKPT=0
 # Model: size, to choose from 8b, 70b, 405b
 export SIZE="8b"
 # Dataloader: Global batch size
-export GBS=896
+export GBS=1152
 # Dataloader: Micro batch size
 export MBS=1
 # Dataloader: Max run N batches, optional
 # If an empty string is provided (""), then the training will continue until time limit
 # If we want to save a checkpoint, then this value must be set
 export MAX_STEPS="400"
-export EVAL_EVERY="230400" # skip 5 first evals
+export EVAL_EVERY="11520" # skip 5 first evals
+export START_EVAL_AT="23040"
 
 # Experiment: starting steps
 # This is the starting "offset" step from the checkpoint.
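The mechanism the two knobs drive is split across patches 02 and 03: the Trainer is launched with `val_check_interval` set to the converted START_EVAL_AT, and the first time validation actually fires, the callback lowers the interval to the converted EVAL_EVERY. A standalone sketch of that two-phase schedule, assuming the Lightning 2.x import path; the class name is hypothetical, and the repo's MetricsLogger does this inside its own on_validation_start alongside MLPerf logging:

```python
from lightning.pytorch.callbacks import Callback  # import path is an assumption


class TwoPhaseEvalSchedule(Callback):  # hypothetical name, not the repo's class
    def __init__(self, eval_every_batches: int):
        # Cadence (in optimizer steps) to switch to after the delayed first eval.
        self.eval_every = eval_every_batches

    def on_validation_start(self, trainer, pl_module):
        # The trainer starts with val_check_interval = start_eval_at, so this
        # hook first fires late; from here on, validate every eval_every steps.
        # val_check_batch is the per-batch counter the loop compares against.
        trainer.val_check_interval = self.eval_every
        trainer.val_check_batch = self.eval_every
```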
From 64eb721224bf8e435d6b3d866c20ce82c0d69993 Mon Sep 17 00:00:00 2001
From: michalm
Date: Fri, 20 Jun 2025 17:47:01 +0000
Subject: [PATCH 07/11] 8b works with start_eval_at

---
 large_language_model_pretraining/nemo/config_8b.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/large_language_model_pretraining/nemo/config_8b.sh b/large_language_model_pretraining/nemo/config_8b.sh
index 856fcb5c4..e721b332b 100644
--- a/large_language_model_pretraining/nemo/config_8b.sh
+++ b/large_language_model_pretraining/nemo/config_8b.sh
@@ -18,6 +18,7 @@ EXP_DIR="${TODAY_DATE}/${SUFFIX}"
 
 export TAG="20250620"
 
+
 # SSH: username that connects to the remote cluster
 export USER="michalm"
 # SSH: remote cluster URL

From 7e445d79ab811d401170630c108dd1ba1e4ed7d9 Mon Sep 17 00:00:00 2001
From: michalm
Date: Sun, 29 Jun 2025 16:29:59 +0000
Subject: [PATCH 08/11] fix paths

---
 large_language_model_pretraining/nemo/config.sh | 14 +++++++-------
 .../nemo/config_8b.sh                           | 16 ++++++++--------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/large_language_model_pretraining/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh
index 691c7437f..a83eda2f7 100644
--- a/large_language_model_pretraining/nemo/config.sh
+++ b/large_language_model_pretraining/nemo/config.sh
@@ -37,24 +37,24 @@ export MAX_RETRIES=0
 
 # Folder mapping:
 # Output directory that holds logs, any path that you like.
-export JOB_DIR="/lustre/fsw/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
+export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
 # Image / container path, either local cache file or remote URL
-export IMAGE="/lustre/fsw/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
+export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
 # Dataset: C4 dataset location that contains the dataset after preprocessing
 # This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
-export PREPROCESSED_PATH="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
+export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
 # Dataset: Numpy index working directory, contains shuffled dataset
 # This path must be able to hold >400GB data
-export TMP_NPY_INDEX="/lustre/fsw/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
+export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
 # Dataset: Tokenizer path
 # This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
-export TOKENIZER_PATH="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
+export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
 
 # Model: checkpoint and tokenizer path
 # This is the checkpoint that we want to start with.
 # Each checkpoint should be a folder containing two sub-folders: context and weights.
 # And we need to pass this folder's path (the folder containing context and weights) here.
-export MODEL_CKPT="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
+export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
 # Model: Continual checkpoint directory to write and resume
 # This is the directory to hold all intermediate checkpoints.
 # Once a run is complete and we specify to save checkpoints,
@@ -63,7 +63,7 @@ export MODEL_CKPT="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training
 # Inside this directory, there should be a `checkpoint` directory that holds context and weights
 # which is the "actual checkpoint".
 # Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
-export CONTINUAL_CKPT="/lustre/fsw/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
+export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
 # Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
 export USE_CKPT=1
 # Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
diff --git a/large_language_model_pretraining/nemo/config_8b.sh b/large_language_model_pretraining/nemo/config_8b.sh
index e721b332b..43d94396e 100644
--- a/large_language_model_pretraining/nemo/config_8b.sh
+++ b/large_language_model_pretraining/nemo/config_8b.sh
@@ -16,7 +16,7 @@ TODAY_DATE="$(date +'%y%m%d')"
 SUFFIX="gbs1152"
 EXP_DIR="${TODAY_DATE}/${SUFFIX}"
 
-export TAG="20250620"
+export TAG="20250629"
 
 
 # SSH: username that connects to the remote cluster
 export USER="michalm"
@@ -38,24 +38,24 @@ export MAX_RETRIES=0
 
 # Folder mapping:
 # Output directory that holds logs, any path that you like.
-export JOB_DIR="/lustre/fsw/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
+export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
 # Image / container path, either local cache file or remote URL
-export IMAGE="/lustre/fsw/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
+export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
 # Dataset: C4 dataset location that contains the dataset after preprocessing
 # This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
-export PREPROCESSED_PATH="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
+export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
 # Dataset: Numpy index working directory, contains shuffled dataset
 # This path must be able to hold >400GB data
-export TMP_NPY_INDEX="/lustre/fsw/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
+export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
 # Dataset: Tokenizer path
 # This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
-export TOKENIZER_PATH="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
+export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
 
 # Model: checkpoint and tokenizer path
 # This is the checkpoint that we want to start with.
 # Each checkpoint should be a folder containing two sub-folders: context and weights.
 # And we need to pass this folder's path (the folder containing context and weights) here.
-export MODEL_CKPT="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
+export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
 # Model: Continual checkpoint directory to write and resume
 # This is the directory to hold all intermediate checkpoints.
 # Once a run is complete and we specify to save checkpoints,
@@ -64,7 +64,7 @@ export MODEL_CKPT="/lustre/fsw/portfolios/coreai/projects/coreai_mlperf_training
 # Inside this directory, there should be a `checkpoint` directory that holds context and weights
 # which is the "actual checkpoint".
 # Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
-export CONTINUAL_CKPT="/lustre/fsw/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
+export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
 # Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
 export USE_CKPT=0
 # Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).

From 0ce3c51ae3704a1cac0f7f2274717ab6b33f7863 Mon Sep 17 00:00:00 2001
From: michalm
Date: Mon, 30 Jun 2025 07:20:21 +0000
Subject: [PATCH 09/11] update config

---
 large_language_model_pretraining/nemo/config.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/large_language_model_pretraining/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh
index a83eda2f7..d659fa142 100644
--- a/large_language_model_pretraining/nemo/config.sh
+++ b/large_language_model_pretraining/nemo/config.sh
@@ -16,7 +16,7 @@ TODAY_DATE="$(date +'%y%m%d')"
 SUFFIX="gbs1152"
 EXP_DIR="${TODAY_DATE}/${SUFFIX}"
 
-export TAG="20250617"
+export TAG="20250630"
 
 # SSH: username that connects to the remote cluster
 export USER="michalm"
@@ -29,7 +29,7 @@ export PARTITION="batch"
 # Slurm: job time limit, defaults to 4 hours
 export TIME="04:00:00"
 # Slurm: --nodes arguments, default to use 288 nodes
-export NNODES=144
+export NNODES=288
 # Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
 export GPUS_PER_NODE=8
 # Slurm: max job retries for transient job failures, defaults to retry 3 times
@@ -84,8 +84,8 @@ export MBS=1
 # If an empty string is provided (""), then the training will continue until time limit
 # If we want to save a checkpoint, then this value must be set
 export MAX_STEPS="400"
-export START_EVAL_AT="23040"
-export EVAL_EVERY="11520" # skip 5 first evals
+export START_EVAL_AT="276480"
+export EVAL_EVERY="17280"
 
 # Experiment: starting steps
 # This is the starting "offset" step from the checkpoint.
@@ -101,3 +101,4 @@ export NPAR=1
 # The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
 # To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
 export SEEDS="14932"
+unset SEEDS

From e3926fb952110b789e5057e350b3c0820f141713 Mon Sep 17 00:00:00 2001
From: michalm
Date: Mon, 14 Jul 2025 03:37:26 +0000
Subject: [PATCH 10/11] update 2k config

---
 large_language_model_pretraining/nemo/config.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/large_language_model_pretraining/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh
index d659fa142..ed942575a 100644
--- a/large_language_model_pretraining/nemo/config.sh
+++ b/large_language_model_pretraining/nemo/config.sh
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 TODAY_DATE="$(date +'%y%m%d')"
-SUFFIX="gbs1152"
+SUFFIX="gbs2304"
 EXP_DIR="${TODAY_DATE}/${SUFFIX}"
 
-export TAG="20250630"
+export TAG="20250714"
 
 # SSH: username that connects to the remote cluster
 export USER="michalm"
@@ -77,15 +77,15 @@ export SAVE_CKPT=0
 # Model: size, to choose from 8b, 70b, 405b
 export SIZE="405b"
 # Dataloader: Global batch size
-export GBS=1152
+export GBS=2304
 # Dataloader: Micro batch size
 export MBS=1
 # Dataloader: Max run N batches, optional
 # If an empty string is provided (""), then the training will continue until time limit
 # If we want to save a checkpoint, then this value must be set
 export MAX_STEPS="400"
-export START_EVAL_AT="276480"
-export EVAL_EVERY="17280"
+export START_EVAL_AT="331776"
+export EVAL_EVERY="18432"
 
 # Experiment: starting steps
 # This is the starting "offset" step from the checkpoint.

From 4b936758d536906ba5c97f5a10485bc1b3014b61 Mon Sep 17 00:00:00 2001
From: michalm
Date: Mon, 21 Jul 2025 19:35:32 +0000
Subject: [PATCH 11/11] update

---
 large_language_model_pretraining/nemo/config.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/large_language_model_pretraining/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh
index ed942575a..d3e96ada5 100644
--- a/large_language_model_pretraining/nemo/config.sh
+++ b/large_language_model_pretraining/nemo/config.sh
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 TODAY_DATE="$(date +'%y%m%d')"
-SUFFIX="gbs2304"
+SUFFIX="gbs4608"
 EXP_DIR="${TODAY_DATE}/${SUFFIX}"
 
-export TAG="20250714"
+export TAG="20250630"
 
 # SSH: username that connects to the remote cluster
 export USER="michalm"
@@ -77,14 +77,14 @@ export SAVE_CKPT=0
 # Model: size, to choose from 8b, 70b, 405b
 export SIZE="405b"
 # Dataloader: Global batch size
-export GBS=2304
+export GBS=4608
 # Dataloader: Micro batch size
 export MBS=1
 # Dataloader: Max run N batches, optional
 # If an empty string is provided (""), then the training will continue until time limit
 # If we want to save a checkpoint, then this value must be set
-export MAX_STEPS="400"
-export START_EVAL_AT="331776"
+export MAX_STEPS=""
+export START_EVAL_AT="368640"
 export EVAL_EVERY="18432"
 
 # Experiment: starting steps
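Patches 09 through 11 retune the same run from GBS 1152 to 2304 and then 4608, keeping the sample-based eval boundaries exact multiples of GBS so the ceil() conversion lands on whole optimizer steps. A quick check over the triples those patches set, values copied from the diffs above:

```python
# (gbs, start_eval_at, eval_every) as set in patches 09, 10, and 11.
for gbs, start_at, every in [(1152, 276480, 17280),
                             (2304, 331776, 18432),
                             (4608, 368640, 18432)]:
    assert start_at % gbs == 0 and every % gbs == 0
    print(f"GBS={gbs}: first eval at step {start_at // gbs}, "
          f"then every {every // gbs} steps")
# GBS=1152: step 240, every 15; GBS=2304: step 144, every 8; GBS=4608: step 80, every 4.
```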