10 changes: 8 additions & 2 deletions large_language_model_pretraining/nemo/callbacks.py
@@ -159,6 +159,7 @@ def __init__(
micro_batch_size,
sequence_length,
init_global_step,
eval_every,
configs={}
):
mllogger.event(key=constants.CACHE_CLEAR, value=True)
@@ -169,6 +170,7 @@
self.gbs = global_batch_size
self.mbs = micro_batch_size
self.seq_len = sequence_length
self.eval_every = eval_every

self.is_target_reached = False
self.status = constants.ABORTED
@@ -185,7 +187,6 @@ def set_success_status(self):
def on_train_epoch_start(self, trainer, pl_module):
mllogger.start(key=constants.EPOCH_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
mllogger.start(key=constants.BLOCK_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})

return super().on_train_epoch_start(trainer, pl_module)

@rank_zero_only
@@ -201,9 +202,14 @@ def on_train_end(self, trainer, pl_module):
return super().on_train_end(trainer, pl_module)

@rank_zero_only
def on_validation_start(self, trainer, pl_module):
def log_eval_start(self, trainer, pl_module):
mllogger.end(key=constants.BLOCK_STOP, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
mllogger.start(key=constants.EVAL_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})

def on_validation_start(self, trainer, pl_module):
trainer.val_check_interval = self.eval_every
trainer.val_check_batch = self.eval_every
self.log_eval_start(trainer, pl_module)
return super().on_validation_start(trainer, pl_module)

def on_validation_end(self, trainer, pl_module):
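The callbacks.py change above implements a delayed first evaluation: pretrain_llama31.py builds the trainer with val_check_interval set to the first-eval point, and on_validation_start then rewrites the interval to the recurring cadence. A minimal sketch of that pattern, stripped of the MLPerf logging (the DelayedEvalCallback name is illustrative, and the lightning.pytorch import path is an assumption; older stacks import pytorch_lightning instead):

    from lightning.pytorch import Callback

    class DelayedEvalCallback(Callback):
        def __init__(self, eval_every: int):
            # Steady-state validation interval, expressed in training batches.
            self.eval_every = eval_every

        def on_validation_start(self, trainer, pl_module):
            # The trainer was constructed with val_check_interval = start_eval_at,
            # so the first validation only fires after that many batches.
            # From this point on, validate every `eval_every` batches instead.
            trainer.val_check_interval = self.eval_every
            trainer.val_check_batch = self.eval_every
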
39 changes: 24 additions & 15 deletions large_language_model_pretraining/nemo/config.sh
@@ -12,43 +12,49 @@
# See the License for the specific language governing permissions and
# limitations under the License.

TODAY_DATE="$(date +'%y%m%d')"
SUFFIX="gbs4608"
EXP_DIR="${TODAY_DATE}/${SUFFIX}"

export TAG="20250630"

# SSH: username that connects to the remote cluster
export USER=""
export USER="michalm"
# SSH: remote cluster URL
export HOST=""
export HOST="cw-dfw-cs-001-login-01.nvidia.com"
# Slurm: account for job submission
export ACCOUNT=""
export ACCOUNT="coreai_mlperf_training"
# Slurm: partition for job submission
export PARTITION=""
export PARTITION="batch"
# Slurm: job time limit, defaults to 4 hours
export TIME="04:00:00"
# Slurm: --nodes arguments, default to use 288 nodes
export NNODES=288
# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
export GPUS_PER_NODE=8
# Slurm: max job retries for transient job failures, defaults to retry 3 times
export MAX_RETRIES=3
export MAX_RETRIES=0

# Folder mapping:
# Output directory that holds logs, any path that you like.
export JOB_DIR=""
export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
# Image / container path, either local cache file or remote URL
export IMAGE=""
export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
# Dataset: C4 dataset location that contains the dataset after preprocessing
# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
export PREPROCESSED_PATH=""
export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
# Dataset: Numpy index working directory, contains shuffled dataset
# This path must be able to hold >400GB data
export TMP_NPY_INDEX=""
export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
# Dataset: Tokenizer path
# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
export TOKENIZER_PATH=""
export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"

# Model: checkpoint and tokenizer path
# This is the checkpoint that we want to start with.
# Each checkpoint should be a folder containing two sub-folders: context and weights.
# And we need to pass this folder's path (the folder containing context and weights) here.
export MODEL_CKPT=""
export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
# Model: Continual checkpoint directory to write and resume
# This is the directory to hold all intermediate checkpoints.
# Once a run is complete and we specify to save checkpoints,
@@ -57,9 +63,9 @@ export MODEL_CKPT=""
# Inside this directory, there should be a `checkpoint` directory that holds context and weights
# which is the "actual checkpoint".
# Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
export CONTINUAL_CKPT=""
export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
# Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
export USE_CKPT=0
export USE_CKPT=1
# Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
# If set to 1, then checkpoint resuming code will not try to load the optimizer states.
export FROM_HF=1
@@ -71,13 +77,15 @@ export SAVE_CKPT=0
# Model: size, to choose from 8b, 70b, 405b
export SIZE="405b"
# Dataloader: Global batch size
export GBS=1152
export GBS=4608
# Dataloader: Micro batch size
export MBS=1
# Dataloader: Max run N batches, optional
# If an empty string is provided (""), then the training will continue until time limit
# If we want to save a checkpoint, then this value must be set
export MAX_STEPS=""
export START_EVAL_AT="368640"
export EVAL_EVERY="18432"

# Experiment: starting steps
# This is the starting "offset" step from the checkpoint.
@@ -92,4 +100,5 @@ export NPAR=1
# Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
# To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
export SEEDS=""
export SEEDS="14932"
unset SEEDS
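For orientation, pretrain_llama31.py turns both of the new sequence-count knobs into batch counts with math.ceil(value / GBS). A quick sanity check of the cadence implied by this config (illustrative arithmetic, not part of the repo):

    import math

    gbs = 4608                              # GBS above
    first_eval = math.ceil(368640 / gbs)    # START_EVAL_AT -> 80 batches
    eval_interval = math.ceil(18432 / gbs)  # EVAL_EVERY    -> 4 batches
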
104 changes: 104 additions & 0 deletions large_language_model_pretraining/nemo/config_8b.sh
@@ -0,0 +1,104 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

TODAY_DATE="$(date +'%y%m%d')"
SUFFIX="gbs1152"
EXP_DIR="${TODAY_DATE}/${SUFFIX}"

export TAG="20250629"


# SSH: username that connects to the remote cluster
export USER="michalm"
# SSH: remote cluster URL
export HOST="cw-dfw-cs-001-login-01.nvidia.com"
# Slurm: account for job submission
export ACCOUNT="coreai_mlperf_training"
# Slurm: partition for job submission
export PARTITION="batch"
# Slurm: job time limit, defaults to 4 hours
export TIME="01:00:00"
# Slurm: --nodes arguments, default to use 288 nodes
export NNODES=8
# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
export GPUS_PER_NODE=8
# Slurm: max job retries for transient job failures, defaults to retry 3 times
export MAX_RETRIES=0

# Folder mapping:
# Output directory that holds logs, any path that you like.
export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
# Image / container path, either local cache file or remote URL
export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
# Dataset: C4 dataset location that contains the dataset after preprocessing
# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
# Dataset: Numpy index working directory, contains shuffled dataset
# This path must be able to hold >400GB data
export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
# Dataset: Tokenizer path
# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"

# Model: checkpoint and tokenizer path
# This is the checkpoint that we want to start with.
# Each checkpoint should be a folder containing two sub-folders: context and weights.
# And we need to pass this folder's path (the folder containing context and weights) here.
export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
# Model: Continual checkpoint directory to write and resume
# This is the directory to hold all intermediate checkpoints.
# Once a run is complete and we specify to save checkpoints,
# we should see a checkpoint written in this folder
# with name `checkpoint-par-x-y-steps`
# Inside this directory, there should be a `checkpoint` directory that holds context and weights
# which is the "actual checkpoint".
# Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
# Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
export USE_CKPT=0
# Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
# If set to 1, then checkpoint resuming code will not try to load the optimizer states.
export FROM_HF=1
# Model: Whether we want to save a checkpoint. Must be 1 if NPAR > 1. If 1, then we save a checkpoint at the end.
export SAVE_CKPT=0


# Training Configs:
# Model: size, to choose from 8b, 70b, 405b
export SIZE="8b"
# Dataloader: Global batch size
export GBS=1152
# Dataloader: Micro batch size
export MBS=1
# Dataloader: Max run N batches, optional
# If an empty string is provided (""), then the training will continue until time limit
# If we want to save a checkpoint, then this value must be set
export MAX_STEPS="400"
export EVAL_EVERY="11520" # skip 5 first evals
export START_EVAL_AT="23040"

# Experiment: starting steps
# This is the starting "offset" step from the checkpoint.
# For instance, if you are resuming from a checkpoint folder `checkpoint-par-0-20-steps/checkpoint`,
# which means that the model is trained for 20 steps to generate the checkpoint,
# then the value 20 is needed here.
export START_STEPS="0"
# Experiment manager: Number of experiments to launch
export NEXP=1
# Experiment manager: how many consecutive jobs we want for each experiment
export NPAR=1
# Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
# To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
export SEEDS="14932"
11 changes: 9 additions & 2 deletions large_language_model_pretraining/nemo/pretrain_llama31.py
@@ -25,6 +25,8 @@
from nemo.collections.llm.gpt.data import build_pretraining_datamodule
from callbacks import PreemptiveStop, MLPerfCallback, MetricsLogger

print("USING LOCAL COPY")

def slurm_executor(
user: str,
host: str,
@@ -93,6 +95,7 @@ def get_pretrain(
nnodes: int,
ngpus_per_node: int,
data_module: run.Config,
start_eval_at: Optional[int]=None,
eval_every: Optional[int]=None,
eval_batches: Optional[int]=None,
) -> run.Partial:
@@ -180,7 +183,7 @@ def get_pretrain(
pretrain.trainer.max_steps = math.ceil(max_tokens / 8192 / gbs)

pretrain.data = data_module
pretrain.trainer.val_check_interval = eval_every
pretrain.trainer.val_check_interval = start_eval_at
pretrain.trainer.limit_val_batches = eval_batches
pretrain.trainer.limit_test_batches = eval_batches

@@ -300,7 +303,8 @@ def get_parser() -> argparse.ArgumentParser:

data_group.add_argument("--gbs", type=int, default=1152, help="Global batch size, should be divisible by PP")
data_group.add_argument("--mbs", type=int, default=1, help="Micro batch size")
data_group.add_argument("--eval_every", type=int, default=46080, help="Evaluate at least every N training sequences")
data_group.add_argument("--start_eval_at", type=int, default=262144, help="Start evaluating at N training sequences")
data_group.add_argument("--eval_every", type=int, default=16384, help="Evaluate at least every N training sequences")
data_group.add_argument("--eval_tokens", type=int, default=5760, help="Evaluate using at least N evaluation sequences")
data_group.add_argument('--max_steps', type=int, default=None, help="Maximum number of steps that each experiment partition will train on. None means no restriction on max steps. ")
data_group.add_argument("--use_full_dataset", action="store_true", help="If set, then we use the full dataset, instead of the last 256/1024 shards")
@@ -352,6 +356,7 @@ def get_parser() -> argparse.ArgumentParser:
use_full_dataset=args.use_full_dataset,
)

start_eval_at = math.ceil(args.start_eval_at / args.gbs)
eval_every_n_batches = math.ceil(args.eval_every / (args.gbs))
eval_batches = math.ceil(args.eval_tokens / (args.gbs))

@@ -360,6 +365,7 @@ def get_parser() -> argparse.ArgumentParser:
nnodes=args.nodes,
ngpus_per_node=args.gpus_per_node,
data_module=data,
start_eval_at=start_eval_at,
eval_every=eval_every_n_batches,
eval_batches=eval_batches,
)
@@ -497,6 +503,7 @@ def get_parser() -> argparse.ArgumentParser:
micro_batch_size=args.mbs,
sequence_length=8192,
init_global_step=start_step,
eval_every=eval_every_n_batches,
configs=configs,
),
]
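Putting the pretrain_llama31.py pieces together: the CLI takes --start_eval_at and --eval_every in training sequences, converts both to batches, hands the first-eval point to the trainer, and hands the recurring cadence to the MLPerf callback. A sketch with the new argparse defaults (the trailing comments summarize the wiring shown in the diff; they are not extra executable code):

    import math

    gbs = 1152                    # --gbs default
    start_eval_at_seqs = 262144   # --start_eval_at default (training sequences)
    eval_every_seqs = 16384       # --eval_every default (training sequences)

    start_eval_at = math.ceil(start_eval_at_seqs / gbs)       # 228 batches
    eval_every_n_batches = math.ceil(eval_every_seqs / gbs)    # 15 batches

    # pretrain.trainer.val_check_interval = start_eval_at      -> delays the first eval
    # MLPerfCallback(eval_every=eval_every_n_batches, ...)     -> restores the recurring
    #                                                             cadence once the first eval starts
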
2 changes: 2 additions & 0 deletions large_language_model_pretraining/nemo/run_llama31.sh
@@ -139,4 +139,6 @@ python3 pretrain_llama31.py \
--step_time_atol $STEP_TIME_ATOL \
--ckpt_start_step $START_STEPS \
--max_retries $MAX_RETRIES \
--eval_every $EVAL_EVERY \
--start_eval_at $START_EVAL_AT \
$CMD_SUFFIX