diff --git a/large_language_model_pretraining/nemo/callbacks.py b/large_language_model_pretraining/nemo/callbacks.py
index 7d4ad2439..99ab4d219 100644
--- a/large_language_model_pretraining/nemo/callbacks.py
+++ b/large_language_model_pretraining/nemo/callbacks.py
@@ -159,6 +159,7 @@ def __init__(
         micro_batch_size,
         sequence_length,
         init_global_step,
+        eval_every,
         configs={}
     ):
         mllogger.event(key=constants.CACHE_CLEAR, value=True)
@@ -169,6 +170,7 @@ def __init__(
         self.gbs = global_batch_size
         self.mbs = micro_batch_size
         self.seq_len = sequence_length
+        self.eval_every = eval_every
 
         self.is_target_reached = False
         self.status = constants.ABORTED
@@ -185,7 +187,6 @@ def set_success_status(self):
     def on_train_epoch_start(self, trainer, pl_module):
         mllogger.start(key=constants.EPOCH_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
         mllogger.start(key=constants.BLOCK_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
-
         return super().on_train_epoch_start(trainer, pl_module)
 
     @rank_zero_only
@@ -201,9 +202,14 @@ def on_train_end(self, trainer, pl_module):
         return super().on_train_end(trainer, pl_module)
 
     @rank_zero_only
-    def on_validation_start(self, trainer, pl_module):
+    def log_eval_start(self, trainer, pl_module):
         mllogger.end(key=constants.BLOCK_STOP, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
         mllogger.start(key=constants.EVAL_START, metadata={constants.SAMPLES_COUNT: self.consumed_samples(trainer)})
+
+    def on_validation_start(self, trainer, pl_module):
+        trainer.val_check_interval = self.eval_every
+        trainer.val_check_batch = self.eval_every
+        self.log_eval_start(trainer, pl_module)
         return super().on_validation_start(trainer, pl_module)
 
     def on_validation_end(self, trainer, pl_module):
diff --git a/large_language_model_pretraining/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh
index 5df2a222b..d3e96ada5 100644
--- a/large_language_model_pretraining/nemo/config.sh
+++ b/large_language_model_pretraining/nemo/config.sh
@@ -12,14 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+TODAY_DATE="$(date +'%y%m%d')"
+SUFFIX="gbs4608"
+EXP_DIR="${TODAY_DATE}/${SUFFIX}"
+
+export TAG="20250630"
+
 # SSH: username that connects to the remote cluster
-export USER=""
+export USER="michalm"
 # SSH: remote cluster URL
-export HOST=""
+export HOST="cw-dfw-cs-001-login-01.nvidia.com"
 # Slurm: account for job submission
-export ACCOUNT=""
+export ACCOUNT="coreai_mlperf_training"
 # Slurm: partition for job submission
-export PARTITION=""
+export PARTITION="batch"
 # Slurm: job time limit, defaults to 4 hours
 export TIME="04:00:00"
 # Slurm: --nodes arguments, default to use 288 nodes
@@ -27,28 +33,28 @@ export NNODES=288
 # Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
 export GPUS_PER_NODE=8
 # Slurm: max job retries for transient job failures, defaults to retry 3 times
-export MAX_RETRIES=3
+export MAX_RETRIES=0
 
 # Folder mapping:
 # Output directory that holds logs, any path that you like.
-export JOB_DIR=""
+export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
 # Image / container path, either local cache file or remote URL
-export IMAGE=""
+export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
 # Dataset: C4 dataset location that contains the dataset after preprocessing
 # This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
-export PREPROCESSED_PATH=""
+export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
 # Dataset: Numpy index working directory, contains shuffled dataset
 # This path must be able to hold >400GB data
-export TMP_NPY_INDEX=""
+export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
 # Dataset: Tokenizer path
 # This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
-export TOKENIZER_PATH=""
+export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
 
 # Model: checkpoint and tokenizer path
 # This is the checkpoint that we want to start with.
 # Each checkpoint should be a folder containing two sub-folders: context and weights.
 # And we need to pass this folder's path (the folder containing context and weights) here.
-export MODEL_CKPT=""
+export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
 # Model: Continual checkpoint directory to write and resume
 # This is the directory to hold all intermediate checkpoints.
 # Once a run is complete and we specify to save checkpoints,
@@ -57,9 +63,9 @@ export MODEL_CKPT=""
 # Inside this directory, there should be a `checkpoint` directory that holds context and weights
 # which is the "actual checkpoint".
 # Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
-export CONTINUAL_CKPT=""
+export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
 # Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
-export USE_CKPT=0
+export USE_CKPT=1
 # Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
 # If set to 1, then checkpoint resuming code will not try to load the optimizer states.
 export FROM_HF=1
@@ -71,13 +77,15 @@ export SAVE_CKPT=0
 # Model: size, to choose from 8b, 70b, 405b
 export SIZE="405b"
 # Dataloader: Global batch size
-export GBS=1152
+export GBS=4608
 # Dataloader: Micro batch size
 export MBS=1
 # Dataloader: Max run N batches, optional
 # If an empty string is provided (""), then the training will continue until time limit
 # If we want to save a checkpoint, then this value must be set
 export MAX_STEPS=""
+export START_EVAL_AT="368640"
+export EVAL_EVERY="18432"
 
 # Experiment: starting steps
 # This is the starting "offset" step from the checkpoint.
@@ -92,4 +100,5 @@ export NPAR=1
 # Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
 # The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
 # To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
-export SEEDS=""
\ No newline at end of file
+export SEEDS="14932"
+unset SEEDS
diff --git a/large_language_model_pretraining/nemo/config_8b.sh b/large_language_model_pretraining/nemo/config_8b.sh
new file mode 100644
index 000000000..43d94396e
--- /dev/null
+++ b/large_language_model_pretraining/nemo/config_8b.sh
@@ -0,0 +1,104 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+TODAY_DATE="$(date +'%y%m%d')"
+SUFFIX="gbs1152"
+EXP_DIR="${TODAY_DATE}/${SUFFIX}"
+
+export TAG="20250629"
+
+
+# SSH: username that connects to the remote cluster
+export USER="michalm"
+# SSH: remote cluster URL
+export HOST="cw-dfw-cs-001-login-01.nvidia.com"
+# Slurm: account for job submission
+export ACCOUNT="coreai_mlperf_training"
+# Slurm: partition for job submission
+export PARTITION="batch"
+# Slurm: job time limit, defaults to 4 hours
+export TIME="01:00:00"
+# Slurm: --nodes arguments, default to use 288 nodes
+export NNODES=8
+# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
+export GPUS_PER_NODE=8
+# Slurm: max job retries for transient job failures, defaults to retry 3 times
+export MAX_RETRIES=0
+
+# Folder mapping:
+# Output directory that holds logs, any path that you like.
+export JOB_DIR="/lustre/fs1/portfolios/coreai/users/michalm/raw-logs/llama31_405b_reference/${EXP_DIR}"
+# Image / container path, either local cache file or remote URL
+export IMAGE="/lustre/fs1/portfolios/coreai/users/michalm/containers/dl+mlperf+training_references+llama31_405b_${TAG}.sqsh"
+# Dataset: C4 dataset location that contains the dataset after preprocessing
+# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
+export PREPROCESSED_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/c4"
+# Dataset: Numpy index working directory, contains shuffled dataset
+# This path must be able to hold >400GB data
+export TMP_NPY_INDEX="/lustre/fs1/portfolios/coreai/users/michalm/llm-refresh-llama31/presistent_npy_index"
+# Dataset: Tokenizer path
+# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
+export TOKENIZER_PATH="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/mixtral-tokenizer"
+
+# Model: checkpoint and tokenizer path
+# This is the checkpoint that we want to start with.
+# Each checkpoint should be a folder containing two sub-folders: context and weights.
+# And we need to pass this folder's path (the folder containing context and weights) here.
+export MODEL_CKPT="/lustre/fs1/portfolios/coreai/projects/coreai_mlperf_training/data/llama31/nemo-ckpt/405b"
+# Model: Continual checkpoint directory to write and resume
+# This is the directory to hold all intermediate checkpoints.
+# Once a run is complete and we specify to save checkpoints,
+# we should see a checkpoint written in this folder
+# with name `checkpoint-par-x-y-steps`
+# Inside this directory, there should be a `checkpoint` directory that holds context and weights
+# which is the "actual checkpoint".
+# Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB.
+export CONTINUAL_CKPT="/lustre/fs1/portfolios/coreai/users/yunzhoul/llm-reference/reference_working_directory/checkpoints"
+# Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring.
+export USE_CKPT=0
+# Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
+# If set to 1, then checkpoint resuming code will not try to load the optimizer states.
+export FROM_HF=1
+# Model: Whether we want to save a checkpoint. Must be 1 if NPAR > 1. If 1, then we save a checkpoint at the end.
+export SAVE_CKPT=0
+
+
+# Training Configs:
+# Model: size, to choose from 8b, 70b, 405b
+export SIZE="8b"
+# Dataloader: Global batch size
+export GBS=1152
+# Dataloader: Micro batch size
+export MBS=1
+# Dataloader: Max run N batches, optional
+# If an empty string is provided (""), then the training will continue until time limit
+# If we want to save a checkpoint, then this value must be set
+export MAX_STEPS="400"
+export EVAL_EVERY="11520" # skip 5 first evals
+export START_EVAL_AT="23040"
+
+# Experiment: starting steps
+# This is the starting "offset" step from the checkpoint.
+# For instance, if you are resuming from a checkpoint folder `checkpoint-par-0-20-steps/checkpoint`,
+# which means that the model is trained for 20 steps to generate the checkpoint,
+# then the value 20 is needed here.
+export START_STEPS="0"
+# Experiment manager: Number of experiments to launch
+export NEXP=1
+# Experiment manager: how many consecutive jobs we want for each experiment
+export NPAR=1
+# Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236"
+# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP.
+# To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated.
+export SEEDS="14932"
diff --git a/large_language_model_pretraining/nemo/pretrain_llama31.py b/large_language_model_pretraining/nemo/pretrain_llama31.py
index 891b22176..7f3765804 100644
--- a/large_language_model_pretraining/nemo/pretrain_llama31.py
+++ b/large_language_model_pretraining/nemo/pretrain_llama31.py
@@ -25,6 +25,8 @@
 from nemo.collections.llm.gpt.data import build_pretraining_datamodule
 from callbacks import PreemptiveStop, MLPerfCallback, MetricsLogger
 
+print("USING LOCAL COPY")
+
 def slurm_executor(
     user: str,
     host: str,
@@ -93,6 +95,7 @@ def get_pretrain(
     nnodes: int,
     ngpus_per_node: int,
     data_module: run.Config,
+    start_eval_at: Optional[int]=None,
     eval_every: Optional[int]=None,
     eval_batches: Optional[int]=None,
 ) -> run.Partial:
@@ -180,7 +183,7 @@ def get_pretrain(
     pretrain.trainer.max_steps = math.ceil(max_tokens / 8192 / gbs)
 
     pretrain.data = data_module
-    pretrain.trainer.val_check_interval = eval_every
+    pretrain.trainer.val_check_interval = start_eval_at
     pretrain.trainer.limit_val_batches = eval_batches
     pretrain.trainer.limit_test_batches = eval_batches
 
@@ -300,7 +303,8 @@ def get_parser() -> argparse.ArgumentParser:
 
     data_group.add_argument("--gbs", type=int, default=1152, help="Global batch size, should be divisible by PP")
     data_group.add_argument("--mbs", type=int, default=1, help="Micro batch size")
-    data_group.add_argument("--eval_every", type=int, default=46080, help="Evaluate at least every N training sequences")
+    data_group.add_argument("--start_eval_at", type=int, default=262144, help="Start evaluating at N training sequences")
+    data_group.add_argument("--eval_every", type=int, default=16384, help="Evaluate at least every N training sequences")
     data_group.add_argument("--eval_tokens", type=int, default=5760, help="Evaluate using at least N evaluation sequences")
     data_group.add_argument('--max_steps', type=int, default=None, help="Maximum number of steps that each experiment partition will train on. None means no restriction on max steps. ")
     data_group.add_argument("--use_full_dataset", action="store_true", help="If set, then we use the full dataset, instead of the last 256/1024 shards")
@@ -352,6 +356,7 @@ def get_parser() -> argparse.ArgumentParser:
         use_full_dataset=args.use_full_dataset,
     )
 
+    start_eval_at = math.ceil(args.start_eval_at / args.gbs)
     eval_every_n_batches = math.ceil(args.eval_every / (args.gbs))
     eval_batches = math.ceil(args.eval_tokens / (args.gbs))
 
@@ -360,6 +365,7 @@ def get_parser() -> argparse.ArgumentParser:
         nnodes=args.nodes,
         ngpus_per_node=args.gpus_per_node,
         data_module=data,
+        start_eval_at=start_eval_at,
         eval_every=eval_every_n_batches,
         eval_batches=eval_batches,
     )
@@ -497,6 +503,7 @@ def get_parser() -> argparse.ArgumentParser:
             micro_batch_size=args.mbs,
             sequence_length=8192,
             init_global_step=start_step,
+            eval_every=eval_every_n_batches,
             configs=configs,
         ),
     ]
diff --git a/large_language_model_pretraining/nemo/run_llama31.sh b/large_language_model_pretraining/nemo/run_llama31.sh
index cc8d91021..f1bdfd99a 100644
--- a/large_language_model_pretraining/nemo/run_llama31.sh
+++ b/large_language_model_pretraining/nemo/run_llama31.sh
@@ -139,4 +139,6 @@ python3 pretrain_llama31.py \
 --step_time_atol $STEP_TIME_ATOL \
 --ckpt_start_step $START_STEPS \
 --max_retries $MAX_RETRIES \
+--eval_every $EVAL_EVERY \
+--start_eval_at $START_EVAL_AT \
 $CMD_SUFFIX
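
Note on the eval-scheduling change above: the patch splits the old single --eval_every knob into two sequence-count flags. pretrain_llama31.py converts --start_eval_at and --eval_every from sequences to global batches with math.ceil(n / gbs), launches the trainer with val_check_interval set to the converted start_eval_at (so the first validation is deferred), and MLPerfCallback.on_validation_start then rewrites trainer.val_check_interval and trainer.val_check_batch to the converted eval_every, so later validations run at the shorter interval. The snippet below is a minimal sketch of that arithmetic only (the helper name eval_schedule_in_batches is illustrative and not part of the patch), using the values from the two configs:

import math

def eval_schedule_in_batches(start_eval_at: int, eval_every: int, gbs: int):
    """Mirror the math.ceil sequence->batch conversions added in pretrain_llama31.py."""
    first_eval = math.ceil(start_eval_at / gbs)  # used as the initial trainer.val_check_interval
    interval = math.ceil(eval_every / gbs)       # applied by the callback from the first validation on
    return first_eval, interval

# config.sh (405b): GBS=4608, START_EVAL_AT=368640, EVAL_EVERY=18432
print(eval_schedule_in_batches(368640, 18432, 4608))  # -> (80, 4)

# config_8b.sh: GBS=1152, START_EVAL_AT=23040, EVAL_EVERY=11520
print(eval_schedule_in_batches(23040, 11520, 1152))   # -> (20, 10)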