From 1512e71869dd3c9c30ddcbb5c6dee8e19c7ce089 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Thu, 5 Mar 2026 13:17:25 +0530
Subject: [PATCH 1/6] nemotron 3 nano perf optimizations

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../nemotronh/nemotron_3_nano_llm_pretrain.py       |  2 ++
 .../nemotron_3_nano_workload_base_configs.py        | 13 ++++++++-----
 scripts/performance/perf_plugins.py                 |  3 +++
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
index 4f23420451..3ecccd6f54 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
@@ -30,6 +30,8 @@ def set_nemotron_3_nano_common_configs(cfg: ConfigContainer) -> None:
     cfg.mixed_precision.grad_reduce_in_fp32 = False
     cfg.ddp.grad_reduce_in_fp32 = False
 
+    cfg.model.moe_router_force_load_balancing = True
+
 
 def nemotron_3_nano_pretrain_config_gb300(
     precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index 7c85f88543..fb2c465814 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -31,37 +31,38 @@
 BASE_NEMOTRON_3_NANO_CONFIG = WorkloadBaseConfig(
     num_gpus=8,
     global_batch_size=512,
-    micro_batch_size=2,
     tensor_model_parallel_size=1,
     expert_tensor_parallel_size=1,
     expert_model_parallel_size=8,
     moe_flex_dispatcher_backend="hybridep",
+    cuda_graph_impl="transformer_engine",
+    cuda_graph_scope=["attn", "mamba", "moe_router", "moe_preprocess"],
 )
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
+    micro_batch_size=4,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
+    micro_batch_size=2,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
+    micro_batch_size=4,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
+    micro_batch_size=2,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
@@ -72,6 +73,8 @@
     global_batch_size=1024,
     micro_batch_size=1,
     recompute_modules=["moe", "layernorm"],
+    cuda_graph_impl="transformer_engine",
+    cuda_graph_scope=["attn", "mamba"],
 )
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100
diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index 7bbd090497..fcf5b2dfd2 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -301,6 +301,9 @@ def _set_model_specific_environment_variables(
             if gpu in ["h100"] and model_recipe_name in ["llama3_70b"] and compute_dtype == "fp8_cs":
                 executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
                 executor.env_vars["NCCL_GRAPH_REGISTER"] = "0"
+        if model_recipe_name in ["nemotron_3_nano"]:
+            del_cudnn_ln = False
+
         if del_cudnn_ln:
             if "NVTE_NORM_FWD_USE_CUDNN" in executor.env_vars:
                 executor.env_vars.pop("NVTE_NORM_FWD_USE_CUDNN")

From 8d99ed052cd7a04fa29a428d477e90474a58fd03 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Thu, 5 Mar 2026 18:02:29 +0530
Subject: [PATCH 2/6] h100: split bf16 and fp8 configs with per-precision recompute modules

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../nemotron_3_nano_workload_base_configs.py          | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index fb2c465814..1b6ab3815b 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -72,13 +72,18 @@
     num_gpus=16,
     global_batch_size=1024,
     micro_batch_size=1,
-    recompute_modules=["moe", "layernorm"],
     cuda_graph_impl="transformer_engine",
     cuda_graph_scope=["attn", "mamba"],
 )
 
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = replace(
+    _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
+    recompute_modules=["moe", "layernorm"],
+)
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace(
+    _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
+    recompute_modules=["moe", "layernorm", "mlp", "core_attn", "moe_act"],
+)
 
 __all__ = [
     "NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1",

From 21d53e4ee8dddf988b270a983264647980224e09 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Thu, 5 Mar 2026 18:51:24 +0530
Subject: [PATCH 3/6] h100 fp8: drop mlp from recompute modules

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../configs/nemotronh/nemotron_3_nano_workload_base_configs.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index 1b6ab3815b..0f8ebe2bb6 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -82,7 +82,7 @@
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace(
     _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
-    recompute_modules=["moe", "layernorm", "mlp", "core_attn", "moe_act"],
+    recompute_modules=["moe", "layernorm", "core_attn", "moe_act"],
 )
 
 __all__ = [

From ec932d5930eb89d8e331870c5667b755916786c3 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Thu, 5 Mar 2026 20:34:17 +0530
Subject: [PATCH 4/6] h100: set CUDA graph scope per precision, reduce fp8 scope to attn

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../configs/nemotronh/nemotron_3_nano_workload_base_configs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index 0f8ebe2bb6..fe1e86f3be 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -73,15 +73,16 @@
     global_batch_size=1024,
     micro_batch_size=1,
     cuda_graph_impl="transformer_engine",
-    cuda_graph_scope=["attn", "mamba"],
 )
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = replace(
     _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
     recompute_modules=["moe", "layernorm"],
+    cuda_graph_scope=["attn", "mamba"],
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace(
     _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
+    cuda_graph_scope=["attn"],
     recompute_modules=["moe", "layernorm", "core_attn", "moe_act"],
 )
 

From 0d3ab68c58d789b33083b7aafcb2d44a2416155e Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Thu, 5 Mar 2026 20:49:43 +0530
Subject: [PATCH 5/6] h100 fp8: switch CUDA graph scope from attn to mamba

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../configs/nemotronh/nemotron_3_nano_workload_base_configs.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index fe1e86f3be..5a7a7eff0d 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -82,7 +82,7 @@
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace(
     _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
-    cuda_graph_scope=["attn"],
+    cuda_graph_scope=["mamaba"],
     recompute_modules=["moe", "layernorm", "core_attn", "moe_act"],
 )
 

From c348e6a648268358c523a1d41272a6ec4ff1e039 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Thu, 5 Mar 2026 21:22:12 +0530
Subject: [PATCH 6/6] h100 fp8: fix CUDA graph scope typo (mamaba -> mamba)

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../configs/nemotronh/nemotron_3_nano_workload_base_configs.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index 5a7a7eff0d..03abcf8adb 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -82,7 +82,7 @@
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace(
     _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
-    cuda_graph_scope=["mamaba"],
+    cuda_graph_scope=["mamba"],
     recompute_modules=["moe", "layernorm", "core_attn", "moe_act"],
 )