From 1512e71869dd3c9c30ddcbb5c6dee8e19c7ce089 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 5 Mar 2026 13:17:25 +0530 Subject: [PATCH 1/6] nt nano optimizations Signed-off-by: Malay Nagda --- .../nemotronh/nemotron_3_nano_llm_pretrain.py | 2 ++ .../nemotron_3_nano_workload_base_configs.py | 13 ++++++++----- scripts/performance/perf_plugins.py | 3 +++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py index 4f23420451..3ecccd6f54 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py @@ -30,6 +30,8 @@ def set_nemotron_3_nano_common_configs(cfg: ConfigContainer) -> None: cfg.mixed_precision.grad_reduce_in_fp32 = False cfg.ddp.grad_reduce_in_fp32 = False + cfg.model.moe_router_force_load_balancing = True + def nemotron_3_nano_pretrain_config_gb300( precision: str = "bf16", mock: bool = True, config_variant: str = "v1" diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index 7c85f88543..fb2c465814 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -31,37 +31,38 @@ BASE_NEMOTRON_3_NANO_CONFIG = WorkloadBaseConfig( num_gpus=8, global_batch_size=512, - micro_batch_size=2, tensor_model_parallel_size=1, expert_tensor_parallel_size=1, expert_model_parallel_size=8, moe_flex_dispatcher_backend="hybridep", + cuda_graph_impl="transformer_engine", + cuda_graph_scope=["attn", "mamba", "moe_router", "moe_preprocess"], ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, - tensor_model_parallel_size=1, + micro_batch_size=4, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, - tensor_model_parallel_size=1, + micro_batch_size=2, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, - tensor_model_parallel_size=1, + micro_batch_size=4, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, - tensor_model_parallel_size=1, + micro_batch_size=2, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 @@ -72,6 +73,8 @@ global_batch_size=1024, micro_batch_size=1, recompute_modules=["moe", "layernorm"], + cuda_graph_impl="transformer_engine", + cuda_graph_scope=["attn", "mamba"], ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 7bbd090497..fcf5b2dfd2 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -301,6 +301,9 @@ def _set_model_specific_environment_variables( if gpu in ["h100"] and model_recipe_name in ["llama3_70b"] and compute_dtype == "fp8_cs": executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" executor.env_vars["NCCL_GRAPH_REGISTER"] = "0" + if model_recipe_name in ["nemotron_3_nano"]: + del_cudnn_ln = False + if del_cudnn_ln: if "NVTE_NORM_FWD_USE_CUDNN" in executor.env_vars: executor.env_vars.pop("NVTE_NORM_FWD_USE_CUDNN") From 8d99ed052cd7a04fa29a428d477e90474a58fd03 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 5 Mar 2026 18:02:29 +0530 Subject: [PATCH 2/6] h100 fp8 cfg Signed-off-by: Malay Nagda --- .../nemotron_3_nano_workload_base_configs.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index fb2c465814..1b6ab3815b 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -72,13 +72,18 @@ num_gpus=16, global_batch_size=1024, micro_batch_size=1, - recompute_modules=["moe", "layernorm"], cuda_graph_impl="transformer_engine", cuda_graph_scope=["attn", "mamba"], ) -NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = replace( + _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100, + recompute_modules=["moe", "layernorm"], +) +NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace( + _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100, + recompute_modules=["moe", "layernorm", "mlp", "core_attn", "moe_act"], +) __all__ = [ "NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1", From 21d53e4ee8dddf988b270a983264647980224e09 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 5 Mar 2026 18:51:24 +0530 Subject: [PATCH 3/6] h100 fp8 recompute modules Signed-off-by: Malay Nagda --- .../configs/nemotronh/nemotron_3_nano_workload_base_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index 1b6ab3815b..0f8ebe2bb6 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -82,7 +82,7 @@ ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace( _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100, - recompute_modules=["moe", "layernorm", "mlp", "core_attn", "moe_act"], + recompute_modules=["moe", "layernorm", "core_attn", "moe_act"], ) __all__ = [ From ec932d5930eb89d8e331870c5667b755916786c3 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 5 Mar 2026 20:34:17 +0530 Subject: [PATCH 4/6] h100 fp8 CG scope reduced Signed-off-by: Malay Nagda --- .../configs/nemotronh/nemotron_3_nano_workload_base_configs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index 0f8ebe2bb6..fe1e86f3be 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -73,15 +73,16 @@ global_batch_size=1024, micro_batch_size=1, cuda_graph_impl="transformer_engine", - cuda_graph_scope=["attn", "mamba"], ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = replace( _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100, recompute_modules=["moe", "layernorm"], + cuda_graph_scope=["attn", "mamba"], ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace( _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100, + cuda_graph_scope=["attn"], recompute_modules=["moe", "layernorm", "core_attn", "moe_act"], ) From 0d3ab68c58d789b33083b7aafcb2d44a2416155e Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 5 Mar 2026 20:49:43 +0530 Subject: [PATCH 5/6] h100 fp8 CG scope reduced Signed-off-by: Malay Nagda --- .../configs/nemotronh/nemotron_3_nano_workload_base_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index fe1e86f3be..5a7a7eff0d 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -82,7 +82,7 @@ ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace( _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100, - cuda_graph_scope=["attn"], + cuda_graph_scope=["mamaba"], recompute_modules=["moe", "layernorm", "core_attn", "moe_act"], ) From c348e6a648268358c523a1d41272a6ec4ff1e039 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 5 Mar 2026 21:22:12 +0530 Subject: [PATCH 6/6] h100 fp8 CG scope reduced Signed-off-by: Malay Nagda --- .../configs/nemotronh/nemotron_3_nano_workload_base_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index 5a7a7eff0d..03abcf8adb 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -82,7 +82,7 @@ ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace( _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100, - cuda_graph_scope=["mamaba"], + cuda_graph_scope=["mamba"], recompute_modules=["moe", "layernorm", "core_attn", "moe_act"], )