From 0a0de668c4cc52978fc2fac960b6e06b51231139 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Tue, 3 Mar 2026 15:34:30 +0530 Subject: [PATCH 1/5] nemotron_3_nano_nvfp4_fix_260201 Signed-off-by: Malay Nagda --- .../nemotronh/nemotron_3_nano_workload_base_configs.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index 7c85f88543..433180f118 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -50,7 +50,10 @@ tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = replace( + NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1, + expert_tensor_parallel_size=None, +) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, @@ -64,7 +67,10 @@ tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = replace( + NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1, + expert_tensor_parallel_size=None, +) _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace( BASE_NEMOTRON_3_NANO_CONFIG, From 5e3ee0f946265c180c1c93925820a1d033fce208 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Tue, 3 Mar 2026 15:53:22 +0530 Subject: [PATCH 2/5] b300 etp=None Signed-off-by: Malay Nagda --- .../nemotronh/nemotron_3_nano_workload_base_configs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index 433180f118..9cab1f0e1d 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -60,17 +60,17 @@ tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = replace( + NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1, + expert_tensor_parallel_size=None, +) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = replace( - NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1, - expert_tensor_parallel_size=None, -) +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace( BASE_NEMOTRON_3_NANO_CONFIG, From ba9fbe812ff19f216e552ca60d4344a9ed057670 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Tue, 3 Mar 2026 16:04:31 +0530 Subject: [PATCH 3/5] gbs=128,ep=4,etp=n/a Signed-off-by: Malay Nagda --- .../nemotron_3_nano_workload_base_configs.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index 9cab1f0e1d..aa2fd139d5 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -43,34 +43,38 @@ tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace( + NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1, + global_batch_size=128, + expert_tensor_parallel_size=None, + expert_model_parallel_size=4, +) NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = replace( - NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1, - expert_tensor_parallel_size=None, -) +NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = replace( - NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1, - expert_tensor_parallel_size=None, -) +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = replace( + NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1, + global_batch_size=128, + expert_tensor_parallel_size=None, + expert_model_parallel_size=4, +) _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace( BASE_NEMOTRON_3_NANO_CONFIG, From 17bfadb4846a26dbee473df01bd1553b12e43178 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Tue, 3 Mar 2026 16:32:01 +0530 Subject: [PATCH 4/5] no grouped gemm Signed-off-by: Malay Nagda --- .../nemotronh/nemotron_3_nano_llm_pretrain.py | 6 ++++++ .../nemotron_3_nano_workload_base_configs.py | 14 ++------------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py index 4f23420451..134b09b9da 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py @@ -76,6 +76,9 @@ def nemotron_3_nano_pretrain_config_gb200( if base_cfg.moe_flex_dispatcher_backend is not None: cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + if precision == "nvfp4": + cfg.model.moe_grouped_gemm = False + return cfg @@ -100,6 +103,9 @@ def nemotron_3_nano_pretrain_config_b300( if base_cfg.moe_flex_dispatcher_backend is not None: cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + if precision == "nvfp4": + cfg.model.moe_grouped_gemm = False + return cfg diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index aa2fd139d5..7c85f88543 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -43,12 +43,7 @@ tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace( - NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1, - global_batch_size=128, - expert_tensor_parallel_size=None, - expert_model_parallel_size=4, -) +NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace( BASE_NEMOTRON_3_NANO_CONFIG, @@ -69,12 +64,7 @@ tensor_model_parallel_size=1, ) NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = replace( - NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1, - global_batch_size=128, - expert_tensor_parallel_size=None, - expert_model_parallel_size=4, -) +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace( BASE_NEMOTRON_3_NANO_CONFIG, From b88b04328beaffb8a89d22b704d53e682be64b89 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Wed, 4 Mar 2026 11:17:20 +0530 Subject: [PATCH 5/5] nvfp4 9004 flags Signed-off-by: Malay Nagda --- .../configs/nemotronh/nemotron_3_nano_llm_pretrain.py | 6 ------ scripts/performance/perf_plugins.py | 5 +++++ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py index 134b09b9da..4f23420451 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py @@ -76,9 +76,6 @@ def nemotron_3_nano_pretrain_config_gb200( if base_cfg.moe_flex_dispatcher_backend is not None: cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend - if precision == "nvfp4": - cfg.model.moe_grouped_gemm = False - return cfg @@ -103,9 +100,6 @@ def nemotron_3_nano_pretrain_config_b300( if base_cfg.moe_flex_dispatcher_backend is not None: cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend - if precision == "nvfp4": - cfg.model.moe_grouped_gemm = False - return cfg diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 7bbd090497..b94341161d 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -307,6 +307,11 @@ def _set_model_specific_environment_variables( if "NVTE_NORM_BWD_USE_CUDNN" in executor.env_vars: executor.env_vars.pop("NVTE_NORM_BWD_USE_CUDNN") + if model_recipe_name in ["nemotron_3_nano"] and compute_dtype == "nvfp4": + executor.env_vars["NVTE_NVFP4_DISABLE_RHT"] = "1" + executor.env_vars["NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING"] = "1" + executor.env_vars["NVTE_NVFP4_DISABLE_2D_QUANTIZATION"] = "1" + def _set_layernorm_sm_margin( self, task: Union["run.Partial", "run.Script"],