From 0a0de668c4cc52978fc2fac960b6e06b51231139 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Tue, 3 Mar 2026 15:34:30 +0530
Subject: [PATCH 1/5] nemotron_3_nano_nvfp4_fix_260201: set expert_tensor_parallel_size=None for GB200/B200 NVFP4 configs

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../nemotronh/nemotron_3_nano_workload_base_configs.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index 7c85f88543..433180f118 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -50,7 +50,10 @@
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = replace(
+    NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1,
+    expert_tensor_parallel_size=None,
+)
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
@@ -64,7 +67,10 @@
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = replace(
+    NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1,
+    expert_tensor_parallel_size=None,
+)
 
 _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,

From 5e3ee0f946265c180c1c93925820a1d033fce208 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Tue, 3 Mar 2026 15:53:22 +0530
Subject: [PATCH 2/5] b300 etp=None (B300 NVFP4 config; revert B200 NVFP4 to the BF16 alias)

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../nemotronh/nemotron_3_nano_workload_base_configs.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index 433180f118..9cab1f0e1d 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -60,17 +60,17 @@
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = replace(
+    NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1,
+    expert_tensor_parallel_size=None,
+)
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = replace(
-    NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1,
-    expert_tensor_parallel_size=None,
-)
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
 
 _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,

From ba9fbe812ff19f216e552ca60d4344a9ed057670 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Tue, 3 Mar 2026 16:04:31 +0530
Subject: [PATCH 3/5] gbs=128,ep=4,etp=n/a (GB300/B200 NVFP4 configs, etp set to None; revert GB200/B300 NVFP4 to the BF16 alias)

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../nemotron_3_nano_workload_base_configs.py  | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index 9cab1f0e1d..aa2fd139d5 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -43,34 +43,38 @@
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
+    NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1,
+    global_batch_size=128,
+    expert_tensor_parallel_size=None,
+    expert_model_parallel_size=4,
+)
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = replace(
-    NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1,
-    expert_tensor_parallel_size=None,
-)
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = replace(
-    NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1,
-    expert_tensor_parallel_size=None,
-)
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = replace(
+    NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1,
+    global_batch_size=128,
+    expert_tensor_parallel_size=None,
+    expert_model_parallel_size=4,
+)
 
 _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,

From 17bfadb4846a26dbee473df01bd1553b12e43178 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Tue, 3 Mar 2026 16:32:01 +0530
Subject: [PATCH 4/5] no grouped gemm for nvfp4 (gb200/b300 pretrain configs); revert GB300/B200 NVFP4 base-config overrides

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../nemotronh/nemotron_3_nano_llm_pretrain.py      |  6 ++++++
 .../nemotron_3_nano_workload_base_configs.py       | 14 ++------------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
index 4f23420451..134b09b9da 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
@@ -76,6 +76,9 @@ def nemotron_3_nano_pretrain_config_gb200(
     if base_cfg.moe_flex_dispatcher_backend is not None:
         cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
 
+    if precision == "nvfp4":
+        cfg.model.moe_grouped_gemm = False
+
     return cfg
 
 
@@ -100,6 +103,9 @@ def nemotron_3_nano_pretrain_config_b300(
     if base_cfg.moe_flex_dispatcher_backend is not None:
         cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
 
+    if precision == "nvfp4":
+        cfg.model.moe_grouped_gemm = False
+
     return cfg
 
 
diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index aa2fd139d5..7c85f88543 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -43,12 +43,7 @@
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
-    NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1,
-    global_batch_size=128,
-    expert_tensor_parallel_size=None,
-    expert_model_parallel_size=4,
-)
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
@@ -69,12 +64,7 @@
     tensor_model_parallel_size=1,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = replace(
-    NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1,
-    global_batch_size=128,
-    expert_tensor_parallel_size=None,
-    expert_model_parallel_size=4,
-)
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
 
 _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,

From b88b04328beaffb8a89d22b704d53e682be64b89 Mon Sep 17 00:00:00 2001
From: Malay Nagda <malayn@nvidia.com>
Date: Wed, 4 Mar 2026 11:17:20 +0530
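Subject: [PATCH 5/5] nvfp4 9004 flags: set NVTE_NVFP4_DISABLE_{RHT,STOCHASTIC_ROUNDING,2D_QUANTIZATION}=1 for nemotron_3_nano; drop moe_grouped_gemm override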

Signed-off-by: Malay Nagda <malayn@nvidia.com>
---
 .../configs/nemotronh/nemotron_3_nano_llm_pretrain.py       | 6 ------
 scripts/performance/perf_plugins.py                         | 5 +++++
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
index 134b09b9da..4f23420451 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
@@ -76,9 +76,6 @@ def nemotron_3_nano_pretrain_config_gb200(
     if base_cfg.moe_flex_dispatcher_backend is not None:
         cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
 
-    if precision == "nvfp4":
-        cfg.model.moe_grouped_gemm = False
-
     return cfg
 
 
@@ -103,9 +100,6 @@ def nemotron_3_nano_pretrain_config_b300(
     if base_cfg.moe_flex_dispatcher_backend is not None:
         cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
 
-    if precision == "nvfp4":
-        cfg.model.moe_grouped_gemm = False
-
     return cfg
 
 
diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index 7bbd090497..b94341161d 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -307,6 +307,11 @@ def _set_model_specific_environment_variables(
             if "NVTE_NORM_BWD_USE_CUDNN" in executor.env_vars:
                 executor.env_vars.pop("NVTE_NORM_BWD_USE_CUDNN")
 
+        if model_recipe_name in ["nemotron_3_nano"] and compute_dtype == "nvfp4":
+            executor.env_vars["NVTE_NVFP4_DISABLE_RHT"] = "1"
+            executor.env_vars["NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING"] = "1"
+            executor.env_vars["NVTE_NVFP4_DISABLE_2D_QUANTIZATION"] = "1"
+
     def _set_layernorm_sm_margin(
         self,
         task: Union["run.Partial", "run.Script"],