NVIDIA-NeMo · malay-nagda · Mar 4, 2026 · Mar 4, 2026
diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
@@ -30,6 +30,8 @@ def set_nemotron_3_nano_common_configs(cfg: ConfigContainer) -> None:
     cfg.mixed_precision.grad_reduce_in_fp32 = False
     cfg.ddp.grad_reduce_in_fp32 = False
 
+    cfg.model.moe_router_force_load_balancing = True
+
 
 def nemotron_3_nano_pretrain_config_gb300(
     precision: str = "bf16", mock: bool = True, config_variant: str = "v1"

diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
@@ -206,6 +206,7 @@ class PerfEnvPlugin(Plugin):
     cp_size: int | None = None
     pp_size: int | None = None
     ep_size: int | None = None
+    etp_size: int | None = None
     script_args_converter_fn: Optional[Callable[[PerfEnvPluginScriptArgs], List[str]]] = None
     moe_a2a_overlap: bool = False
     model_family_name: str
@@ -325,19 +326,24 @@ def _set_nvl_domain_size(
         moe_flex_dispatcher_backend: str,
         gpu: str,
         ep_size: int,
+        etp_size: int,
     ):
         if moe_flex_dispatcher_backend == "hybridep":
+            # HybridEP's communication group spans EP * ETP ranks (the tp_ep_group in Megatron-LM).
+            hybrid_ep_group_size = ep_size * etp_size
             if gpu in ["h100", "b200", "b300"]:
                 # Hopper/B200/B300 use NVL8 topology
                 executor.env_vars["NVLINK_DOMAIN_SIZE"] = "8"
                 executor.env_vars["USE_MNNVL"] = "0"
-                executor.env_vars["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = "8" if ep_size > 8 else str(ep_size)
+                executor.env_vars["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = (
+                    "8" if hybrid_ep_group_size > 8 else str(hybrid_ep_group_size)
+                )
             else:
                 # GB200/GB300 use NVL72 topology
-                assert ep_size <= 72, "ep_size must be less than or equal to 72"
+                assert hybrid_ep_group_size <= 72, "ep_size * etp_size must be less than or equal to 72"
                 executor.env_vars["NVLINK_DOMAIN_SIZE"] = "72"
                 executor.env_vars["USE_MNNVL"] = "1"
-                executor.env_vars["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = str(ep_size)
+                executor.env_vars["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = str(hybrid_ep_group_size)
 
     def _set_nccl_pp_comm_chunksize(
         self,
@@ -418,6 +424,9 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
         pp_size = self.pp_size if self.pp_size is not None else workload_base_config.pipeline_model_parallel_size
         cp_size = self.cp_size if self.cp_size is not None else workload_base_config.context_parallel_size
         ep_size = self.ep_size if self.ep_size is not None else workload_base_config.expert_model_parallel_size
+        etp_size = (
+            self.etp_size if self.etp_size is not None else (workload_base_config.expert_tensor_parallel_size or 1)
+        )
 
         # Force program order kernel launch for TP, CP overlap
         moe_flex_dispatcher_backend = getattr(workload_base_config, "moe_flex_dispatcher_backend", None)
@@ -449,6 +458,7 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
             moe_flex_dispatcher_backend,
             self.gpu,
             ep_size,
+            etp_size,
         )
 
         # Set the chunk size of P2P communications

@@ -191,6 +191,7 @@ def main(
     pp_size: Optional[int],
     cp_size: Optional[int],
     ep_size: Optional[int],
+    etp_size: Optional[int],
     wandb_key: str,
     wandb_project_name: str,
     wandb_experiment_name: str,
@@ -337,6 +338,7 @@ def main(
                 pp_size=pp_size,
                 cp_size=cp_size,
                 ep_size=ep_size,
+                etp_size=etp_size,
                 model_family_name=model_family_name,
                 model_recipe_name=model_recipe_name,
                 gpu=gpu,
@@ -550,6 +552,7 @@ def main(
         pp_size=args.pipeline_model_parallel_size,
         cp_size=args.context_parallel_size,
         ep_size=args.expert_model_parallel_size,
+        etp_size=args.expert_tensor_parallel_size,
         wandb_key=args.wandb_key,
         wandb_project_name=args.wandb_project_name,
         wandb_experiment_name=args.wandb_experiment_name,