diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
index 4f23420451..3ecccd6f54 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
@@ -30,6 +30,8 @@ def set_nemotron_3_nano_common_configs(cfg: ConfigContainer) -> None:
     cfg.mixed_precision.grad_reduce_in_fp32 = False
     cfg.ddp.grad_reduce_in_fp32 = False
 
+    cfg.model.moe_router_force_load_balancing = True
+
 
 def nemotron_3_nano_pretrain_config_gb300(
     precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index 7c85f88543..31310b3fea 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -41,6 +41,9 @@
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
     tensor_model_parallel_size=1,
+    micro_batch_size=4,
+    cuda_graph_impl="transformer_engine",
+    cuda_graph_scope=["attn", "moe_router", "moe_preprocess", "mamba"],
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1