diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
index 4f23420451..3ecccd6f54 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
@@ -30,6 +30,8 @@ def set_nemotron_3_nano_common_configs(cfg: ConfigContainer) -> None:
     cfg.mixed_precision.grad_reduce_in_fp32 = False
     cfg.ddp.grad_reduce_in_fp32 = False
 
+    cfg.model.moe_router_force_load_balancing = True
+
 
 def nemotron_3_nano_pretrain_config_gb300(
     precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
index 7c85f88543..03abcf8adb 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py
@@ -31,37 +31,38 @@ BASE_NEMOTRON_3_NANO_CONFIG = WorkloadBaseConfig(
     num_gpus=8,
     global_batch_size=512,
-    micro_batch_size=2,
     tensor_model_parallel_size=1,
     expert_tensor_parallel_size=1,
     expert_model_parallel_size=8,
     moe_flex_dispatcher_backend="hybridep",
+    cuda_graph_impl="transformer_engine",
+    cuda_graph_scope=["attn", "mamba", "moe_router", "moe_preprocess"],
 )
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
+    micro_batch_size=4,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
+    micro_batch_size=2,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
+    micro_batch_size=4,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
+    micro_batch_size=2,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
@@ -71,11 +72,19 @@
     num_gpus=16,
     global_batch_size=1024,
     micro_batch_size=1,
-    recompute_modules=["moe", "layernorm"],
+    cuda_graph_impl="transformer_engine",
 )
 
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = replace(
+    _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
+    recompute_modules=["moe", "layernorm"],
+    cuda_graph_scope=["attn", "mamba"],
+)
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace(
+    _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
+    cuda_graph_scope=["mamba"],
+    recompute_modules=["moe", "layernorm", "core_attn", "moe_act"],
+)
 
 __all__ = [
     "NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1",
diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index 7bbd090497..fcf5b2dfd2 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -301,6 +301,9 @@ def _set_model_specific_environment_variables(
     if gpu in ["h100"] and model_recipe_name in ["llama3_70b"] and compute_dtype == "fp8_cs":
         executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
         executor.env_vars["NCCL_GRAPH_REGISTER"] = "0"
+    if model_recipe_name in ["nemotron_3_nano"]:
+        del_cudnn_ln = False
+
     if del_cudnn_ln:
         if "NVTE_NORM_FWD_USE_CUDNN" in executor.env_vars:
             executor.env_vars.pop("NVTE_NORM_FWD_USE_CUDNN")