Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def set_nemotron_3_nano_common_configs(cfg: ConfigContainer) -> None:
cfg.mixed_precision.grad_reduce_in_fp32 = False
cfg.ddp.grad_reduce_in_fp32 = False

cfg.model.moe_router_force_load_balancing = True


def nemotron_3_nano_pretrain_config_gb300(
precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
Expand Down
16 changes: 13 additions & 3 deletions scripts/performance/perf_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ class PerfEnvPlugin(Plugin):
cp_size: int | None = None
pp_size: int | None = None
ep_size: int | None = None
etp_size: int | None = None
script_args_converter_fn: Optional[Callable[[PerfEnvPluginScriptArgs], List[str]]] = None
moe_a2a_overlap: bool = False
model_family_name: str
Expand Down Expand Up @@ -325,19 +326,24 @@ def _set_nvl_domain_size(
moe_flex_dispatcher_backend: str,
gpu: str,
ep_size: int,
etp_size: int,
):
if moe_flex_dispatcher_backend == "hybridep":
# HybridEP's communication group spans EP * ETP ranks (the tp_ep_group in Megatron-LM).
hybrid_ep_group_size = ep_size * etp_size
if gpu in ["h100", "b200", "b300"]:
# Hopper/B200/B300 use NVL8 topology
executor.env_vars["NVLINK_DOMAIN_SIZE"] = "8"
executor.env_vars["USE_MNNVL"] = "0"
executor.env_vars["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = "8" if ep_size > 8 else str(ep_size)
executor.env_vars["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = (
"8" if hybrid_ep_group_size > 8 else str(hybrid_ep_group_size)
)
else:
# GB200/GB300 use NVL72 topology
assert ep_size <= 72, "ep_size must be less than or equal to 72"
assert hybrid_ep_group_size <= 72, "ep_size * etp_size must be less than or equal to 72"
executor.env_vars["NVLINK_DOMAIN_SIZE"] = "72"
executor.env_vars["USE_MNNVL"] = "1"
executor.env_vars["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = str(ep_size)
executor.env_vars["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = str(hybrid_ep_group_size)

def _set_nccl_pp_comm_chunksize(
self,
Expand Down Expand Up @@ -418,6 +424,9 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
pp_size = self.pp_size if self.pp_size is not None else workload_base_config.pipeline_model_parallel_size
cp_size = self.cp_size if self.cp_size is not None else workload_base_config.context_parallel_size
ep_size = self.ep_size if self.ep_size is not None else workload_base_config.expert_model_parallel_size
etp_size = (
self.etp_size if self.etp_size is not None else (workload_base_config.expert_tensor_parallel_size or 1)
)

# Force program order kernel launch for TP, CP overlap
moe_flex_dispatcher_backend = getattr(workload_base_config, "moe_flex_dispatcher_backend", None)
Expand Down Expand Up @@ -449,6 +458,7 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
moe_flex_dispatcher_backend,
self.gpu,
ep_size,
etp_size,
)

# Set the chunk size of P2P communications
Expand Down
3 changes: 3 additions & 0 deletions scripts/performance/setup_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ def main(
pp_size: Optional[int],
cp_size: Optional[int],
ep_size: Optional[int],
etp_size: Optional[int],
wandb_key: str,
wandb_project_name: str,
wandb_experiment_name: str,
Expand Down Expand Up @@ -337,6 +338,7 @@ def main(
pp_size=pp_size,
cp_size=cp_size,
ep_size=ep_size,
etp_size=etp_size,
model_family_name=model_family_name,
model_recipe_name=model_recipe_name,
gpu=gpu,
Expand Down Expand Up @@ -550,6 +552,7 @@ def main(
pp_size=args.pipeline_model_parallel_size,
cp_size=args.context_parallel_size,
ep_size=args.expert_model_parallel_size,
etp_size=args.expert_tensor_parallel_size,
wandb_key=args.wandb_key,
wandb_project_name=args.wandb_project_name,
wandb_experiment_name=args.wandb_experiment_name,
Expand Down