Draft
Changes from all commits
Commits
105 commits
0b64ce8
skeleton of inference moe layer done
sidsingh-nvidia Jan 14, 2026
da29281
restore
sidsingh-nvidia Jan 23, 2026
75cfef8
Merge branch 'main' into siddharth/inference-optimized-moe-layer
sidsingh-nvidia Jan 23, 2026
6e01116
match argument signature with training
sidsingh-nvidia Jan 23, 2026
153265b
support gpt models like qwen
sidsingh-nvidia Jan 26, 2026
7915cff
make torch grouped gemm work
sidsingh-nvidia Jan 28, 2026
8dd410d
add config constraints for single GPU only and make dtoh and sync a nu…
sidsingh-nvidia Jan 28, 2026
b8f5fe5
remove requirement for router fusion
sidsingh-nvidia Jan 29, 2026
5063fb2
confirm that this works with nccl all to alls
sidsingh-nvidia Jan 30, 2026
297f926
disable drop and pad for inference optimized, and propagate cuda grap…
sidsingh-nvidia Feb 2, 2026
629dc1f
confirm that all-gather dispatch runs within cuda graphs
sidsingh-nvidia Feb 2, 2026
21b9140
working
sidsingh-nvidia Feb 2, 2026
a786eda
replace permute/unpermute kernels with triton
sidsingh-nvidia Feb 2, 2026
f6ee32c
minor optimizations
sidsingh-nvidia Feb 2, 2026
3f7f39d
one round of optimizations
sidsingh-nvidia Feb 2, 2026
10da287
reduce kernel calls
sidsingh-nvidia Feb 2, 2026
1983688
symmetric memory AG for hidden states
sidsingh-nvidia Feb 2, 2026
02f315a
nvls all gathers for all three tensors. nvls rs on hidden state
sidsingh-nvidia Feb 2, 2026
0fac929
full model cg optimizations and bump up max blocks for blackwell
sidsingh-nvidia Feb 2, 2026
df930a2
Merge branch 'main' into inf-opt-all-gather-dispatcher
sidsingh-nvidia Feb 2, 2026
3606123
Merge remote-tracking branch 'origin/main' into inf-opt-all-gather-di…
sidsingh-nvidia Feb 4, 2026
371043c
fix full model CG for mamba
sidsingh-nvidia Feb 4, 2026
01cd40f
remove requirement for moe permute fusion
sidsingh-nvidia Feb 4, 2026
30d8cf3
failed attempt at optimizing router and permute
sidsingh-nvidia Feb 4, 2026
6cb8a8a
tested with qwen
sidsingh-nvidia Feb 4, 2026
b85e8fe
add cutlass kernel
sidsingh-nvidia Feb 5, 2026
98a4d9f
optimize dummy forwards
sidsingh-nvidia Feb 5, 2026
acbc841
bugfix in inference router
sidsingh-nvidia Feb 5, 2026
986e2a1
latest
sidsingh-nvidia Feb 9, 2026
bb8890d
return usage characteristics from text gen server
sidsingh-nvidia Feb 13, 2026
9d062c7
add vllm cg utils
sidsingh-nvidia Feb 16, 2026
7a834ae
print engine time in ms instead of seconds
sidsingh-nvidia Feb 16, 2026
8281d86
sleep 0
sidsingh-nvidia Feb 16, 2026
d4f00ca
add fused 3 tensor all gather
sidsingh-nvidia Feb 16, 2026
6251bb9
restore some delay in zmq asyncio
sidsingh-nvidia Feb 17, 2026
72674ee
faster dummy ep cg codepath
sidsingh-nvidia Feb 20, 2026
0b55c2d
format
sidsingh-nvidia Feb 21, 2026
cf40976
Merge branch 'main' into siddharth/optimize-dummy-ep-fwd-pass
sidsingh-nvidia Feb 23, 2026
b944043
Merge branch 'main' into siddharth/optimize-dummy-ep-fwd-pass
sidsingh-nvidia Feb 23, 2026
126d6c1
refactor + make safer
sidsingh-nvidia Feb 23, 2026
7a7d78d
relegate to strict matching for qwen
sidsingh-nvidia Feb 24, 2026
bf6f678
add unit test for ep syncs, and fix a bug in non strict matching
sidsingh-nvidia Feb 24, 2026
ceb83c2
Merge branch 'main' into siddharth/optimize-dummy-ep-fwd-pass
sidsingh-nvidia Feb 24, 2026
3f2de16
linting
sidsingh-nvidia Feb 24, 2026
05e872b
attempt to delete unnecessary modifications
sidsingh-nvidia Feb 24, 2026
71dffc9
Merge branch 'siddharth/optimize-dummy-ep-fwd-pass' into inf-opt-all-…
sidsingh-nvidia Feb 24, 2026
4d654c2
minor
sidsingh-nvidia Feb 24, 2026
afd2ad8
remove code from the dummy ep PR
sidsingh-nvidia Feb 25, 2026
bf8f546
restore utils.py
sidsingh-nvidia Feb 25, 2026
c4091bd
remove torch grouped gemm kernels: we will add them in another PR
sidsingh-nvidia Feb 25, 2026
7c2b2ff
remove mamba metadata changes
sidsingh-nvidia Feb 25, 2026
2631c1a
simplify hybrid spec call
sidsingh-nvidia Feb 25, 2026
27c0f7c
restore dynamic context
sidsingh-nvidia Feb 25, 2026
b055cb6
slight clean up of router
sidsingh-nvidia Feb 25, 2026
3f24597
router cleanup
sidsingh-nvidia Feb 25, 2026
1bbaf82
more router cleanup
sidsingh-nvidia Feb 25, 2026
5607f6f
absorb inference layer into parent moe layer and more cleanup
sidsingh-nvidia Feb 25, 2026
834656b
more cleanup
sidsingh-nvidia Feb 25, 2026
14d4540
Merge branch 'main' into inf-opt-all-gather-dispatcher
sidsingh-nvidia Feb 25, 2026
8163d18
fallback to NCCL as the triton collectives do not work for non 128-bi…
sidsingh-nvidia Feb 25, 2026
b89600b
remove changes related to symm mem comms
sidsingh-nvidia Feb 25, 2026
ba396b1
refactor
sidsingh-nvidia Feb 25, 2026
462ed8a
more cleanup and add warnings if flashinfer-jit and cubin are not ins…
sidsingh-nvidia Feb 25, 2026
e3311a0
more refactor
sidsingh-nvidia Feb 25, 2026
afb807b
make qwen3 work without CGs
sidsingh-nvidia Feb 26, 2026
51c383d
remove comment
sidsingh-nvidia Feb 26, 2026
c761a0d
refactor
sidsingh-nvidia Feb 26, 2026
d9f1712
Revert "fallback to NCCL as the triton collectives do not work for no…
sidsingh-nvidia Feb 26, 2026
3deb50c
Revert "remove changes related to symm mem comms"
sidsingh-nvidia Feb 26, 2026
7aff116
bring back NVLS collectives
sidsingh-nvidia Feb 26, 2026
2f8cf3e
add kill-switch for nvls
sidsingh-nvidia Feb 26, 2026
64bc241
cleanup nvls
sidsingh-nvidia Feb 26, 2026
81054b9
minor changes
sidsingh-nvidia Feb 26, 2026
6f5bf12
only set is inference cg iteration from the engine
sidsingh-nvidia Feb 26, 2026
e533f43
more cleanup
sidsingh-nvidia Feb 26, 2026
7f4bc32
resolve flashinfer activations, small bugfix, and no use flashinfer f…
sidsingh-nvidia Feb 27, 2026
76d7c83
kill switch for torch grouped gemm
sidsingh-nvidia Feb 27, 2026
5e77a87
change name of dispatcher
sidsingh-nvidia Feb 27, 2026
d35b58f
remove bias act function duplication
sidsingh-nvidia Feb 27, 2026
d22a5e1
change the name of cuda graph mixed prefill count to cuda graph mixed…
sidsingh-nvidia Feb 27, 2026
1670f7e
cleanup asserts, disable fused tp kernel for moe
sidsingh-nvidia Feb 27, 2026
cc3e18f
format
sidsingh-nvidia Feb 27, 2026
902dc69
fix linting issues
sidsingh-nvidia Feb 27, 2026
2547534
linting
sidsingh-nvidia Feb 27, 2026
165d6d4
refactor
sidsingh-nvidia Feb 27, 2026
5f517a7
Merge branch 'main' into inf-opt-all-gather-dispatcher
sidsingh-nvidia Feb 27, 2026
14c8a39
fix unit test failures
sidsingh-nvidia Feb 27, 2026
2fc86b2
unit test for inference top-k router
sidsingh-nvidia Feb 28, 2026
04672e4
minor changes
sidsingh-nvidia Feb 28, 2026
0f9f7a8
add warmup to router unit test
sidsingh-nvidia Feb 28, 2026
84a1134
format
sidsingh-nvidia Feb 28, 2026
db0f784
Merge branch 'main' into inf-opt-all-gather-dispatcher
sidsingh-nvidia Feb 28, 2026
f132d59
add error message to assert
sidsingh-nvidia Mar 2, 2026
264cbb2
address feedback
sidsingh-nvidia Mar 2, 2026
df9de35
use decorator for torch compile
sidsingh-nvidia Mar 2, 2026
ca75b2b
bugfix
sidsingh-nvidia Mar 2, 2026
a61aea5
lint
sidsingh-nvidia Mar 2, 2026
4fd23ce
format and guard properly
sidsingh-nvidia Mar 2, 2026
fa25b1b
Merge branch 'main' into inf-opt-all-gather-dispatcher
sidsingh-nvidia Mar 2, 2026
5ae4424
Merge branch 'main' into inf-opt-all-gather-dispatcher
sidsingh-nvidia Mar 2, 2026
01ad0b2
Merge branch 'main' into inf-opt-all-gather-dispatcher
sidsingh-nvidia Mar 2, 2026
b1530a6
fix comments
sidsingh-nvidia Mar 2, 2026
fa61633
working histogram kernel
sidsingh-nvidia Mar 2, 2026
86b438f
add exhaustive unit tests
sidsingh-nvidia Mar 2, 2026
b7821ac
work with qwen3 on hopper
sidsingh-nvidia Mar 3, 2026
1 change: 0 additions & 1 deletion gpt_builders.py
@@ -53,7 +53,6 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_
)
)
elif args.num_experts:
assert not (config.transformer_impl == "inference_optimized")
# Define the decoder block spec
transformer_layer_spec = get_gpt_decoder_block_spec(
config,
26 changes: 13 additions & 13 deletions megatron/core/inference/batch_dimensions_utils.py
@@ -133,7 +133,7 @@ def adjust_batch_dims_for_expert_parallelism(
strict: bool,
decode_only_cuda_graphs: bool,
explicit_chunked_prefill: bool,
cuda_graph_mixed_prefill_count: int,
smallest_non_decode_cuda_graph_size: int,
ep_group: Optional[torch.distributed.ProcessGroup] = None,
) -> Optional["InferenceBatchDimensions"]:
"""Adjusted cuda graph batch dimensions for expert parallelism.
@@ -176,9 +176,9 @@ def adjust_batch_dims_for_expert_parallelism(
is_any_ep_rank_in_non_decode = sync_tensor[1].item() == 1

# We force eager mode for scenarios where some ranks will run with CUDA graphs
# while others will not. Without this check, the all-to-all communication in the
# while others will not. Without this check, communication in the
# expert routing layer would pad up to the maximum capacity only for the ranks that
# are using CUDA graphs in this step, leading to a NCCL hang.
# are using CUDA graphs in this step, leading to a hang.
# This can happen in the following cases:
# 1. If we only allow decode CUDA graphs but some ranks are running non-decode batches
# 2. Some ranks are running explicit chunked prefill requests
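The eager-mode fallback described in the comment above can be sketched as follows. `should_force_eager` and its argument are hypothetical names for illustration, not the actual Megatron-Core API:

```python
def should_force_eager(rank_uses_cuda_graph):
    """Return True when CUDA graph usage is mixed across the EP group.

    If some ranks would replay a CUDA graph (padding up to graph capacity)
    while others run eagerly, the collective in the expert routing layer
    would be called with mismatched shapes and hang, so every rank falls
    back to eager execution instead.
    """
    return len(set(rank_uses_cuda_graph)) > 1
```

A decode-only rank next to a prefill rank is exactly the mixed case the comment lists, and it trips the check: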
@@ -203,7 +203,7 @@ def adjust_batch_dims_for_expert_parallelism(
# graph while prefill ranks match a coarser mixed graph, which would
# produce inconsistent token counts across EP ranks.
if is_any_ep_rank_in_non_decode and not strict:
adjusted_token_count = max(adjusted_token_count, cuda_graph_mixed_prefill_count)
adjusted_token_count = max(adjusted_token_count, smallest_non_decode_cuda_graph_size)

adjusted_batch_dim = InferenceBatchDimensions(
token_count=adjusted_token_count,
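The adjustment in this hunk can be sketched single-process as below; the function name and list-based "EP group" are illustrative stand-ins for the real all-reduce over `ep_group`:

```python
def adjust_token_count_for_ep(rank_token_counts, rank_is_non_decode,
                              smallest_non_decode_cuda_graph_size, strict):
    """Pick one padded token count that every EP rank agrees on.

    Step 1 mirrors the sync across the EP group: take the max token count
    seen on any rank. Step 2 mirrors the non-strict branch above: if any
    rank runs a non-decode batch, pad up to the smallest non-decode graph
    so decode and prefill ranks match graphs with identical token counts.
    """
    adjusted = max(rank_token_counts)
    if any(rank_is_non_decode) and not strict:
        adjusted = max(adjusted, smallest_non_decode_cuda_graph_size)
    return adjusted
```

Under strict matching the pad-up step is skipped, which is why strict mode can leave decode and prefill ranks on differently sized graphs.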
@@ -303,7 +303,7 @@ def generate_cuda_graph_batch_dimensions_list(
tp_size: int,
num_cuda_graphs: Optional[int],
cuda_graph_max_tokens: int,
cuda_graph_mixed_prefill_count: Optional[int],
cuda_graph_mixed_prefill_request_count: Optional[int],
max_requests: int,
max_tokens: int,
max_sequence_length: int,
@@ -339,7 +339,7 @@ def generate_cuda_graph_batch_dimensions_list(
tp_size: Tensor parallel size
num_cuda_graphs: Number of CUDA graphs to generate
cuda_graph_max_tokens: Maximum tokens for CUDA graphs
cuda_graph_mixed_prefill_count: Number of mixed prefill requests for CUDA graphs
cuda_graph_mixed_prefill_request_count: Number of mixed prefill requests for CUDA graphs
max_requests: Maximum number of requests
max_tokens: Maximum total tokens
max_sequence_length: Maximum sequence length
@@ -409,8 +409,8 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
if num_cuda_graphs is None:
cuda_graph_batch_dimensions_list = []
elif (
not cuda_graph_mixed_prefill_count
or cuda_graph_mixed_prefill_count <= 0
not cuda_graph_mixed_prefill_request_count
or cuda_graph_mixed_prefill_request_count <= 0
or not use_cuda_graphs_for_non_decode_steps
): # decode only
# Use decode-specific token counts for decode-only graphs
@@ -426,14 +426,14 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
for size in cuda_graph_prefill_token_counts:
add_if_valid(
token_count=size,
prefill_req_count=min(cuda_graph_mixed_prefill_count, max_requests),
prefill_req_count=min(cuda_graph_mixed_prefill_request_count, max_requests),
decode_req_count=min(size, max_requests)
- min(cuda_graph_mixed_prefill_count, max_requests),
- min(cuda_graph_mixed_prefill_request_count, max_requests),
)
# We need to ensure the prefill requests are shorter than the max sequence length,
# considering the one decode token is used for prefill request construction
prefill_only_minimal_num = max(
cuda_graph_mixed_prefill_count,
cuda_graph_mixed_prefill_request_count,
math.ceil(size / max(1, max_sequence_length - 1)),
)
if prefill_only_minimal_num < max_requests:
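The mixed-graph enumeration in this hunk can be sketched as follows. The function name is hypothetical; only the split between prefill and decode request slots is taken from the code above:

```python
def enumerate_mixed_graph_dims(prefill_token_counts,
                               mixed_prefill_request_count, max_requests):
    """Enumerate (token_count, prefill_reqs, decode_reqs) triples.

    Each mixed graph reserves a fixed number of prefill request slots;
    the remaining request slots (one token each) become decode requests,
    mirroring the add_if_valid call in the hunk above.
    """
    dims = []
    for tokens in prefill_token_counts:
        prefill_reqs = min(mixed_prefill_request_count, max_requests)
        decode_reqs = min(tokens, max_requests) - prefill_reqs
        if decode_reqs >= 0:
            dims.append((tokens, prefill_reqs, decode_reqs))
    return dims
```

A token budget too small to cover the reserved prefill slots yields no valid mixed graph, which matches the guard in `add_if_valid`.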
@@ -474,7 +474,7 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
def match_graph_config(
real_batch_dim: InferenceBatchDimensions,
cuda_graph_batch_dimensions_list: List[InferenceBatchDimensions],
cuda_graph_mixed_prefill_count: int,
smallest_non_decode_cuda_graph_size: int,
strict: bool = False,
decode_only_cuda_graphs: bool = False,
explicit_chunked_prefill: bool = False,
@@ -509,7 +509,7 @@ def match_graph_config(
decode_only_cuda_graphs=decode_only_cuda_graphs,
explicit_chunked_prefill=explicit_chunked_prefill,
ep_group=ep_group,
cuda_graph_mixed_prefill_count=cuda_graph_mixed_prefill_count,
smallest_non_decode_cuda_graph_size=smallest_non_decode_cuda_graph_size,
)

if adjusted_batch_dim is None:
@@ -1,4 +1,5 @@
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

from .collectives import multimem_all_gather, multimem_reduce_scatter
from .collectives import multimem_all_gather, multimem_all_gather_fused, multimem_reduce_scatter
from .fused_collectives import fused_multimem_rs_add_norm_ag
from .utils import are_tensors_nvls_eligible, is_device_nvls_capable
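The idea behind the new `multimem_all_gather_fused` export (per the "add fused 3 tensor all gather" commit) can be sketched abstractly: pack several tensors into one buffer so a single collective replaces one call per tensor. This toy version uses flat Python lists and a caller-supplied `all_gather` callable; the real kernel operates on NVLS multicast symmetric memory:

```python
def fused_all_gather(local_tensors, all_gather):
    """Pack local tensors into one buffer, gather once, then unpack.

    `all_gather` stands in for the real collective: it takes this rank's
    packed buffer and returns the concatenation of every rank's buffer.
    """
    sizes = [len(t) for t in local_tensors]
    packed = [x for t in local_tensors for x in t]
    gathered = all_gather(packed)          # one collective instead of three
    stride = sum(sizes)
    # Unpack: each rank's contiguous segment splits back into the tensors.
    out = [[] for _ in local_tensors]
    for base in range(0, len(gathered), stride):
        segment = gathered[base:base + stride]
        offset = 0
        for i, size in enumerate(sizes):
            out[i].extend(segment[offset:offset + size])
            offset += size
    return out
```

Fusing amortizes per-collective launch latency, which matters for the small decode-step tensors this PR targets.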